2 Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
\r
3 Copyright (C) 2006 - 2009 Sony Computer Entertainment Inc.
\r
5 This software is provided 'as-is', without any express or implied warranty.
\r
6 In no event will the authors be held liable for any damages arising from the use of this software.
\r
7 Permission is granted to anyone to use this software for any purpose,
\r
8 including commercial applications, and to alter it and redistribute it freely,
\r
9 subject to the following restrictions:
\r
11 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
\r
12 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
\r
13 3. This notice may not be removed or altered from any source distribution.
\r
18 //CL_PLATFORM_MINI_CL could be defined in build system
\r
20 #include <GL/glew.h>
\r
23 #include <MiniCL/cl_platform.h> //for CL_PLATFORM_MINI_CL definition
\r
25 #include <CL/cl_platform.h> //for CL_PLATFORM_MINI_CL definition
\r
30 #include "btOpenCLUtils.h"
\r
32 #include "btBulletDynamicsCommon.h"
\r
33 #include "BulletCollision/CollisionDispatch/btCollisionDispatcher.h"
\r
34 #include "BulletCollision/BroadphaseCollision/btSimpleBroadphase.h"
\r
35 #include "BulletCollision/CollisionShapes/btCollisionShape.h"
\r
36 #include "BulletDynamics/Dynamics/btRigidBody.h"
\r
37 #include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h"
\r
38 #include "BulletDynamics/ConstraintSolver/btContactSolverInfo.h"
\r
39 #include "LinearMath/btQuickprof.h"
\r
40 #include "GlutStuff.h"
\r
41 #include "BulletDynamics/ConstraintSolver/btTypedConstraint.h"
\r
42 #include "BulletDynamics/ConstraintSolver/btPoint2PointConstraint.h"
\r
44 #include "btParticlesDynamicsWorld.h"
\r
45 #include "GL_DialogWindow.h"
\r
47 //when loading from disk, you need to remove the 'MSTRINGIFY' line at the start, and ); at the end of the .cl file
\r
49 #define LOAD_FROM_MEMORY
\r
50 #ifdef LOAD_FROM_MEMORY
\r
51 #define MSTRINGIFY(A) #A
\r
52 static const char* source=
\r
53 #include "ParticlesOCL.cl"
\r
54 #endif //LOAD_FROM_MEMORY
\r
56 btParticlesDynamicsWorld::~btParticlesDynamicsWorld()
\r
60 static int gStepNum = 0;
\r
62 int btParticlesDynamicsWorld::stepSimulation( btScalar timeStep, int maxSubSteps, btScalar fixedTimeStep)
\r
64 startProfiling(timeStep);
\r
65 m_timeStep = timeStep;
\r
66 BT_PROFILE("stepSimulation");
\r
67 // printf("Step : %d\n", gStepNum);
\r
69 BT_PROFILE("IntegrateMotion");
\r
70 runIntegrateMotionKernel();
\r
73 runComputeCellIdKernel();
\r
76 BT_PROFILE("SortHash");
\r
77 runSortHashKernel();
\r
80 BT_PROFILE("FindCellStart");
\r
81 runFindCellStartKernel();
\r
84 BT_PROFILE("CollideParticles");
\r
85 // printf("\ncollide particles\n\n");
\r
86 runCollideParticlesKernel();
\r
90 #ifndef BT_NO_PROFILE
\r
91 CProfileManager::Increment_Frame_Counter();
\r
92 #endif //BT_NO_PROFILE
\r
96 static unsigned int getMaxPowOf2(unsigned int num)
\r
98 unsigned int maxPowOf2 = 1;
\r
99 for(int bit = 1; bit < 32; bit++)
\r
101 if(maxPowOf2 >= num)
\r
111 void btParticlesDynamicsWorld::initDeviceData()
\r
118 void btParticlesDynamicsWorld::postInitDeviceData()
\r
120 m_hashSize = getMaxPowOf2(m_numParticles);
\r
124 grabSimulationData();
\r
128 void btParticlesDynamicsWorld::getShapeData()
\r
130 int numObjects = getNumCollisionObjects();
\r
131 btCollisionObjectArray& collisionObjects = getCollisionObjectArray();
\r
132 for(int i = 0; i < numObjects; i++)
\r
134 btCollisionObject* colObj = collisionObjects[i];
\r
135 btCollisionShape* pShape = colObj->getCollisionShape();
\r
136 int shapeType = pShape->getShapeType();
\r
137 if(shapeType == SPHERE_SHAPE_PROXYTYPE)
\r
139 btSphereShape* pSph = (btSphereShape*)pShape;
\r
140 btScalar sphRad = pSph->getRadius();
\r
143 m_particleRad = sphRad;
\r
147 btAssert(m_particleRad == sphRad);
\r
155 printf("Total number of particles : %d\n", m_numParticles);
\r
158 void btParticlesDynamicsWorld::allocateBuffers()
\r
161 // positions of spheres
\r
162 m_hPos.resize(m_numParticles);
\r
163 m_hVel.resize(m_numParticles);
\r
164 m_hSortedPos.resize(m_numParticles);
\r
165 m_hSortedVel.resize(m_numParticles);
\r
166 m_hPosHash.resize(m_hashSize);
\r
167 for(int i = 0; i < m_hashSize; i++) { m_hPosHash[i].x = 0x7FFFFFFF; m_hPosHash[i].y = 0; }
\r
168 unsigned int memSize = sizeof(btVector3) * m_numParticles;
\r
169 m_dPos = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
\r
170 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
171 m_dVel = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
\r
172 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
173 m_dSortedPos = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
\r
174 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
175 m_dSortedVel = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
\r
176 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
177 memSize = m_hashSize * sizeof(btInt2);
\r
178 m_dPosHash = clCreateBuffer(m_cxMainContext, CL_MEM_READ_ONLY, memSize, NULL, &ciErrNum);
\r
179 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
181 // global simulation parameters
\r
182 memSize = sizeof(btSimParams);
\r
183 m_dSimParams = clCreateBuffer(m_cxMainContext, CL_MEM_READ_ONLY, memSize, NULL, &ciErrNum);
\r
184 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
187 void btParticlesDynamicsWorld::adjustGrid()
\r
189 //btVector3 wmin( BT_LARGE_FLOAT, BT_LARGE_FLOAT, BT_LARGE_FLOAT);
\r
190 //btVector3 wmax(-BT_LARGE_FLOAT, -BT_LARGE_FLOAT, -BT_LARGE_FLOAT);
\r
192 btVector3 wmin( BT_LARGE_FLOAT, BT_LARGE_FLOAT, BT_LARGE_FLOAT);
\r
193 btVector3 wmax(-BT_LARGE_FLOAT, -BT_LARGE_FLOAT, -BT_LARGE_FLOAT);
\r
194 btVector3 boxDiag(m_particleRad, m_particleRad, m_particleRad);
\r
195 for(int i = 0; i < m_numParticles; i++)
\r
197 btVector3 pos = m_hPos[i];
\r
198 btVector3 boxMin = pos - boxDiag;
\r
199 btVector3 boxMax = pos + boxDiag;
\r
200 wmin.setMin(boxMin);
\r
201 wmax.setMax(boxMax);
\r
205 btVector3 wsize = m_worldMax - m_worldMin;
\r
208 glBindBufferARB(GL_ARRAY_BUFFER, m_colVbo);
\r
209 btVector3* color = (btVector3*)glMapBufferARB(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
\r
210 for(int i = 0; i < m_numParticles; i++, color++)
\r
212 *color = (m_hPos[i] - m_worldMin) / wsize;
\r
215 glUnmapBufferARB(GL_ARRAY_BUFFER);
\r
221 m_worldMin -= wsize;
\r
222 m_worldMax += wsize;
\r
224 m_worldMin.setValue(-WORLD_SIZE, -WORLD_SIZE, -WORLD_SIZE);
\r
225 m_worldMax.setValue( WORLD_SIZE, WORLD_SIZE, WORLD_SIZE);
\r
226 wsize = m_worldMax - m_worldMin;
\r
228 m_cellSize[0] = m_cellSize[1] = m_cellSize[2] = m_particleRad * btScalar(2.f);
\r
230 m_simParams.m_worldMin[0] = m_worldMin[0];
\r
231 m_simParams.m_worldMin[1] = m_worldMin[1];
\r
232 m_simParams.m_worldMin[2] = m_worldMin[2];
\r
234 m_simParams.m_worldMax[0] = m_worldMax[0];
\r
235 m_simParams.m_worldMax[1] = m_worldMax[1];
\r
236 m_simParams.m_worldMax[2] = m_worldMax[2];
\r
238 m_simParams.m_cellSize[0] = m_cellSize[0];
\r
239 m_simParams.m_cellSize[1] = m_cellSize[1];
\r
240 m_simParams.m_cellSize[2] = m_cellSize[2];
\r
242 m_simParams.m_gridSize[0] = (int)(wsize[0] / m_cellSize[0] + 0.999999f);
\r
243 m_simParams.m_gridSize[1] = (int)(wsize[1] / m_cellSize[1] + 0.999999f);
\r
244 m_simParams.m_gridSize[2] = (int)(wsize[2] / m_cellSize[2] + 0.999999f);
\r
246 m_numGridCells = m_simParams.m_gridSize[0] * m_simParams.m_gridSize[1] * m_simParams.m_gridSize[2];
\r
247 m_hCellStart.resize(m_numGridCells);
\r
248 unsigned int memSize = sizeof(int) * m_numGridCells;
\r
250 m_dCellStart = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
\r
251 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
256 void btParticlesDynamicsWorld::grabSimulationData()
\r
258 // const btVector3& gravity = getGravity();
\r
259 //btVector3 gravity(0., -0.06, 0.);
\r
260 //btVector3 gravity(0., -0.0003f, 0.);
\r
261 btVector3 gravity(0,-0.0003,0);
\r
265 m_simParams.m_gravity[0] = gravity[0];
\r
266 m_simParams.m_gravity[1] = gravity[1];
\r
267 m_simParams.m_gravity[2] = gravity[2];
\r
268 m_simParams.m_particleRad = m_particleRad;
\r
269 m_simParams.m_globalDamping = 1.0f;
\r
270 m_simParams.m_boundaryDamping = -0.5f;
\r
272 // m_simParams.m_collisionDamping = 0.02f;
\r
273 // m_simParams.m_spring = 0.5f;
\r
274 // m_simParams.m_shear = 0.1f;
\r
275 // m_simParams.m_attraction = 0.0f;
\r
276 m_simParams.m_collisionDamping = 0.025f;//0.02f;
\r
277 m_simParams.m_spring = 0.5f;
\r
278 m_simParams.m_shear = 0.1f;
\r
279 m_simParams.m_attraction = 0.001f;
\r
283 // copy data to GPU
\r
285 unsigned int memSize = sizeof(btVector3) * m_numParticles;
\r
286 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, memSize, &(m_hPos[0]), 0, NULL, NULL);
\r
287 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
288 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dVel, CL_TRUE, 0, memSize, &(m_hVel[0]), 0, NULL, NULL);
\r
289 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
290 memSize = sizeof(btSimParams);
\r
291 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dSimParams, CL_TRUE, 0, memSize, &m_simParams, 0, NULL, NULL);
\r
292 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
293 memSize = m_hashSize * sizeof(btInt2);
\r
294 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);
\r
295 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
299 void btParticlesDynamicsWorld::createVBO()
\r
301 // create buffer object
\r
302 glGenBuffers(1, &m_vbo);
\r
303 glBindBuffer(GL_ARRAY_BUFFER, m_vbo);
\r
304 // positions of spheres
\r
305 unsigned int memSize = sizeof(btVector3) * m_numParticles;
\r
306 glBufferData(GL_ARRAY_BUFFER, memSize, 0, GL_DYNAMIC_DRAW);
\r
309 glGenBuffers(1, &vbo);
\r
310 glBindBuffer(GL_ARRAY_BUFFER, vbo);
\r
311 glBufferData(GL_ARRAY_BUFFER, memSize, 0, GL_DYNAMIC_DRAW);
\r
312 glBindBuffer(GL_ARRAY_BUFFER, 0);
\r
314 // fill color buffer
\r
315 glBindBufferARB(GL_ARRAY_BUFFER, m_colVbo);
\r
316 float *data = (float*)glMapBufferARB(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
\r
318 for(int i = 0; i < m_numParticles; i++)
\r
320 float t = i / (float)m_numParticles;
\r
327 glUnmapBufferARB(GL_ARRAY_BUFFER);
\r
328 glBindBufferARB(GL_ARRAY_BUFFER, 0);
\r
333 void btParticlesDynamicsWorld::initCLKernels(int argc, char** argv)
\r
337 if (!m_cxMainContext)
\r
340 cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
\r
341 m_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0, 0);
\r
343 int numDev = btOpenCLUtils::getNumDevices(m_cxMainContext);
\r
347 exit(0);//this is just a demo, exit now
\r
350 m_cdDevice = btOpenCLUtils::getDevice(m_cxMainContext,0);
\r
351 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
353 btOpenCLDeviceInfo clInfo;
\r
354 btOpenCLUtils::getDeviceInfo(m_cdDevice,clInfo);
\r
355 btOpenCLUtils::printDeviceInfo(m_cdDevice);
\r
357 // create a command-queue
\r
358 m_cqCommandQue = clCreateCommandQueue(m_cxMainContext, m_cdDevice, 0, &ciErrNum);
\r
359 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
362 size_t program_length;
\r
365 #ifdef LOAD_FROM_MEMORY
\r
366 program_length = strlen(source);
\r
367 printf("OpenCL compiles ParticlesOCL.cl ... ");
\r
370 const char* fileName = "ParticlesOCL.cl";
\r
371 FILE * fp = fopen(fileName, "rb");
\r
372 char newFileName[512];
\r
376 sprintf(newFileName,"..//%s",fileName);
\r
377 fp = fopen(newFileName, "rb");
\r
379 fileName = newFileName;
\r
384 sprintf(newFileName,"Demos//ParticlesOpenCL//%s",fileName);
\r
385 fp = fopen(newFileName, "rb");
\r
387 fileName = newFileName;
\r
392 sprintf(newFileName,"..//..//..//..//..//Demos//ParticlesOpenCL//%s",fileName);
\r
393 fp = fopen(newFileName, "rb");
\r
395 fileName = newFileName;
\r
398 printf("cannot find %s\n",newFileName);
\r
403 // char *source = oclLoadProgSource(".//Demos//SpheresGrid//SpheresGrid.cl", "", &program_length);
\r
404 //char *source = btOclLoadProgSource(".//Demos//SpheresOpenCL//Shared//SpheresGrid.cl", "", &program_length);
\r
406 char *source = btOclLoadProgSource(fileName, "", &program_length);
\r
409 printf("ERROR : OpenCL can't load file %s\n", fileName);
\r
411 // oclCHECKERROR (source == NULL, oclFALSE);
\r
412 btAssert(source != NULL);
\r
414 // create the program
\r
415 printf("OpenCL compiles %s ...", fileName);
\r
417 #endif //LOAD_FROM_MEMORY
\r
420 //printf("%s\n", source);
\r
422 m_cpProgram = clCreateProgramWithSource(m_cxMainContext, 1, (const char**)&source, &program_length, &ciErrNum);
\r
423 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
424 #ifndef LOAD_FROM_MEMORY
\r
426 #endif //LOAD_FROM_MEMORY
\r
428 //#define LOCAL_SIZE_LIMIT 1024U
\r
429 #define LOCAL_SIZE_MAX 1024U
\r
431 // Build the program with 'mad' Optimization option
\r
433 const char* flags = "-I. -DLOCAL_SIZE_MAX=1024U -cl-mad-enable -DMAC -DGUID_ARG";
\r
435 const char* flags = "-I. -DLOCAL_SIZE_MAX=1024U -DGUID_ARG= ";
\r
437 // build the program
\r
438 ciErrNum = clBuildProgram(m_cpProgram, 0, NULL, flags, NULL, NULL);
\r
439 if(ciErrNum != CL_SUCCESS)
\r
441 // write out standard error
\r
442 // oclLog(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR);
\r
443 // write out the build log and ptx, then exit
\r
444 char cBuildLog[10240];
\r
446 // size_t szPtxLength;
\r
447 clGetProgramBuildInfo(m_cpProgram, m_cdDevice, CL_PROGRAM_BUILD_LOG,
\r
448 sizeof(cBuildLog), cBuildLog, NULL );
\r
449 // oclGetProgBinary(m_cpProgram, oclGetFirstDev(m_cxMainContext), &cPtx, &szPtxLength);
\r
450 // oclLog(LOGBOTH | CLOSELOG, 0.0, "\n\nLog:\n%s\n\n\n\n\nPtx:\n%s\n\n\n", cBuildLog, cPtx);
\r
451 printf("\n\n%s\n\n\n", cBuildLog);
\r
452 printf("Press ENTER key to terminate the program\n");
\r
458 // create the kernels
\r
460 postInitDeviceData();
\r
462 initKernel(PARTICLES_KERNEL_COMPUTE_CELL_ID, "kComputeCellId");
\r
463 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 1, sizeof(cl_mem), (void*) &m_dPos);
\r
464 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 2, sizeof(cl_mem), (void*) &m_dPosHash);
\r
465 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 3, sizeof(cl_mem), (void*) &m_dSimParams);
\r
466 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
468 initKernel(PARTICLES_KERNEL_INTEGRATE_MOTION, "kIntegrateMotion");
\r
469 ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 1, sizeof(cl_mem), (void *) &m_dPos);
\r
470 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 2, sizeof(cl_mem), (void *) &m_dVel);
\r
471 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 3, sizeof(cl_mem), (void *) &m_dSimParams);
\r
472 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
475 initKernel(PARTICLES_KERNEL_CLEAR_CELL_START, "kClearCellStart");
\r
476 ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 0, sizeof(int), (void *) &m_numGridCells);
\r
477 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 1, sizeof(cl_mem), (void*) &m_dCellStart);
\r
479 initKernel(PARTICLES_KERNEL_FIND_CELL_START, "kFindCellStart");
\r
480 // ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 0, sizeof(int), (void*) &m_numParticles);
\r
481 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 1, sizeof(cl_mem), (void*) &m_dPosHash);
\r
482 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 2, sizeof(cl_mem), (void*) &m_dCellStart);
\r
483 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 3, sizeof(cl_mem), (void*) &m_dPos);
\r
484 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 4, sizeof(cl_mem), (void*) &m_dVel);
\r
485 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 5, sizeof(cl_mem), (void*) &m_dSortedPos);
\r
486 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 6, sizeof(cl_mem), (void*) &m_dSortedVel);
\r
487 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
489 initKernel(PARTICLES_KERNEL_COLLIDE_PARTICLES, "kCollideParticles");
\r
490 ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 1, sizeof(cl_mem), (void*) &m_dVel);
\r
491 ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 2, sizeof(cl_mem), (void*) &m_dSortedPos);
\r
492 ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 3, sizeof(cl_mem), (void*) &m_dSortedVel);
\r
493 ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 4, sizeof(cl_mem), (void*) &m_dPosHash);
\r
494 ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 5, sizeof(cl_mem), (void*) &m_dCellStart);
\r
495 ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 6, sizeof(cl_mem), (void*) &m_dSimParams);
\r
497 initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL, "kBitonicSortCellIdLocal");
\r
498 initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL_1, "kBitonicSortCellIdLocal1");
\r
499 initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL, "kBitonicSortCellIdMergeGlobal");
\r
500 initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL, "kBitonicSortCellIdMergeLocal");
\r
503 static btInt4 cpu_getGridPos(btVector3& worldPos, btSimParams* pParams)
\r
506 gridPos.x = (int)floor((worldPos[0] - pParams->m_worldMin[0]) / pParams->m_cellSize[0]);
\r
507 gridPos.y = (int)floor((worldPos[1] - pParams->m_worldMin[1]) / pParams->m_cellSize[1]);
\r
508 gridPos.z = (int)floor((worldPos[2] - pParams->m_worldMin[2]) / pParams->m_cellSize[2]);
\r
512 static unsigned int cpu_getPosHash(btInt4& gridPos, btSimParams* pParams)
\r
514 btInt4 gridDim = *((btInt4*)(pParams->m_gridSize));
\r
515 if(gridPos.x < 0) gridPos.x = 0;
\r
516 if(gridPos.x >= gridDim.x) gridPos.x = gridDim.x - 1;
\r
517 if(gridPos.y < 0) gridPos.y = 0;
\r
518 if(gridPos.y >= gridDim.y) gridPos.y = gridDim.y - 1;
\r
519 if(gridPos.z < 0) gridPos.z = 0;
\r
520 if(gridPos.z >= gridDim.z) gridPos.z = gridDim.z - 1;
\r
521 unsigned int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;
\r
528 void btParticlesDynamicsWorld::runComputeCellIdKernel()
\r
532 if(m_useCpuControls[SIMSTAGE_COMPUTE_CELL_ID]->m_active)
\r
534 unsigned int memSize = sizeof(btVector3) * m_numParticles;
\r
535 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, memSize, &(m_hPos[0]), 0, NULL, NULL);
\r
536 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
537 for(int index = 0; index < m_numParticles; index++)
\r
539 btVector3 pos = m_hPos[index];
\r
540 btInt4 gridPos = cpu_getGridPos(pos, &m_simParams);
\r
541 unsigned int hash = cpu_getPosHash(gridPos, &m_simParams);
\r
542 m_hPosHash[index].x = hash;
\r
543 m_hPosHash[index].y = index;
\r
545 memSize = sizeof(btInt2) * m_numParticles;
\r
546 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);
\r
547 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
552 BT_PROFILE("ComputeCellId");
\r
553 runKernelWithWorkgroupSize(PARTICLES_KERNEL_COMPUTE_CELL_ID, m_numParticles);
\r
554 ciErrNum = clFinish(m_cqCommandQue);
\r
555 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
559 int memSize = sizeof(btInt2) * m_hashSize;
\r
560 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);
\r
561 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
563 memSize = sizeof(float) * 4 * m_numParticles;
\r
564 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, memSize, &(m_hPos[0]), 0, NULL, NULL);
\r
565 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
569 BT_PROFILE("Copy VBO");
\r
570 // Explicit Copy (until OpenGL interop will work)
\r
571 // map the PBO to copy data from the CL buffer via host
\r
572 glBindBufferARB(GL_ARRAY_BUFFER, m_vbo);
\r
573 // map the buffer object into client's memory
\r
574 void* ptr = glMapBufferARB(GL_ARRAY_BUFFER, GL_WRITE_ONLY_ARB);
\r
575 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, sizeof(float) * 4 * m_numParticles, ptr, 0, NULL, NULL);
\r
576 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
577 glUnmapBufferARB(GL_ARRAY_BUFFER);
\r
578 glBindBufferARB(GL_ARRAY_BUFFER,0);
\r
584 static btVector3 cpu_collideTwoParticles(
\r
597 //Calculate relative position
\r
598 btVector3 relPos = posB - posA; relPos[3] = 0.f;
\r
599 float dist = sqrt(relPos[0] * relPos[0] + relPos[1] * relPos[1] + relPos[2] * relPos[2]);
\r
600 float collideDist = radiusA + radiusB;
\r
602 btVector3 force = btVector3(0, 0, 0);
\r
603 if(dist < collideDist)
\r
605 btVector3 norm = relPos / dist;
\r
607 //Relative velocity
\r
608 btVector3 relVel = velB - velA; relVel[3] = 0.f;;
\r
610 //Relative tangential velocity
\r
611 float relVelDotNorm = relVel.dot(norm);
\r
612 btVector3 tanVel = relVel - relVelDotNorm * norm;
\r
613 //Spring force (potential)
\r
614 //float springFactor = -spring * (collideDist - dist);
\r
615 float springFactor = -0.4 * (collideDist - dist);
\r
616 force = springFactor * norm + damping * relVel;// + shear * tanVel + attraction * relPos;
\r
630 void btParticlesDynamicsWorld::runCollideParticlesKernel()
\r
632 btAlignedObjectArray<int> pairs;
\r
634 float particleRad = m_simParams.m_particleRad;
\r
635 float collideDist2 = (particleRad + particleRad)*(particleRad + particleRad);
\r
637 if(m_useCpuControls[SIMSTAGE_COLLIDE_PARTICLES]->m_active)
\r
639 int memSize = sizeof(btVector3) * m_numParticles;
\r
641 BT_PROFILE("Copy from GPU");
\r
642 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dSortedPos, CL_TRUE, 0, memSize, &(m_hSortedPos[0]), 0, NULL, NULL);
\r
643 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
644 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dSortedVel, CL_TRUE, 0, memSize, &(m_hSortedVel[0]), 0, NULL, NULL);
\r
645 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
646 memSize = sizeof(btInt2) * m_numParticles;
\r
647 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);
\r
648 memSize = m_numGridCells * sizeof(int);
\r
649 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dCellStart, CL_TRUE, 0, memSize, &(m_hCellStart[0]), 0, NULL, NULL);
\r
650 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
653 for(int index = 0; index < m_numParticles; index++)
\r
655 btVector3 posA = m_hSortedPos[index];
\r
656 btVector3 velA = m_hSortedVel[index];
\r
657 btVector3 force = btVector3(0, 0, 0);
\r
658 float particleRad = m_simParams.m_particleRad;
\r
659 float collisionDamping = m_simParams.m_collisionDamping;
\r
660 float spring = m_simParams.m_spring;
\r
661 float shear = m_simParams.m_shear;
\r
662 float attraction = m_simParams.m_attraction;
\r
663 int unsortedIndex = m_hPosHash[index].y;
\r
664 //Get address in grid
\r
665 btInt4 gridPosA = cpu_getGridPos(posA, &m_simParams);
\r
666 //Accumulate surrounding cells
\r
668 for(int z = -1; z <= 1; z++)
\r
670 gridPosB.z = gridPosA.z + z;
\r
671 for(int y = -1; y <= 1; y++)
\r
673 gridPosB.y = gridPosA.y + y;
\r
674 for(int x = -1; x <= 1; x++)
\r
676 gridPosB.x = gridPosA.x + x;
\r
677 //Get start particle index for this cell
\r
678 unsigned int hashB = cpu_getPosHash(gridPosB, &m_simParams);
\r
679 int startI = m_hCellStart[hashB];
\r
685 //Iterate over particles in this cell
\r
686 int endI = startI + 32;
\r
687 if(endI > m_numParticles)
\r
688 endI = m_numParticles;
\r
690 for(int j = startI; j < endI; j++)
\r
692 unsigned int hashC = m_hPosHash[j].x;
\r
703 pair.v0[0] = index;
\r
705 pairs.push_back(pair.value);
\r
707 // printf("index=%d, j=%d\n",index,j);
\r
708 // printf("(index=%d, j=%d) ",index,j);
\r
709 btVector3 posB = m_hSortedPos[j];
\r
710 btVector3 velB = m_hSortedVel[j];
\r
711 //Collide two spheres
\r
712 force += cpu_collideTwoParticles( posA, posB, velA, velB, particleRad, particleRad,
\r
713 spring, collisionDamping, shear, attraction);
\r
718 //Write new velocity back to original unsorted location
\r
719 m_hVel[unsortedIndex] = velA + force;
\r
722 //#define BRUTE_FORCE_CHECK 1
\r
723 #ifdef BRUTE_FORCE_CHECK
\r
724 for(int index = 0; index < m_numParticles; index++)
\r
726 btVector3 posA = m_hSortedPos[index];
\r
727 btVector3 velA = m_hSortedVel[index];
\r
728 btVector3 force = btVector3(0, 0, 0);
\r
729 int unsortedIndex = m_hPosHash[index].y;
\r
731 float collisionDamping = m_simParams.m_collisionDamping;
\r
732 float spring = m_simParams.m_spring;
\r
733 float shear = m_simParams.m_shear;
\r
734 float attraction = m_simParams.m_attraction;
\r
735 for(int j = 0 ; j < m_numParticles; j++)
\r
739 btVector3 posB = m_hSortedPos[j];
\r
740 btVector3 velB = m_hSortedVel[j];
\r
743 btVector3 relPos = posB - posA; relPos[3] = 0.f;
\r
744 float dist2 = (relPos[0] * relPos[0] + relPos[1] * relPos[1] + relPos[2] * relPos[2]);
\r
748 if(dist2 < collideDist2)
\r
750 //Collide two spheres
\r
751 // force += cpu_collideTwoParticles( posA, posB, velA, velB, particleRad, particleRad,
\r
752 // spring, collisionDamping, shear, attraction);
\r
755 pair.v0[0] = index;
\r
757 if (pairs.findLinearSearch(pair.value)==pairs.size())
\r
759 printf("not found index=%d, j=%d\n",index,j);
\r
766 //Write new velocity back to original unsorted location
\r
767 //m_hVel[unsortedIndex] = velA + force;
\r
769 #endif //BRUTE_FORCE_CHECK
\r
771 memSize = sizeof(btVector3) * m_numParticles;
\r
772 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dVel, CL_TRUE, 0, memSize, &(m_hVel[0]), 0, NULL, NULL);
\r
773 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
777 runKernelWithWorkgroupSize(PARTICLES_KERNEL_COLLIDE_PARTICLES, m_numParticles);
\r
778 cl_int ciErrNum = clFinish(m_cqCommandQue);
\r
779 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
784 void btParticlesDynamicsWorld::runIntegrateMotionKernel()
\r
787 if(m_useCpuControls[SIMSTAGE_INTEGRATE_MOTION]->m_active)
\r
792 unsigned int memSize = sizeof(btVector3) * m_numParticles;
\r
793 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, memSize, &(m_hPos[0]), 0, NULL, NULL);
\r
794 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
795 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dVel, CL_TRUE, 0, memSize, &(m_hVel[0]), 0, NULL, NULL);
\r
796 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
797 for(int index = 0; index < m_numParticles; index++)
\r
799 btVector3 pos = m_hPos[index];
\r
800 btVector3 vel = m_hVel[index];
\r
805 gravity[0] = m_simParams.m_gravity[0];
\r
806 gravity[1] = m_simParams.m_gravity[1];
\r
807 gravity[2] = m_simParams.m_gravity[2];
\r
809 float particleRad = m_simParams.m_particleRad;
\r
810 float globalDamping = m_simParams.m_globalDamping;
\r
811 float boundaryDamping = m_simParams.m_boundaryDamping;
\r
812 vel += gravity * m_timeStep;
\r
813 vel *= globalDamping;
\r
814 // integrate position
\r
815 pos += vel * m_timeStep;
\r
816 // collide with world boundaries
\r
817 btVector3 worldMin;
\r
818 worldMin[0] = m_simParams.m_worldMin[0];
\r
819 worldMin[1] = m_simParams.m_worldMin[1];
\r
820 worldMin[2] = m_simParams.m_worldMin[2];
\r
822 btVector3 worldMax;
\r
823 worldMax[0] = m_simParams.m_worldMax[0];
\r
824 worldMax[1] = m_simParams.m_worldMax[1];
\r
825 worldMax[2] = m_simParams.m_worldMax[2];
\r
827 for(int j = 0; j < 3; j++)
\r
829 if(pos[j] < (worldMin[j] + particleRad))
\r
831 pos[j] = worldMin[j] + particleRad;
\r
832 vel[j] *= boundaryDamping;
\r
834 if(pos[j] > (worldMax[j] - particleRad))
\r
836 pos[j] = worldMax[j] - particleRad;
\r
837 vel[j] *= boundaryDamping;
\r
840 // write back position and velocity
\r
841 m_hPos[index] = pos;
\r
842 m_hVel[index] = vel;
\r
845 // write back to GPU
\r
846 memSize = sizeof(btVector3) * m_numParticles;
\r
847 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, memSize, &(m_hPos[0]), 0, NULL, NULL);
\r
848 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
849 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dVel, CL_TRUE, 0, memSize, &(m_hVel[0]), 0, NULL, NULL);
\r
850 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
854 // Set work size and execute the kernel
\r
855 ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 4, sizeof(float), &m_timeStep);
\r
856 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
857 runKernelWithWorkgroupSize(PARTICLES_KERNEL_INTEGRATE_MOTION, m_numParticles);
\r
858 ciErrNum = clFinish(m_cqCommandQue);
\r
859 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
863 void btParticlesDynamicsWorld::runSortHashKernel()
\r
866 int memSize = m_numParticles * sizeof(btInt2);
\r
867 if(m_useCpuControls[SIMSTAGE_SORT_CELL_ID]->m_active)
\r
870 // get hash from GPU
\r
871 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);
\r
872 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
878 unsigned int index;
\r
879 void quickSort(btHashPosKey* pData, int lo, int hi)
\r
882 btHashPosKey x = pData[(lo+hi)/2];
\r
885 while(pData[i].hash < x.hash) i++;
\r
886 while(x.hash < pData[j].hash) j--;
\r
889 btHashPosKey t = pData[i];
\r
890 pData[i] = pData[j];
\r
895 if(lo < j) pData->quickSort(pData, lo, j);
\r
896 if(i < hi) pData->quickSort(pData, i, hi);
\r
898 void bitonicSort(btHashPosKey* pData, int lo, int n, bool dir)
\r
903 bitonicSort(pData, lo, m, !dir);
\r
904 bitonicSort(pData, lo + m, n - m, dir);
\r
905 bitonicMerge(pData, lo, n, dir);
\r
908 void bitonicMerge(btHashPosKey* pData, int lo, int n, bool dir)
\r
912 int m = greatestPowerOfTwoLessThan(n);
\r
913 for(int i = lo; i < (lo + n - m); i++)
\r
915 compare(pData, i, i + m, dir);
\r
917 bitonicMerge(pData, lo, m, dir);
\r
918 bitonicMerge(pData, lo + m, n - m, dir);
\r
921 void compare(btHashPosKey* pData, int i, int j, bool dir)
\r
923 if(dir == (pData[i].hash > pData[j].hash))
\r
925 btHashPosKey t = pData[i];
\r
926 pData[i] = pData[j];
\r
930 int greatestPowerOfTwoLessThan(int n)
\r
940 btHashPosKey* pHash = (btHashPosKey*)(&m_hPosHash[0]);
\r
941 pHash->quickSort(pHash, 0, m_numParticles-1 );
\r
942 // pHash->bitonicSort(pHash, 0, m_hashSize, true);
\r
943 // write back to GPU
\r
944 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);
\r
945 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
949 // bitonic sort on GPU (shared memory)
\r
951 bitonicSortNv(m_dPosHash, 1, m_hashSize, dir);
\r
952 ciErrNum = clFinish(m_cqCommandQue);
\r
953 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
957 memSize = m_numParticles * sizeof(btInt2);
\r
958 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);
\r
959 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
960 for(int i = 1; i < m_hashSize; i++)
\r
962 if(m_hPosHash[i-1].x > m_hPosHash[i].x)
\r
964 printf("Hash sort error at %d\n", i);
\r
971 void btParticlesDynamicsWorld::runFindCellStartKernel()
\r
974 if(m_useCpuControls[SIMSTAGE_FIND_CELL_START]->m_active)
\r
977 // get hash from GPU
\r
978 int memSize = m_numParticles * sizeof(btInt2);
\r
979 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);
\r
980 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
981 memSize = sizeof(btVector3) * m_numParticles;
\r
982 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, memSize, &(m_hPos[0]), 0, NULL, NULL);
\r
983 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
984 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dVel, CL_TRUE, 0, memSize, &(m_hVel[0]), 0, NULL, NULL);
\r
985 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
987 for(int i = 0; i < m_numGridCells; i++)
\r
989 m_hCellStart[i] = -1;
\r
991 // find start of each cell in sorted hash
\r
992 btInt2 hash = m_hPosHash[0];
\r
993 m_hCellStart[hash.x] = 0;
\r
994 int unsortedIndex = hash.y;
\r
995 btVector3 pos = m_hPos[unsortedIndex];
\r
996 btVector3 vel = m_hVel[unsortedIndex];
\r
997 m_hSortedPos[0] = pos;
\r
998 m_hSortedVel[0] = vel;
\r
999 for(int i = 1; i < m_numParticles; i++)
\r
1001 if(m_hPosHash[i-1].x != m_hPosHash[i].x)
\r
1003 m_hCellStart[m_hPosHash[i].x] = i;
\r
1005 unsortedIndex = m_hPosHash[i].y;
\r
1006 pos = m_hPos[unsortedIndex];
\r
1007 vel = m_hVel[unsortedIndex];
\r
1008 m_hSortedPos[i] = pos;
\r
1009 m_hSortedVel[i] = vel;
\r
1011 // write back to GPU
\r
1012 memSize = m_numGridCells * sizeof(int);
\r
1013 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dCellStart, CL_TRUE, 0, memSize, &(m_hCellStart[0]), 0, NULL, NULL);
\r
1014 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
1015 memSize = sizeof(btVector3) * m_numParticles;
\r
1016 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dSortedPos, CL_TRUE, 0, memSize, &(m_hSortedPos[0]), 0, NULL, NULL);
\r
1017 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
1018 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dSortedVel, CL_TRUE, 0, memSize, &(m_hSortedVel[0]), 0, NULL, NULL);
\r
1019 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
1023 runKernelWithWorkgroupSize(PARTICLES_KERNEL_CLEAR_CELL_START, m_numGridCells);
\r
1024 runKernelWithWorkgroupSize(PARTICLES_KERNEL_FIND_CELL_START, m_numParticles);
\r
1025 ciErrNum = clFinish(m_cqCommandQue);
\r
1026 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
1031 void btParticlesDynamicsWorld::initKernel(int kernelId, const char* pName)
\r
1035 cl_kernel kernel = clCreateKernel(m_cpProgram, pName, &ciErrNum);
\r
1036 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
1038 ciErrNum = clGetKernelWorkGroupInfo(kernel, m_cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
\r
1039 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
1044 // if (wgSize > 256)
\r
1050 // if (wgSize > 1024)
\r
1053 m_kernels[kernelId].m_Id = kernelId;
\r
1054 m_kernels[kernelId].m_kernel = kernel;
\r
1055 m_kernels[kernelId].m_name = pName;
\r
1056 m_kernels[kernelId].m_workgroupSize = wgSize;
\r
1061 void btParticlesDynamicsWorld::runKernelWithWorkgroupSize(int kernelId, int globalSize)
\r
1063 if(globalSize <= 0)
\r
1067 cl_kernel kernelFunc = m_kernels[kernelId].m_kernel;
\r
1068 cl_int ciErrNum = clSetKernelArg(kernelFunc, 0, sizeof(int), (void*)&globalSize);
\r
1069 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
1070 int workgroupSize = m_kernels[kernelId].m_workgroupSize;
\r
1071 if(workgroupSize <= 0)
\r
1072 { // let OpenCL library calculate workgroup size
\r
1073 size_t globalWorkSize[2];
\r
1074 globalWorkSize[0] = globalSize;
\r
1075 globalWorkSize[1] = 1;
\r
1076 ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, kernelFunc, 1, NULL, globalWorkSize, NULL, 0,0,0 );
\r
1080 size_t localWorkSize[2], globalWorkSize[2];
\r
1081 workgroupSize = btMin(workgroupSize, globalSize);
\r
1082 int num_t = globalSize / workgroupSize;
\r
1083 int num_g = num_t * workgroupSize;
\r
1084 if(num_g < globalSize)
\r
1088 localWorkSize[0] = workgroupSize;
\r
1089 globalWorkSize[0] = num_t * workgroupSize;
\r
1090 localWorkSize[1] = 1;
\r
1091 globalWorkSize[1] = 1;
\r
1092 ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, kernelFunc, 1, NULL, globalWorkSize, localWorkSize, 0,0,0 );
\r
1094 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
1098 //Note: logically shared with BitonicSort OpenCL code!
\r
1099 // TODO : get parameter from OpenCL and pass it to kernel (needed for platforms other than NVIDIA)
\r
1100 //static const unsigned int LOCAL_SIZE_LIMIT = 1024U;
\r
1102 void btParticlesDynamicsWorld::bitonicSortNv(cl_mem pKey, unsigned int batch, unsigned int arrayLength, unsigned int dir)
\r
1104 unsigned int localSizeLimit = m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL].m_workgroupSize * 2;
\r
1105 if(arrayLength < 2)
\r
1107 //Only power-of-two array lengths are supported so far
\r
1110 size_t localWorkSize, globalWorkSize;
\r
1111 if(arrayLength <= localSizeLimit)
\r
1113 btAssert( (batch * arrayLength) % localSizeLimit == 0);
\r
1114 //Launch bitonicSortLocal
\r
1115 ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL].m_kernel, 0, sizeof(cl_mem), (void *)&pKey);
\r
1116 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL].m_kernel, 1, sizeof(cl_uint), (void *)&arrayLength);
\r
1117 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL].m_kernel, 2, sizeof(cl_uint), (void *)&dir);
\r
1118 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
1120 localWorkSize = localSizeLimit / 2;
\r
1121 globalWorkSize = batch * arrayLength / 2;
\r
1122 ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL].m_kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
\r
1123 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
1127 //Launch bitonicSortLocal1
\r
1128 ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL_1].m_kernel, 0, sizeof(cl_mem), (void *)&pKey);
\r
1129 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
1131 localWorkSize = localSizeLimit / 2;
\r
1132 globalWorkSize = batch * arrayLength / 2;
\r
1133 ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL_1].m_kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
\r
1134 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
1136 for(unsigned int size = 2 * localSizeLimit; size <= arrayLength; size <<= 1)
\r
1138 for(unsigned stride = size / 2; stride > 0; stride >>= 1)
\r
1140 if(stride >= localSizeLimit)
\r
1142 //Launch bitonicMergeGlobal
\r
1143 ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL].m_kernel, 0, sizeof(cl_mem), (void *)&pKey);
\r
1144 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL].m_kernel, 1, sizeof(cl_uint), (void *)&arrayLength);
\r
1145 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL].m_kernel, 2, sizeof(cl_uint), (void *)&size);
\r
1146 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL].m_kernel, 3, sizeof(cl_uint), (void *)&stride);
\r
1147 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL].m_kernel, 4, sizeof(cl_uint), (void *)&dir);
\r
1148 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
1150 localWorkSize = localSizeLimit / 4;
\r
1151 globalWorkSize = batch * arrayLength / 2;
\r
1153 ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL].m_kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
\r
1154 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
1158 //Launch bitonicMergeLocal
\r
1159 ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL].m_kernel, 0, sizeof(cl_mem), (void *)&pKey);
\r
1160 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL].m_kernel, 1, sizeof(cl_uint), (void *)&arrayLength);
\r
1161 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL].m_kernel, 2, sizeof(cl_uint), (void *)&stride);
\r
1162 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL].m_kernel, 3, sizeof(cl_uint), (void *)&size);
\r
1163 ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL].m_kernel, 4, sizeof(cl_uint), (void *)&dir);
\r
1164 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r
1166 localWorkSize = localSizeLimit / 2;
\r
1167 globalWorkSize = batch * arrayLength / 2;
\r
1169 ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL].m_kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
\r
1170 oclCHECKERROR(ciErrNum, CL_SUCCESS);
\r