Initialize libbullet git in 2.0_beta.
[platform/upstream/libbullet.git] / Demos / ParticlesOpenCL / btParticlesDemoDynamicsWorld.cpp
1 /*\r
2 Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org\r
3 Copyright (C) 2006 - 2009 Sony Computer Entertainment Inc. \r
4 \r
5 This software is provided 'as-is', without any express or implied warranty.\r
6 In no event will the authors be held liable for any damages arising from the use of this software.\r
7 Permission is granted to anyone to use this software for any purpose, \r
8 including commercial applications, and to alter it and redistribute it freely, \r
9 subject to the following restrictions:\r
10 \r
11 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\r
12 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\r
13 3. This notice may not be removed or altered from any source distribution.\r
14 */\r
15 \r
16 #include <stdio.h>\r
17 #ifdef __APPLE__\r
18 //CL_PLATFORM_MINI_CL could be defined in build system\r
19 #else\r
20 #include <GL/glew.h>\r
21 #ifdef USE_MINICL\r
22 \r
23 #include <MiniCL/cl_platform.h> //for CL_PLATFORM_MINI_CL definition\r
24 #else\r
25 #include <CL/cl_platform.h> //for CL_PLATFORM_MINI_CL definition\r
26 #endif\r
27 #endif //__APPLE__\r
28 \r
29 \r
30 #include "btOpenCLUtils.h"\r
31 \r
32 #include "btBulletDynamicsCommon.h"\r
33 #include "BulletCollision/CollisionDispatch/btCollisionDispatcher.h"\r
34 #include "BulletCollision/BroadphaseCollision/btSimpleBroadphase.h"\r
35 #include "BulletCollision/CollisionShapes/btCollisionShape.h"\r
36 #include "BulletDynamics/Dynamics/btRigidBody.h"\r
37 #include "BulletDynamics/ConstraintSolver/btSequentialImpulseConstraintSolver.h"\r
38 #include "BulletDynamics/ConstraintSolver/btContactSolverInfo.h"\r
39 #include "LinearMath/btQuickprof.h"\r
40 #include "GlutStuff.h"\r
41 #include "BulletDynamics/ConstraintSolver/btTypedConstraint.h"\r
42 #include "BulletDynamics/ConstraintSolver/btPoint2PointConstraint.h"\r
43 \r
44 #include "btParticlesDynamicsWorld.h"\r
45 #include "GL_DialogWindow.h"\r
46 \r
47 //when loading from disk, you need to remove the 'MSTRINGIFY' line at the start, and ); at the end of the .cl file\r
48 \r
49 #define LOAD_FROM_MEMORY\r
50 #ifdef LOAD_FROM_MEMORY\r
51 #define MSTRINGIFY(A) #A\r
52 static const char* source= \r
53 #include "ParticlesOCL.cl"\r
54 #endif //LOAD_FROM_MEMORY\r
55 \r
56 btParticlesDynamicsWorld::~btParticlesDynamicsWorld()\r
57 {\r
58 }\r
59 \r
60 static int gStepNum = 0;\r
61 \r
62 int     btParticlesDynamicsWorld::stepSimulation( btScalar timeStep, int maxSubSteps, btScalar fixedTimeStep)\r
63 {\r
64         startProfiling(timeStep);\r
65         m_timeStep = timeStep;\r
66         BT_PROFILE("stepSimulation");\r
67 //      printf("Step : %d\n", gStepNum);\r
68         {\r
69                 BT_PROFILE("IntegrateMotion");\r
70                 runIntegrateMotionKernel();\r
71         }\r
72         {\r
73                 runComputeCellIdKernel();\r
74         }\r
75         {\r
76                 BT_PROFILE("SortHash");\r
77                 runSortHashKernel();\r
78         }\r
79         {\r
80                 BT_PROFILE("FindCellStart");\r
81                 runFindCellStartKernel();\r
82         }\r
83         {\r
84                 BT_PROFILE("CollideParticles");\r
85 //              printf("\ncollide particles\n\n");\r
86                 runCollideParticlesKernel();\r
87         }\r
88         gStepNum++;\r
89 \r
90 #ifndef BT_NO_PROFILE\r
91         CProfileManager::Increment_Frame_Counter();\r
92 #endif //BT_NO_PROFILE\r
93         return 1;\r
94 }\r
95 \r
96 static unsigned int getMaxPowOf2(unsigned int num)\r
97 {\r
98         unsigned int maxPowOf2 = 1;\r
99         for(int bit = 1; bit < 32; bit++)\r
100         {\r
101                 if(maxPowOf2 >= num)\r
102                 {\r
103                         break;\r
104                 }\r
105                 maxPowOf2 <<= 1;\r
106         }\r
107         return maxPowOf2;\r
108 }\r
109 \r
110 \r
111 void btParticlesDynamicsWorld::initDeviceData()\r
112 {\r
113         getShapeData();\r
114 }\r
115 \r
116 \r
117 \r
118 void btParticlesDynamicsWorld::postInitDeviceData()\r
119 {\r
120         m_hashSize = getMaxPowOf2(m_numParticles);\r
121         createVBO();\r
122         allocateBuffers();\r
123         adjustGrid();\r
124         grabSimulationData();\r
125 }\r
126 \r
127 \r
128 void btParticlesDynamicsWorld::getShapeData()\r
129 {\r
130         int numObjects = getNumCollisionObjects();\r
131         btCollisionObjectArray& collisionObjects = getCollisionObjectArray();\r
132         for(int i = 0; i < numObjects; i++)\r
133         {\r
134                 btCollisionObject* colObj = collisionObjects[i];\r
135                 btCollisionShape* pShape = colObj->getCollisionShape();\r
136                 int shapeType = pShape->getShapeType();\r
137                 if(shapeType == SPHERE_SHAPE_PROXYTYPE)\r
138                 {\r
139                         btSphereShape* pSph = (btSphereShape*)pShape;\r
140                         btScalar sphRad = pSph->getRadius();\r
141                         if(!i)\r
142                         {\r
143                                 m_particleRad = sphRad;\r
144                         }\r
145                         else\r
146                         {\r
147                                 btAssert(m_particleRad == sphRad);\r
148                         }\r
149                 }\r
150                 else\r
151                 {\r
152                         btAssert(0);\r
153                 }\r
154         }\r
155         printf("Total number of particles : %d\n", m_numParticles);\r
156 }\r
157 \r
158 void btParticlesDynamicsWorld::allocateBuffers()\r
159 {\r
160     cl_int ciErrNum;\r
161         // positions of spheres\r
162         m_hPos.resize(m_numParticles);\r
163         m_hVel.resize(m_numParticles);\r
164         m_hSortedPos.resize(m_numParticles);\r
165         m_hSortedVel.resize(m_numParticles);\r
166         m_hPosHash.resize(m_hashSize); \r
167         for(int i = 0; i < m_hashSize; i++) { m_hPosHash[i].x = 0x7FFFFFFF; m_hPosHash[i].y = 0; }\r
168     unsigned int memSize = sizeof(btVector3) *  m_numParticles;\r
169     m_dPos = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);\r
170     oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
171     m_dVel = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);\r
172     oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
173     m_dSortedPos = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);\r
174     oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
175     m_dSortedVel = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);\r
176     oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
177         memSize = m_hashSize * sizeof(btInt2);\r
178         m_dPosHash = clCreateBuffer(m_cxMainContext, CL_MEM_READ_ONLY, memSize, NULL, &ciErrNum);\r
179     oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
180 \r
181         // global simulation parameters\r
182         memSize = sizeof(btSimParams);\r
183         m_dSimParams = clCreateBuffer(m_cxMainContext, CL_MEM_READ_ONLY, memSize, NULL, &ciErrNum);\r
184     oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
185 }\r
186 \r
187 void btParticlesDynamicsWorld::adjustGrid()\r
188 {\r
189         //btVector3 wmin( BT_LARGE_FLOAT,  BT_LARGE_FLOAT,  BT_LARGE_FLOAT);\r
190         //btVector3 wmax(-BT_LARGE_FLOAT, -BT_LARGE_FLOAT, -BT_LARGE_FLOAT);\r
191 \r
192         btVector3 wmin( BT_LARGE_FLOAT,  BT_LARGE_FLOAT, BT_LARGE_FLOAT);\r
193         btVector3 wmax(-BT_LARGE_FLOAT, -BT_LARGE_FLOAT, -BT_LARGE_FLOAT);\r
194         btVector3 boxDiag(m_particleRad, m_particleRad, m_particleRad);\r
195         for(int i = 0; i < m_numParticles; i++)\r
196         {\r
197                 btVector3 pos = m_hPos[i];\r
198                 btVector3 boxMin = pos - boxDiag;\r
199                 btVector3 boxMax = pos + boxDiag;\r
200                 wmin.setMin(boxMin);\r
201                 wmax.setMax(boxMax);\r
202         }\r
203         m_worldMin = wmin;\r
204         m_worldMax = wmax;\r
205         btVector3 wsize = m_worldMax - m_worldMin;\r
206         wsize[3] = 1.0f;\r
207 \r
208         glBindBufferARB(GL_ARRAY_BUFFER, m_colVbo);\r
209     btVector3* color = (btVector3*)glMapBufferARB(GL_ARRAY_BUFFER, GL_WRITE_ONLY);\r
210     for(int i = 0; i < m_numParticles; i++, color++)\r
211         {\r
212                 *color = (m_hPos[i] - m_worldMin) / wsize;\r
213                 (*color)[3] = 1.f;\r
214         }\r
215     glUnmapBufferARB(GL_ARRAY_BUFFER);\r
216 \r
217 /*\r
218         wsize[0] *= 0.5f;\r
219         wsize[1] *= 0.1f;\r
220         wsize[2] *= 0.5f;\r
221         m_worldMin -= wsize;\r
222         m_worldMax += wsize;\r
223 */\r
224         m_worldMin.setValue(-WORLD_SIZE, -WORLD_SIZE, -WORLD_SIZE);\r
225         m_worldMax.setValue( WORLD_SIZE,  WORLD_SIZE,  WORLD_SIZE);\r
226         wsize = m_worldMax - m_worldMin;\r
227 \r
228         m_cellSize[0] = m_cellSize[1] = m_cellSize[2] = m_particleRad * btScalar(2.f);\r
229 \r
230         m_simParams.m_worldMin[0] = m_worldMin[0];\r
231         m_simParams.m_worldMin[1] = m_worldMin[1];\r
232         m_simParams.m_worldMin[2] = m_worldMin[2];\r
233 \r
234         m_simParams.m_worldMax[0] = m_worldMax[0];\r
235         m_simParams.m_worldMax[1] = m_worldMax[1];\r
236         m_simParams.m_worldMax[2] = m_worldMax[2];\r
237 \r
238         m_simParams.m_cellSize[0] = m_cellSize[0];\r
239         m_simParams.m_cellSize[1] = m_cellSize[1];\r
240         m_simParams.m_cellSize[2] = m_cellSize[2];\r
241 \r
242         m_simParams.m_gridSize[0] = (int)(wsize[0] / m_cellSize[0] + 0.999999f);\r
243         m_simParams.m_gridSize[1] = (int)(wsize[1] / m_cellSize[1] + 0.999999f);\r
244         m_simParams.m_gridSize[2] = (int)(wsize[2] / m_cellSize[2] + 0.999999f);\r
245 \r
246         m_numGridCells = m_simParams.m_gridSize[0] * m_simParams.m_gridSize[1] * m_simParams.m_gridSize[2];\r
247         m_hCellStart.resize(m_numGridCells);\r
248     unsigned int memSize = sizeof(int) *  m_numGridCells;\r
249     cl_int ciErrNum;\r
250         m_dCellStart = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);\r
251     oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
252 \r
253 }\r
254 \r
255 \r
256 void btParticlesDynamicsWorld::grabSimulationData()\r
257 {\r
258 //      const btVector3& gravity = getGravity();\r
259         //btVector3 gravity(0., -0.06, 0.);\r
260         //btVector3 gravity(0., -0.0003f, 0.);\r
261         btVector3 gravity(0,-0.0003,0);\r
262 \r
263         \r
264 \r
265         m_simParams.m_gravity[0] = gravity[0];\r
266         m_simParams.m_gravity[1] = gravity[1];\r
267         m_simParams.m_gravity[2] = gravity[2];\r
268         m_simParams.m_particleRad = m_particleRad;\r
269         m_simParams.m_globalDamping = 1.0f;\r
270         m_simParams.m_boundaryDamping = -0.5f;\r
271 \r
272 //      m_simParams.m_collisionDamping = 0.02f;\r
273 //      m_simParams.m_spring = 0.5f;\r
274 //      m_simParams.m_shear = 0.1f;\r
275 //      m_simParams.m_attraction = 0.0f;\r
276         m_simParams.m_collisionDamping = 0.025f;//0.02f;\r
277         m_simParams.m_spring = 0.5f;\r
278         m_simParams.m_shear = 0.1f;\r
279         m_simParams.m_attraction = 0.001f;\r
280 \r
281 \r
282 \r
283         // copy data to GPU\r
284     cl_int ciErrNum;\r
285         unsigned int memSize = sizeof(btVector3) * m_numParticles;\r
286     ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, memSize, &(m_hPos[0]), 0, NULL, NULL);\r
287     oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
288     ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dVel, CL_TRUE, 0, memSize, &(m_hVel[0]), 0, NULL, NULL);\r
289     oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
290         memSize = sizeof(btSimParams);\r
291     ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dSimParams, CL_TRUE, 0, memSize, &m_simParams, 0, NULL, NULL);\r
292     oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
293         memSize = m_hashSize * sizeof(btInt2);\r
294         ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);\r
295         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
296 }\r
297 \r
298 \r
299 void btParticlesDynamicsWorld::createVBO()\r
300 {\r
301     // create buffer object\r
302     glGenBuffers(1, &m_vbo);\r
303     glBindBuffer(GL_ARRAY_BUFFER, m_vbo);\r
304         // positions of spheres\r
305     unsigned int memSize = sizeof(btVector3) *  m_numParticles;\r
306     glBufferData(GL_ARRAY_BUFFER, memSize, 0, GL_DYNAMIC_DRAW);\r
307         // colors\r
308         GLuint vbo;\r
309     glGenBuffers(1, &vbo);\r
310     glBindBuffer(GL_ARRAY_BUFFER, vbo);\r
311     glBufferData(GL_ARRAY_BUFFER, memSize, 0, GL_DYNAMIC_DRAW);\r
312     glBindBuffer(GL_ARRAY_BUFFER, 0);\r
313     m_colVbo = vbo;\r
314     // fill color buffer\r
315     glBindBufferARB(GL_ARRAY_BUFFER, m_colVbo);\r
316     float *data = (float*)glMapBufferARB(GL_ARRAY_BUFFER, GL_WRITE_ONLY);\r
317     float *ptr = data;\r
318     for(int i = 0; i < m_numParticles; i++) \r
319         {\r
320         float t = i / (float)m_numParticles;\r
321                 ptr[0] = 0.f;\r
322                 ptr[1] = 1.f;\r
323                 ptr[2] = 0.f;\r
324         ptr+=3;\r
325         *ptr++ = 1.0f;\r
326     }\r
327     glUnmapBufferARB(GL_ARRAY_BUFFER);\r
328         glBindBufferARB(GL_ARRAY_BUFFER, 0);\r
329 }\r
330 \r
331 \r
332 \r
333 void btParticlesDynamicsWorld::initCLKernels(int argc, char** argv)\r
334 {\r
335     cl_int ciErrNum;\r
336 \r
337         if (!m_cxMainContext)\r
338         {\r
339                 \r
340                 cl_device_type deviceType = CL_DEVICE_TYPE_ALL;\r
341                 m_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0, 0);\r
342         \r
343                 int numDev = btOpenCLUtils::getNumDevices(m_cxMainContext);\r
344                 if (!numDev)\r
345                 {\r
346                         btAssert(0);\r
347                         exit(0);//this is just a demo, exit now\r
348                 }\r
349 \r
350                 m_cdDevice =  btOpenCLUtils::getDevice(m_cxMainContext,0);\r
351         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
352 \r
353                 btOpenCLDeviceInfo clInfo;\r
354                 btOpenCLUtils::getDeviceInfo(m_cdDevice,clInfo);\r
355                 btOpenCLUtils::printDeviceInfo(m_cdDevice);\r
356 \r
357                 // create a command-queue\r
358                 m_cqCommandQue = clCreateCommandQueue(m_cxMainContext, m_cdDevice, 0, &ciErrNum);\r
359                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
360         }\r
361         // Program Setup\r
362         size_t program_length;\r
363 \r
364 \r
365 #ifdef LOAD_FROM_MEMORY\r
366         program_length = strlen(source);\r
367         printf("OpenCL compiles ParticlesOCL.cl ... ");\r
368 #else\r
369 \r
370         const char* fileName = "ParticlesOCL.cl";\r
371         FILE * fp = fopen(fileName, "rb");\r
372         char newFileName[512];\r
373         \r
374         if (fp == NULL)\r
375         {\r
376                 sprintf(newFileName,"..//%s",fileName);\r
377                 fp = fopen(newFileName, "rb");\r
378                 if (fp)\r
379                         fileName = newFileName;\r
380         }\r
381         \r
382         if (fp == NULL)\r
383         {\r
384                 sprintf(newFileName,"Demos//ParticlesOpenCL//%s",fileName);\r
385                 fp = fopen(newFileName, "rb");\r
386                 if (fp)\r
387                         fileName = newFileName;\r
388         }\r
389 \r
390         if (fp == NULL)\r
391         {\r
392                 sprintf(newFileName,"..//..//..//..//..//Demos//ParticlesOpenCL//%s",fileName);\r
393                 fp = fopen(newFileName, "rb");\r
394                 if (fp)\r
395                         fileName = newFileName;\r
396                 else\r
397                 {\r
398                         printf("cannot find %s\n",newFileName);\r
399                         exit(0);\r
400                 }\r
401         }\r
402 \r
403 //      char *source = oclLoadProgSource(".//Demos//SpheresGrid//SpheresGrid.cl", "", &program_length);\r
404         //char *source = btOclLoadProgSource(".//Demos//SpheresOpenCL//Shared//SpheresGrid.cl", "", &program_length);\r
405 \r
406         char *source = btOclLoadProgSource(fileName, "", &program_length);\r
407         if(source == NULL)\r
408         {\r
409                 printf("ERROR : OpenCL can't load file %s\n", fileName);\r
410         }\r
411 //      oclCHECKERROR (source == NULL, oclFALSE);   \r
412         btAssert(source != NULL);\r
413 \r
414         // create the program\r
415         printf("OpenCL compiles %s ...", fileName);\r
416 \r
417 #endif //LOAD_FROM_MEMORY\r
418 \r
419 \r
420         //printf("%s\n", source);\r
421 \r
422         m_cpProgram = clCreateProgramWithSource(m_cxMainContext, 1, (const char**)&source, &program_length, &ciErrNum);\r
423         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
424 #ifndef LOAD_FROM_MEMORY\r
425         free(source);\r
426 #endif //LOAD_FROM_MEMORY\r
427 \r
428         //#define LOCAL_SIZE_LIMIT 1024U\r
429 #define LOCAL_SIZE_MAX 1024U\r
430 \r
431                     // Build the program with 'mad' Optimization option\r
432 #ifdef MAC\r
433         const char* flags = "-I. -DLOCAL_SIZE_MAX=1024U -cl-mad-enable -DMAC -DGUID_ARG";\r
434 #else\r
435         const char* flags = "-I. -DLOCAL_SIZE_MAX=1024U -DGUID_ARG= ";\r
436 #endif\r
437         // build the program\r
438         ciErrNum = clBuildProgram(m_cpProgram, 0, NULL, flags, NULL, NULL);\r
439         if(ciErrNum != CL_SUCCESS)\r
440         {\r
441                 // write out standard error\r
442 //              oclLog(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR);\r
443                 // write out the build log and ptx, then exit\r
444                 char cBuildLog[10240];\r
445 //              char* cPtx;\r
446 //              size_t szPtxLength;\r
447                 clGetProgramBuildInfo(m_cpProgram, m_cdDevice, CL_PROGRAM_BUILD_LOG, \r
448                                                           sizeof(cBuildLog), cBuildLog, NULL );\r
449 //              oclGetProgBinary(m_cpProgram, oclGetFirstDev(m_cxMainContext), &cPtx, &szPtxLength);\r
450 //              oclLog(LOGBOTH | CLOSELOG, 0.0, "\n\nLog:\n%s\n\n\n\n\nPtx:\n%s\n\n\n", cBuildLog, cPtx);\r
451                 printf("\n\n%s\n\n\n", cBuildLog);\r
452                 printf("Press ENTER key to terminate the program\n");\r
453                 getchar();\r
454                 exit(-1); \r
455         }\r
456         printf("OK\n");\r
457 \r
458         // create the kernels\r
459 \r
460         postInitDeviceData();\r
461 \r
462         initKernel(PARTICLES_KERNEL_COMPUTE_CELL_ID, "kComputeCellId");\r
463         ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 1, sizeof(cl_mem), (void*) &m_dPos);\r
464         ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 2, sizeof(cl_mem), (void*) &m_dPosHash);\r
465         ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 3, sizeof(cl_mem), (void*) &m_dSimParams);\r
466         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
467 \r
468         initKernel(PARTICLES_KERNEL_INTEGRATE_MOTION, "kIntegrateMotion");\r
469         ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 1, sizeof(cl_mem), (void *) &m_dPos);\r
470         ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 2, sizeof(cl_mem), (void *) &m_dVel);\r
471         ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 3, sizeof(cl_mem), (void *) &m_dSimParams);\r
472         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
473 \r
474 \r
475         initKernel(PARTICLES_KERNEL_CLEAR_CELL_START, "kClearCellStart");\r
476         ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 0, sizeof(int),               (void *) &m_numGridCells);\r
477         ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 1, sizeof(cl_mem),    (void*) &m_dCellStart);\r
478 \r
479         initKernel(PARTICLES_KERNEL_FIND_CELL_START, "kFindCellStart");\r
480 //      ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 0, sizeof(int),        (void*) &m_numParticles);\r
481         ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 1, sizeof(cl_mem),     (void*) &m_dPosHash);\r
482         ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 2, sizeof(cl_mem),     (void*) &m_dCellStart);\r
483         ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 3, sizeof(cl_mem),     (void*) &m_dPos);\r
484         ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 4, sizeof(cl_mem),     (void*) &m_dVel);\r
485         ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 5, sizeof(cl_mem),     (void*) &m_dSortedPos);\r
486         ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 6, sizeof(cl_mem),     (void*) &m_dSortedVel);\r
487         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
488 \r
489         initKernel(PARTICLES_KERNEL_COLLIDE_PARTICLES, "kCollideParticles");\r
490         ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 1, sizeof(cl_mem),   (void*) &m_dVel);\r
491         ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 2, sizeof(cl_mem),   (void*) &m_dSortedPos);\r
492         ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 3, sizeof(cl_mem),   (void*) &m_dSortedVel);\r
493         ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 4, sizeof(cl_mem),   (void*) &m_dPosHash);\r
494         ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 5, sizeof(cl_mem),   (void*) &m_dCellStart);\r
495         ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 6, sizeof(cl_mem),   (void*) &m_dSimParams);\r
496 \r
497         initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL, "kBitonicSortCellIdLocal");\r
498         initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL_1, "kBitonicSortCellIdLocal1");\r
499         initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL, "kBitonicSortCellIdMergeGlobal");\r
500         initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL, "kBitonicSortCellIdMergeLocal");\r
501 }\r
502 \r
503 static btInt4 cpu_getGridPos(btVector3& worldPos, btSimParams* pParams)\r
504 {\r
505     btInt4 gridPos;\r
506         gridPos.x = (int)floor((worldPos[0] - pParams->m_worldMin[0]) / pParams->m_cellSize[0]);\r
507         gridPos.y = (int)floor((worldPos[1] - pParams->m_worldMin[1]) / pParams->m_cellSize[1]);\r
508         gridPos.z = (int)floor((worldPos[2] - pParams->m_worldMin[2]) / pParams->m_cellSize[2]);\r
509     return gridPos;\r
510 }\r
511 \r
512 static unsigned int cpu_getPosHash(btInt4& gridPos, btSimParams* pParams)\r
513 {\r
514         btInt4 gridDim = *((btInt4*)(pParams->m_gridSize));\r
515         if(gridPos.x < 0) gridPos.x = 0;\r
516         if(gridPos.x >= gridDim.x) gridPos.x = gridDim.x - 1;\r
517         if(gridPos.y < 0) gridPos.y = 0;\r
518         if(gridPos.y >= gridDim.y) gridPos.y = gridDim.y - 1;\r
519         if(gridPos.z < 0) gridPos.z = 0;\r
520         if(gridPos.z >= gridDim.z) gridPos.z = gridDim.z - 1;\r
521         unsigned int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\r
522         return hash;\r
523\r
524 \r
525 \r
526 \r
527 \r
528 void btParticlesDynamicsWorld::runComputeCellIdKernel()\r
529 {\r
530     cl_int ciErrNum;\r
531 #if 0\r
532         if(m_useCpuControls[SIMSTAGE_COMPUTE_CELL_ID]->m_active)\r
533         {       // CPU version\r
534                 unsigned int memSize = sizeof(btVector3) * m_numParticles;\r
535                 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, memSize, &(m_hPos[0]), 0, NULL, NULL);\r
536                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
537                 for(int index = 0; index < m_numParticles; index++)\r
538                 {\r
539                         btVector3 pos = m_hPos[index];\r
540                         btInt4 gridPos = cpu_getGridPos(pos, &m_simParams);\r
541                         unsigned int hash = cpu_getPosHash(gridPos, &m_simParams);\r
542                         m_hPosHash[index].x = hash;\r
543                         m_hPosHash[index].y = index;\r
544                 }\r
545                 memSize = sizeof(btInt2) * m_numParticles;\r
546                 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);\r
547                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
548         }\r
549         else\r
550 #endif\r
551         {\r
552                 BT_PROFILE("ComputeCellId");\r
553                 runKernelWithWorkgroupSize(PARTICLES_KERNEL_COMPUTE_CELL_ID, m_numParticles);\r
554                 ciErrNum = clFinish(m_cqCommandQue);\r
555                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
556         }\r
557 /*\r
558         // check\r
559         int memSize = sizeof(btInt2) * m_hashSize;\r
560     ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);\r
561     oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
562 \r
563         memSize = sizeof(float) * 4 * m_numParticles;\r
564     ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, memSize, &(m_hPos[0]), 0, NULL, NULL);\r
565     oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
566 */\r
567 \r
568         {\r
569                 BT_PROFILE("Copy VBO");\r
570                 // Explicit Copy (until OpenGL interop will work)\r
571                 // map the PBO to copy data from the CL buffer via host\r
572                 glBindBufferARB(GL_ARRAY_BUFFER, m_vbo);    \r
573                 // map the buffer object into client's memory\r
574                 void* ptr = glMapBufferARB(GL_ARRAY_BUFFER, GL_WRITE_ONLY_ARB);\r
575                 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, sizeof(float) * 4 * m_numParticles, ptr, 0, NULL, NULL);\r
576                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
577                 glUnmapBufferARB(GL_ARRAY_BUFFER); \r
578                 glBindBufferARB(GL_ARRAY_BUFFER,0);\r
579         }\r
580 }\r
581 \r
582 \r
583 \r
584 static btVector3 cpu_collideTwoParticles(\r
585     btVector3& posA,\r
586     btVector3& posB,\r
587     btVector3& velA,\r
588     btVector3& velB,\r
589     float radiusA,\r
590     float radiusB,\r
591     float spring,\r
592     float damping,\r
593     float shear,\r
594     float attraction\r
595 )\r
596 {\r
597     //Calculate relative position\r
598     btVector3  relPos = posB - posA; relPos[3] = 0.f;\r
599     float        dist = sqrt(relPos[0] * relPos[0] + relPos[1] * relPos[1] + relPos[2] * relPos[2]);\r
600     float collideDist = radiusA + radiusB;\r
601 \r
602     btVector3 force = btVector3(0, 0, 0);\r
603     if(dist < collideDist)\r
604         {\r
605         btVector3 norm = relPos / dist;\r
606 \r
607         //Relative velocity\r
608         btVector3 relVel = velB - velA; relVel[3] = 0.f;;\r
609 \r
610         //Relative tangential velocity\r
611         float relVelDotNorm = relVel.dot(norm);\r
612                 btVector3 tanVel = relVel - relVelDotNorm * norm; \r
613         //Spring force (potential)\r
614         //float springFactor = -spring * (collideDist - dist);\r
615                 float springFactor = -0.4 * (collideDist - dist);\r
616                 force = springFactor * norm + damping * relVel;// + shear * tanVel + attraction * relPos;\r
617     }\r
618     return force;\r
619 }\r
620 \r
621 struct btPair\r
622 {\r
623         union\r
624         {\r
625                 int value;\r
626                 short v0[2];\r
627         };\r
628 };\r
629 \r
630 void btParticlesDynamicsWorld::runCollideParticlesKernel()\r
631 {\r
632         btAlignedObjectArray<int>       pairs;\r
633 \r
634         float particleRad = m_simParams.m_particleRad;\r
635         float collideDist2 = (particleRad + particleRad)*(particleRad + particleRad);\r
636         cl_int ciErrNum;\r
637         if(m_useCpuControls[SIMSTAGE_COLLIDE_PARTICLES]->m_active)\r
638         {       // CPU version\r
639                 int memSize = sizeof(btVector3) * m_numParticles;\r
640                 {\r
641                         BT_PROFILE("Copy from GPU");\r
642                         ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dSortedPos, CL_TRUE, 0, memSize, &(m_hSortedPos[0]), 0, NULL, NULL);\r
643                         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
644                         ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dSortedVel, CL_TRUE, 0, memSize, &(m_hSortedVel[0]), 0, NULL, NULL);\r
645                         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
646                         memSize = sizeof(btInt2) * m_numParticles;\r
647                         ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);\r
648                         memSize = m_numGridCells * sizeof(int);\r
649                         ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dCellStart, CL_TRUE, 0, memSize, &(m_hCellStart[0]), 0, NULL, NULL);\r
650                         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
651                 }\r
652 \r
653                 for(int index = 0; index < m_numParticles; index++)\r
654                 {\r
655                         btVector3 posA = m_hSortedPos[index];\r
656                         btVector3 velA = m_hSortedVel[index];\r
657                         btVector3 force = btVector3(0, 0, 0);\r
658                         float particleRad = m_simParams.m_particleRad;\r
659                         float collisionDamping = m_simParams.m_collisionDamping;\r
660                         float spring = m_simParams.m_spring;\r
661                         float shear = m_simParams.m_shear;\r
662                         float attraction = m_simParams.m_attraction;\r
663                         int unsortedIndex = m_hPosHash[index].y;\r
664                         //Get address in grid\r
665                         btInt4 gridPosA = cpu_getGridPos(posA, &m_simParams);\r
666                         //Accumulate surrounding cells\r
667                         btInt4 gridPosB; \r
668                         for(int z = -1; z <= 1; z++)\r
669                         {\r
670                                 gridPosB.z = gridPosA.z + z;\r
671                                 for(int y = -1; y <= 1; y++)\r
672                                 {\r
673                                         gridPosB.y = gridPosA.y + y;\r
674                                         for(int x = -1; x <= 1; x++)\r
675                                         {\r
676                                                 gridPosB.x = gridPosA.x + x;\r
677                                                 //Get start particle index for this cell\r
678                                                 unsigned int hashB = cpu_getPosHash(gridPosB, &m_simParams);\r
679                                                 int startI = m_hCellStart[hashB];\r
680                                                 //Skip empty cell\r
681                                                 if(startI < 0)\r
682                                                 {\r
683                                                         continue;\r
684                                                 }\r
685                                                 //Iterate over particles in this cell\r
686                                                 int endI = startI + 32;\r
687                                                 if(endI > m_numParticles) \r
688                                                         endI = m_numParticles;\r
689 \r
690                                                 for(int j = startI; j < endI; j++)\r
691                                                 {\r
692                                                         unsigned int hashC = m_hPosHash[j].x;\r
693                                                         if(hashC != hashB)\r
694                                                         {\r
695                                                                 break;\r
696                                                         }\r
697                                                         if(j == index)\r
698                                                         {\r
699                                                                 continue;\r
700                                                         }\r
701 \r
702                                                         btPair pair;\r
703                                                         pair.v0[0] = index;\r
704                                                         pair.v0[1] = j;\r
705                                                         pairs.push_back(pair.value);\r
706 \r
707 //                                                      printf("index=%d, j=%d\n",index,j);\r
708 //                                                      printf("(index=%d, j=%d) ",index,j);\r
709                                                         btVector3 posB = m_hSortedPos[j];\r
710                                                         btVector3 velB = m_hSortedVel[j];\r
711                                                         //Collide two spheres\r
712                                                         force += cpu_collideTwoParticles(       posA, posB, velA, velB, particleRad, particleRad, \r
713                                                                                                                                 spring, collisionDamping, shear, attraction);\r
714                                                 }\r
715                                         }\r
716                                 }\r
717                         }     \r
718                         //Write new velocity back to original unsorted location\r
719                         m_hVel[unsortedIndex] = velA + force;\r
720                 }       \r
721 \r
722 //#define BRUTE_FORCE_CHECK 1\r
723 #ifdef BRUTE_FORCE_CHECK\r
724                 for(int index = 0; index < m_numParticles; index++)\r
725                 {\r
726                         btVector3 posA = m_hSortedPos[index];\r
727                         btVector3 velA = m_hSortedVel[index];\r
728                         btVector3 force = btVector3(0, 0, 0);\r
729                         int unsortedIndex = m_hPosHash[index].y;\r
730                         \r
731                         float collisionDamping = m_simParams.m_collisionDamping;\r
732                         float spring = m_simParams.m_spring;\r
733                         float shear = m_simParams.m_shear;\r
734                         float attraction = m_simParams.m_attraction;\r
735                         for(int j = 0 ; j < m_numParticles; j++)\r
736                         {\r
737                                 if (index!=j)\r
738                                 {\r
739                                         btVector3 posB = m_hSortedPos[j];\r
740                                         btVector3 velB = m_hSortedVel[j];\r
741 \r
742 \r
743                                         btVector3  relPos = posB - posA; relPos[3] = 0.f;\r
744                                         float        dist2 = (relPos[0] * relPos[0] + relPos[1] * relPos[1] + relPos[2] * relPos[2]);\r
745                                         \r
746 \r
747                                         \r
748                                         if(dist2 < collideDist2)\r
749                                         {\r
750                                                                                 //Collide two spheres\r
751                                                 //                              force += cpu_collideTwoParticles(       posA, posB, velA, velB, particleRad, particleRad, \r
752                                                 //                                                                                                      spring, collisionDamping, shear, attraction);\r
753 \r
754                                                 btPair pair;\r
755                                                 pair.v0[0] = index;\r
756                                                 pair.v0[1] = j;\r
757                                                 if (pairs.findLinearSearch(pair.value)==pairs.size())\r
758                                                 {\r
759                                                         printf("not found index=%d, j=%d\n",index,j);\r
760                                                 } \r
761 \r
762                                                                                 \r
763                                         }\r
764                                 }\r
765                         }\r
766                         //Write new velocity back to original unsorted location\r
767                         //m_hVel[unsortedIndex] = velA + force;\r
768                 }\r
769 #endif //BRUTE_FORCE_CHECK\r
770 \r
771                 memSize = sizeof(btVector3) * m_numParticles;\r
772                 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dVel, CL_TRUE, 0, memSize, &(m_hVel[0]), 0, NULL, NULL);\r
773                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
774         }\r
775         else\r
776         {\r
777                 runKernelWithWorkgroupSize(PARTICLES_KERNEL_COLLIDE_PARTICLES, m_numParticles);\r
778                 cl_int ciErrNum = clFinish(m_cqCommandQue);\r
779                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
780         }\r
781 }\r
782 \r
783 \r
784 void btParticlesDynamicsWorld::runIntegrateMotionKernel()\r
785 {\r
786     cl_int ciErrNum;\r
787         if(m_useCpuControls[SIMSTAGE_INTEGRATE_MOTION]->m_active)\r
788         {\r
789                 // CPU version\r
790 #if 1\r
791                 // read from GPU\r
792                 unsigned int memSize = sizeof(btVector3) * m_numParticles;\r
793                 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, memSize, &(m_hPos[0]), 0, NULL, NULL);\r
794                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
795                 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dVel, CL_TRUE, 0, memSize, &(m_hVel[0]), 0, NULL, NULL);\r
796                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
797                 for(int index = 0; index < m_numParticles; index++)\r
798                 {\r
799                         btVector3 pos = m_hPos[index];\r
800                         btVector3 vel = m_hVel[index];\r
801                         pos[3] = 1.0f;\r
802                         vel[3] = 0.0f;\r
803                         // apply gravity\r
804                         btVector3 gravity;\r
805                         gravity[0] = m_simParams.m_gravity[0];\r
806                         gravity[1] = m_simParams.m_gravity[1];\r
807                         gravity[2] = m_simParams.m_gravity[2];\r
808 \r
809                         float particleRad = m_simParams.m_particleRad;\r
810                         float globalDamping = m_simParams.m_globalDamping;\r
811                         float boundaryDamping = m_simParams.m_boundaryDamping;\r
812                         vel += gravity * m_timeStep;\r
813                         vel *= globalDamping;\r
814                         // integrate position\r
815                         pos += vel * m_timeStep;\r
816                         // collide with world boundaries\r
817                         btVector3 worldMin;\r
818                         worldMin[0] = m_simParams.m_worldMin[0];\r
819                         worldMin[1] = m_simParams.m_worldMin[1];\r
820                         worldMin[2] = m_simParams.m_worldMin[2];\r
821 \r
822                         btVector3 worldMax;\r
823                         worldMax[0] = m_simParams.m_worldMax[0];\r
824                         worldMax[1] = m_simParams.m_worldMax[1];\r
825                         worldMax[2] = m_simParams.m_worldMax[2];\r
826 \r
827                         for(int j = 0; j < 3; j++)\r
828                         {\r
829                                 if(pos[j] < (worldMin[j] + particleRad))\r
830                                 {\r
831                                         pos[j] = worldMin[j] + particleRad;\r
832                                         vel[j] *= boundaryDamping;\r
833                                 }\r
834                                 if(pos[j] > (worldMax[j] - particleRad))\r
835                                 {\r
836                                         pos[j] = worldMax[j] - particleRad;\r
837                                         vel[j] *= boundaryDamping;\r
838                                 }\r
839                         }\r
840                         // write back position and velocity\r
841                         m_hPos[index] = pos;\r
842                         m_hVel[index] = vel;\r
843                 }\r
844 #endif\r
845                 // write back to GPU\r
846                 memSize = sizeof(btVector3) * m_numParticles;\r
847                 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, memSize, &(m_hPos[0]), 0, NULL, NULL);\r
848                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
849                 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dVel, CL_TRUE, 0, memSize, &(m_hVel[0]), 0, NULL, NULL);\r
850                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
851         }\r
852         else\r
853         {\r
854                 // Set work size and execute the kernel\r
855                 ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 4, sizeof(float), &m_timeStep);\r
856                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
857                 runKernelWithWorkgroupSize(PARTICLES_KERNEL_INTEGRATE_MOTION, m_numParticles);\r
858                 ciErrNum = clFinish(m_cqCommandQue);\r
859                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
860         }\r
861 }\r
862 \r
863 void btParticlesDynamicsWorld::runSortHashKernel()\r
864 {\r
865         cl_int ciErrNum;\r
866         int memSize = m_numParticles * sizeof(btInt2);\r
867         if(m_useCpuControls[SIMSTAGE_SORT_CELL_ID]->m_active)\r
868         {\r
869                 // CPU version\r
870                 // get hash from GPU\r
871                 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);\r
872                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
873                 // sort\r
874                 class btHashPosKey\r
875                 {\r
876                 public:\r
877                         unsigned int hash;\r
878                         unsigned int index;\r
879                         void quickSort(btHashPosKey* pData, int lo, int hi)\r
880                         {\r
881                                 int i=lo, j=hi;\r
882                                 btHashPosKey x = pData[(lo+hi)/2];\r
883                                 do\r
884                                 {    \r
885                                         while(pData[i].hash < x.hash) i++; \r
886                                         while(x.hash < pData[j].hash) j--;\r
887                                         if(i <= j)\r
888                                         {\r
889                                                 btHashPosKey t = pData[i];\r
890                                                 pData[i] = pData[j];\r
891                                                 pData[j] = t;\r
892                                                 i++; j--;\r
893                                         }\r
894                                 } while(i <= j);\r
895                                 if(lo < j) pData->quickSort(pData, lo, j);\r
896                                 if(i < hi) pData->quickSort(pData, i, hi);\r
897                         }\r
898                         void bitonicSort(btHashPosKey* pData, int lo, int n, bool dir)\r
899                         {\r
900                                 if(n > 1)\r
901                                 {\r
902                                         int m = n / 2;\r
903                                         bitonicSort(pData, lo, m, !dir);\r
904                                         bitonicSort(pData, lo + m, n - m, dir);\r
905                                         bitonicMerge(pData, lo, n, dir);\r
906                                 }\r
907                         }\r
908                         void bitonicMerge(btHashPosKey* pData, int lo, int n, bool dir)\r
909                         {\r
910                                 if(n > 1)\r
911                                 {\r
912                                         int m = greatestPowerOfTwoLessThan(n);\r
913                                         for(int i = lo; i < (lo + n - m); i++)\r
914                                         {\r
915                                                 compare(pData, i, i + m, dir);\r
916                                         }\r
917                                         bitonicMerge(pData, lo, m, dir);\r
918                                         bitonicMerge(pData, lo + m, n - m, dir);\r
919                                 }\r
920                         }\r
921                         void compare(btHashPosKey* pData, int i, int j, bool dir)\r
922                         {\r
923                                 if(dir == (pData[i].hash > pData[j].hash))\r
924                                 {\r
925                                         btHashPosKey t = pData[i];\r
926                                         pData[i] = pData[j];\r
927                                         pData[j] = t;\r
928                                 }\r
929                         }\r
930                         int greatestPowerOfTwoLessThan(int n)\r
931                         {\r
932                                 int k = 1;\r
933                                 while(k < n)\r
934                                 {\r
935                                         k = k << 1;\r
936                                 }\r
937                                 return k>>1;\r
938                         }\r
939                 };\r
940                 btHashPosKey* pHash = (btHashPosKey*)(&m_hPosHash[0]);\r
941                 pHash->quickSort(pHash, 0, m_numParticles-1 );\r
942         //      pHash->bitonicSort(pHash, 0, m_hashSize, true);\r
943                 // write back to GPU\r
944                 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);\r
945                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
946         }\r
947         else\r
948         {\r
949                  // bitonic sort on GPU (shared memory) \r
950                 int dir = 1;\r
951                 bitonicSortNv(m_dPosHash, 1, m_hashSize, dir);\r
952                 ciErrNum = clFinish(m_cqCommandQue);\r
953                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
954         }\r
955 #if 0\r
956         // check order\r
957         memSize = m_numParticles * sizeof(btInt2);\r
958         ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);\r
959         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
960         for(int i = 1; i < m_hashSize; i++)\r
961         {\r
962                 if(m_hPosHash[i-1].x > m_hPosHash[i].x)\r
963                 {\r
964                         printf("Hash sort error at %d\n", i);\r
965                 }\r
966         }\r
967 #endif\r
968 }\r
969 \r
970 \r
971 void btParticlesDynamicsWorld::runFindCellStartKernel()\r
972 {\r
973     cl_int ciErrNum;\r
974         if(m_useCpuControls[SIMSTAGE_FIND_CELL_START]->m_active)\r
975         {\r
976                 // CPU version\r
977                 // get hash from GPU\r
978                 int memSize = m_numParticles * sizeof(btInt2);\r
979                 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL);\r
980                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
981                 memSize = sizeof(btVector3) * m_numParticles;\r
982                 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, memSize, &(m_hPos[0]), 0, NULL, NULL);\r
983                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
984                 ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dVel, CL_TRUE, 0, memSize, &(m_hVel[0]), 0, NULL, NULL);\r
985                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
986                 // clear cells\r
987                 for(int i = 0; i < m_numGridCells; i++)\r
988                 {\r
989                         m_hCellStart[i] = -1;\r
990                 }\r
991                 // find start of each cell in sorted hash\r
992                 btInt2 hash = m_hPosHash[0];\r
993                 m_hCellStart[hash.x] = 0;\r
994                 int unsortedIndex = hash.y;\r
995                 btVector3 pos = m_hPos[unsortedIndex];\r
996                 btVector3 vel = m_hVel[unsortedIndex];\r
997                 m_hSortedPos[0] = pos;\r
998                 m_hSortedVel[0] = vel;\r
999                 for(int i = 1; i < m_numParticles; i++)\r
1000                 {\r
1001                         if(m_hPosHash[i-1].x != m_hPosHash[i].x)\r
1002                         {\r
1003                                 m_hCellStart[m_hPosHash[i].x] = i;\r
1004                         }\r
1005                         unsortedIndex = m_hPosHash[i].y;\r
1006                         pos = m_hPos[unsortedIndex];\r
1007                         vel = m_hVel[unsortedIndex];\r
1008                         m_hSortedPos[i] = pos;\r
1009                         m_hSortedVel[i] = vel;\r
1010                 }\r
1011                 // write back to GPU\r
1012                 memSize = m_numGridCells * sizeof(int);\r
1013                 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dCellStart, CL_TRUE, 0, memSize, &(m_hCellStart[0]), 0, NULL, NULL);\r
1014                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
1015                 memSize = sizeof(btVector3) * m_numParticles;\r
1016                 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dSortedPos, CL_TRUE, 0, memSize, &(m_hSortedPos[0]), 0, NULL, NULL);\r
1017                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
1018                 ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dSortedVel, CL_TRUE, 0, memSize, &(m_hSortedVel[0]), 0, NULL, NULL);\r
1019                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
1020         }\r
1021         else\r
1022         {       // GPU\r
1023                 runKernelWithWorkgroupSize(PARTICLES_KERNEL_CLEAR_CELL_START, m_numGridCells);\r
1024                 runKernelWithWorkgroupSize(PARTICLES_KERNEL_FIND_CELL_START, m_numParticles);\r
1025                 ciErrNum = clFinish(m_cqCommandQue);\r
1026                 oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
1027         }\r
1028 }\r
1029 \r
1030 \r
1031 void btParticlesDynamicsWorld::initKernel(int kernelId, const char* pName)\r
1032 {\r
1033         \r
1034         cl_int ciErrNum;\r
1035         cl_kernel kernel = clCreateKernel(m_cpProgram, pName, &ciErrNum);\r
1036         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
1037         size_t wgSize;\r
1038         ciErrNum = clGetKernelWorkGroupInfo(kernel, m_cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);\r
1039         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
1040         \r
1041 \r
1042         \r
1043 \r
1044 //      if (wgSize > 256)\r
1045 //              wgSize = 256;\r
1046 \r
1047         if (wgSize > 512)\r
1048                 wgSize = 512;\r
1049 \r
1050 //      if (wgSize > 1024)\r
1051 //              wgSize = 1024;\r
1052 \r
1053         m_kernels[kernelId].m_Id = kernelId;\r
1054         m_kernels[kernelId].m_kernel = kernel;\r
1055         m_kernels[kernelId].m_name = pName;\r
1056         m_kernels[kernelId].m_workgroupSize = wgSize;\r
1057 \r
1058         return;\r
1059 }\r
1060 \r
1061 void btParticlesDynamicsWorld::runKernelWithWorkgroupSize(int kernelId, int globalSize)\r
1062 {\r
1063         if(globalSize <= 0)\r
1064         {\r
1065                 return;\r
1066         }\r
1067         cl_kernel kernelFunc = m_kernels[kernelId].m_kernel;\r
1068         cl_int ciErrNum = clSetKernelArg(kernelFunc, 0, sizeof(int), (void*)&globalSize);\r
1069         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
1070         int workgroupSize = m_kernels[kernelId].m_workgroupSize;\r
1071         if(workgroupSize <= 0)\r
1072         { // let OpenCL library calculate workgroup size\r
1073                 size_t globalWorkSize[2];\r
1074                 globalWorkSize[0] = globalSize;\r
1075                 globalWorkSize[1] = 1;\r
1076                 ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, kernelFunc, 1, NULL, globalWorkSize, NULL, 0,0,0 );\r
1077         }\r
1078         else\r
1079         {\r
1080                 size_t localWorkSize[2], globalWorkSize[2];\r
1081                 workgroupSize = btMin(workgroupSize, globalSize);\r
1082                 int num_t = globalSize / workgroupSize;\r
1083                 int num_g = num_t * workgroupSize;\r
1084                 if(num_g < globalSize)\r
1085                 {\r
1086                         num_t++;\r
1087                 }\r
1088                 localWorkSize[0]  = workgroupSize;\r
1089                 globalWorkSize[0] = num_t * workgroupSize;\r
1090                 localWorkSize[1] = 1;\r
1091                 globalWorkSize[1] = 1;\r
1092                 ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, kernelFunc, 1, NULL, globalWorkSize, localWorkSize, 0,0,0 );\r
1093         }\r
1094         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
1095 }\r
1096 \r
1097 \r
1098 //Note: logically shared with BitonicSort OpenCL code!\r
1099 // TODO : get parameter from OpenCL and pass it to kernel (needed for platforms other than NVIDIA)\r
1100 //static const unsigned int LOCAL_SIZE_LIMIT = 1024U;\r
1101 \r
1102 void btParticlesDynamicsWorld::bitonicSortNv(cl_mem pKey, unsigned int batch, unsigned int arrayLength, unsigned int dir)\r
1103 {\r
1104         unsigned int localSizeLimit = m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL].m_workgroupSize * 2;\r
1105     if(arrayLength < 2)\r
1106         return;\r
1107     //Only power-of-two array lengths are supported so far\r
1108     dir = (dir != 0);\r
1109     cl_int ciErrNum;\r
1110     size_t localWorkSize, globalWorkSize;\r
1111     if(arrayLength <= localSizeLimit)\r
1112     {\r
1113         btAssert( (batch * arrayLength) % localSizeLimit == 0);\r
1114         //Launch bitonicSortLocal\r
1115                 ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL].m_kernel, 0,   sizeof(cl_mem), (void *)&pKey);\r
1116         ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL].m_kernel, 1,  sizeof(cl_uint), (void *)&arrayLength);\r
1117         ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL].m_kernel, 2,  sizeof(cl_uint), (void *)&dir);\r
1118         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
1119 \r
1120         localWorkSize  = localSizeLimit / 2;\r
1121         globalWorkSize = batch * arrayLength / 2;\r
1122         ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL].m_kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);\r
1123         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
1124     }\r
1125     else\r
1126     {\r
1127         //Launch bitonicSortLocal1\r
1128         ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL_1].m_kernel, 0,  sizeof(cl_mem), (void *)&pKey);\r
1129         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
1130 \r
1131         localWorkSize  = localSizeLimit / 2;\r
1132         globalWorkSize = batch * arrayLength / 2;\r
1133         ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL_1].m_kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);\r
1134         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
1135 \r
1136         for(unsigned int size = 2 * localSizeLimit; size <= arrayLength; size <<= 1)\r
1137         {\r
1138             for(unsigned stride = size / 2; stride > 0; stride >>= 1)\r
1139             {\r
1140                 if(stride >= localSizeLimit)\r
1141                 {\r
1142                     //Launch bitonicMergeGlobal\r
1143                     ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL].m_kernel, 0,  sizeof(cl_mem), (void *)&pKey);\r
1144                     ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL].m_kernel, 1, sizeof(cl_uint), (void *)&arrayLength);\r
1145                     ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL].m_kernel, 2, sizeof(cl_uint), (void *)&size);\r
1146                     ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL].m_kernel, 3, sizeof(cl_uint), (void *)&stride);\r
1147                     ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL].m_kernel, 4, sizeof(cl_uint), (void *)&dir);\r
1148                                         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
1149 \r
1150                     localWorkSize  = localSizeLimit / 4;\r
1151                     globalWorkSize = batch * arrayLength / 2;\r
1152 \r
1153                     ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL].m_kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);\r
1154                                         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
1155                 }\r
1156                 else\r
1157                 {\r
1158                     //Launch bitonicMergeLocal\r
1159                                         ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL].m_kernel, 0,  sizeof(cl_mem), (void *)&pKey);\r
1160                     ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL].m_kernel, 1, sizeof(cl_uint), (void *)&arrayLength);\r
1161                     ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL].m_kernel, 2, sizeof(cl_uint), (void *)&stride);\r
1162                     ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL].m_kernel, 3, sizeof(cl_uint), (void *)&size);\r
1163                     ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL].m_kernel, 4, sizeof(cl_uint), (void *)&dir);\r
1164                                         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
1165 \r
1166                     localWorkSize  = localSizeLimit / 2;\r
1167                     globalWorkSize = batch * arrayLength / 2;\r
1168 \r
1169                     ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, m_kernels[PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL].m_kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);\r
1170                                         oclCHECKERROR(ciErrNum, CL_SUCCESS);\r
1171                     break;\r
1172                 }\r
1173             }\r
1174         }\r
1175     }\r
1176 }\r
1177 \r