src/BulletMultiThreaded/GpuSoftBodySolvers/OpenCL/btSoftBodySolver_OpenCLSIMDAware.cpp

   1 /*\r
   2 Bullet Continuous Collision Detection and Physics Library\r
   3 Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/\r
   4 \r
   5 This software is provided 'as-is', without any express or implied warranty.\r
   6 In no event will the authors be held liable for any damages arising from the use of this software.\r
   7 Permission is granted to anyone to use this software for any purpose, \r
   8 including commercial applications, and to alter it and redistribute it freely, \r
   9 subject to the following restrictions:\r
  10 \r
  11 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\r
  12 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\r
  13 3. This notice may not be removed or altered from any source distribution.\r
  14 */\r
  15 \r
  16 \r
  17 #include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"\r
  18 #include "vectormath/vmInclude.h"\r
  19 #include <stdio.h> //@todo: remove the debugging printf at some stage\r
  20 #include "btSoftBodySolver_OpenCLSIMDAware.h"\r
  21 #include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"\r
  22 #include "BulletSoftBody/btSoftBody.h"\r
  23 #include "BulletCollision/CollisionShapes/btCapsuleShape.h"\r
  24 #include <limits.h>\r
  25 \r
  26 #define WAVEFRONT_SIZE 32\r
  27 #define WAVEFRONT_BLOCK_MULTIPLIER 2\r
  28 #define GROUP_SIZE (WAVEFRONT_SIZE*WAVEFRONT_BLOCK_MULTIPLIER)\r
  29 #define LINKS_PER_SIMD_LANE 16\r
  30 \r
  31 static const size_t workGroupSize = GROUP_SIZE;\r
  32 \r
  33 \r
  34 //CL_VERSION_1_1 seems broken on NVidia SDK so just disable it\r
  35 \r
  36 ////OpenCL 1.0 kernels don't use float3\r
  37 #define MSTRINGIFY(A) #A\r
  38 static const char* UpdatePositionsFromVelocitiesCLString = \r
  39 #include "OpenCLC10/UpdatePositionsFromVelocities.cl"\r
  40 static const char* SolvePositionsCLString = \r
  41 #include "OpenCLC10/SolvePositionsSIMDBatched.cl"\r
  42 static const char* UpdateNodesCLString = \r
  43 #include "OpenCLC10/UpdateNodes.cl"\r
  44 static const char* UpdatePositionsCLString = \r
  45 #include "OpenCLC10/UpdatePositions.cl"\r
  46 static const char* UpdateConstantsCLString = \r
  47 #include "OpenCLC10/UpdateConstants.cl"\r
  48 static const char* IntegrateCLString = \r
  49 #include "OpenCLC10/Integrate.cl"\r
  50 static const char* ApplyForcesCLString = \r
  51 #include "OpenCLC10/ApplyForces.cl"\r
  52 static const char* UpdateFixedVertexPositionsCLString = \r
  53 #include "OpenCLC10/UpdateFixedVertexPositions.cl"\r
  54 static const char* UpdateNormalsCLString = \r
  55 #include "OpenCLC10/UpdateNormals.cl"\r
  56 static const char* VSolveLinksCLString = \r
  57 #include "OpenCLC10/VSolveLinks.cl"\r
  58 static const char* SolveCollisionsAndUpdateVelocitiesCLString =\r
  59 #include "OpenCLC10/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl"\r
  60 static const char* OutputToVertexArrayCLString =\r
  61 #include "OpenCLC10/OutputToVertexArray.cl"\r
  62 \r
  63 \r
  64 \r
  65 btSoftBodyLinkDataOpenCLSIMDAware::btSoftBodyLinkDataOpenCLSIMDAware(cl_command_queue queue,  cl_context ctx) :\r
  66         m_cqCommandQue(queue),\r
  67         m_wavefrontSize( WAVEFRONT_SIZE ),\r
  68         m_linksPerWorkItem( LINKS_PER_SIMD_LANE ),\r
  69         m_maxBatchesWithinWave( 0 ),\r
  70         m_maxLinksPerWavefront( m_wavefrontSize * m_linksPerWorkItem ),\r
  71         m_numWavefronts( 0 ),\r
  72         m_maxVertex( 0 ),\r
  73         m_clNumBatchesAndVerticesWithinWaves( queue, ctx, &m_numBatchesAndVerticesWithinWaves, true ),\r
  74         m_clWavefrontVerticesGlobalAddresses( queue, ctx, &m_wavefrontVerticesGlobalAddresses, true ),\r
  75         m_clLinkVerticesLocalAddresses( queue, ctx, &m_linkVerticesLocalAddresses, true ),\r
  76         m_clLinkStrength( queue, ctx, &m_linkStrength, false ),\r
  77         m_clLinksMassLSC( queue, ctx, &m_linksMassLSC, false ),\r
  78         m_clLinksRestLengthSquared( queue, ctx, &m_linksRestLengthSquared, false ),\r
  79         m_clLinksRestLength( queue, ctx, &m_linksRestLength, false ),\r
  80         m_clLinksMaterialLinearStiffnessCoefficient( queue, ctx, &m_linksMaterialLinearStiffnessCoefficient, false )\r
  81 {\r
  82 }\r
  83 \r
  84 btSoftBodyLinkDataOpenCLSIMDAware::~btSoftBodyLinkDataOpenCLSIMDAware()\r
  85 {\r
  86 }\r
  87 \r
  88 static Vectormath::Aos::Vector3 toVector3( const btVector3 &vec )\r
  89 {\r
  90         Vectormath::Aos::Vector3 outVec( vec.getX(), vec.getY(), vec.getZ() );\r
  91         return outVec;\r
  92 }\r
  93 \r
  94 /** Allocate enough space in all link-related arrays to fit numLinks links */\r
  95 void btSoftBodyLinkDataOpenCLSIMDAware::createLinks( int numLinks )\r
  96 {\r
  97         int previousSize = m_links.size();\r
  98         int newSize = previousSize + numLinks;\r
  99 \r
 100         btSoftBodyLinkData::createLinks( numLinks );\r
 101 \r
 102         // Resize the link addresses array as well\r
 103         m_linkAddresses.resize( newSize );\r
 104 }\r
 105 \r
 106 /** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */\r
 107 void btSoftBodyLinkDataOpenCLSIMDAware::setLinkAt( \r
 108         const LinkDescription &link, \r
 109         int linkIndex )\r
 110 {\r
 111         btSoftBodyLinkData::setLinkAt( link, linkIndex );\r
 112 \r
 113         if( link.getVertex0() > m_maxVertex )\r
 114                 m_maxVertex = link.getVertex0();\r
 115         if( link.getVertex1() > m_maxVertex )\r
 116                 m_maxVertex = link.getVertex1();\r
 117 \r
 118         // Set the link index correctly for initialisation\r
 119         m_linkAddresses[linkIndex] = linkIndex;\r
 120 }\r
 121 \r
 122 bool btSoftBodyLinkDataOpenCLSIMDAware::onAccelerator()\r
 123 {\r
 124         return m_onGPU;\r
 125 }\r
 126 \r
 127 bool btSoftBodyLinkDataOpenCLSIMDAware::moveToAccelerator()\r
 128 {\r
 129         bool success = true;\r
 130         success = success && m_clNumBatchesAndVerticesWithinWaves.moveToGPU();\r
 131         success = success && m_clWavefrontVerticesGlobalAddresses.moveToGPU();\r
 132         success = success && m_clLinkVerticesLocalAddresses.moveToGPU();\r
 133         success = success && m_clLinkStrength.moveToGPU();\r
 134         success = success && m_clLinksMassLSC.moveToGPU();\r
 135         success = success && m_clLinksRestLengthSquared.moveToGPU();\r
 136         success = success && m_clLinksRestLength.moveToGPU();\r
 137         success = success && m_clLinksMaterialLinearStiffnessCoefficient.moveToGPU();\r
 138 \r
 139         if( success ) {\r
 140                 m_onGPU = true;\r
 141         }\r
 142 \r
 143         return success;\r
 144 }\r
 145 \r
 146 bool btSoftBodyLinkDataOpenCLSIMDAware::moveFromAccelerator()\r
 147 {\r
 148         bool success = true;\r
 149         success = success && m_clNumBatchesAndVerticesWithinWaves.moveToGPU();\r
 150         success = success && m_clWavefrontVerticesGlobalAddresses.moveToGPU();\r
 151         success = success && m_clLinkVerticesLocalAddresses.moveToGPU();\r
 152         success = success && m_clLinkStrength.moveFromGPU();\r
 153         success = success && m_clLinksMassLSC.moveFromGPU();\r
 154         success = success && m_clLinksRestLengthSquared.moveFromGPU();\r
 155         success = success && m_clLinksRestLength.moveFromGPU();\r
 156         success = success && m_clLinksMaterialLinearStiffnessCoefficient.moveFromGPU();\r
 157 \r
 158         if( success ) {\r
 159                 m_onGPU = false;\r
 160         }\r
 161 \r
 162         return success;\r
 163 }\r
 164 \r
 165 \r
 166 \r
 167 \r
 168 \r
 169 \r
 170 \r
 171 \r
 172 btOpenCLSoftBodySolverSIMDAware::btOpenCLSoftBodySolverSIMDAware(cl_command_queue queue, cl_context ctx, bool bUpdateAchchoredNodePos) :\r
 173         btOpenCLSoftBodySolver( queue, ctx, bUpdateAchchoredNodePos ),\r
 174         m_linkData(queue, ctx)\r
 175 {\r
 176         // Initial we will clearly need to update solver constants\r
 177         // For now this is global for the cloths linked with this solver - we should probably make this body specific \r
 178         // for performance in future once we understand more clearly when constants need to be updated\r
 179         m_updateSolverConstants = true;\r
 180 \r
 181         m_shadersInitialized = false;\r
 182 }\r
 183 \r
 184 btOpenCLSoftBodySolverSIMDAware::~btOpenCLSoftBodySolverSIMDAware()\r
 185 {\r
 186         releaseKernels();\r
 187 }\r
 188 \r
 189 void btOpenCLSoftBodySolverSIMDAware::optimize( btAlignedObjectArray< btSoftBody * > &softBodies ,bool forceUpdate)\r
 190 {\r
 191         if( forceUpdate || m_softBodySet.size() != softBodies.size() )\r
 192         {\r
 193                 // Have a change in the soft body set so update, reloading all the data\r
 194                 getVertexData().clear();\r
 195                 getTriangleData().clear();\r
 196                 getLinkData().clear();\r
 197                 m_softBodySet.resize(0);\r
 198                 m_anchorIndex.clear();\r
 199 \r
 200                 int maxPiterations = 0;\r
 201                 int maxViterations = 0;\r
 202 \r
 203                 for( int softBodyIndex = 0; softBodyIndex < softBodies.size(); ++softBodyIndex )\r
 204                 {\r
 205                         btSoftBody *softBody = softBodies[ softBodyIndex ];\r
 206                         using Vectormath::Aos::Matrix3;\r
 207                         using Vectormath::Aos::Point3;\r
 208 \r
 209                         // Create SoftBody that will store the information within the solver\r
 210                         btOpenCLAcceleratedSoftBodyInterface* newSoftBody = new btOpenCLAcceleratedSoftBodyInterface( softBody );\r
 211                         m_softBodySet.push_back( newSoftBody );\r
 212 \r
 213                         m_perClothAcceleration.push_back( toVector3(softBody->getWorldInfo()->m_gravity) );\r
 214                         m_perClothDampingFactor.push_back(softBody->m_cfg.kDP);\r
 215                         m_perClothVelocityCorrectionCoefficient.push_back( softBody->m_cfg.kVCF );\r
 216                         m_perClothLiftFactor.push_back( softBody->m_cfg.kLF );\r
 217                         m_perClothDragFactor.push_back( softBody->m_cfg.kDG );\r
 218                         m_perClothMediumDensity.push_back(softBody->getWorldInfo()->air_density);\r
 219                         // Simple init values. Actually we'll put 0 and -1 into them at the appropriate time\r
 220                         m_perClothFriction.push_back(softBody->m_cfg.kDF);\r
 221                         m_perClothCollisionObjects.push_back( CollisionObjectIndices(-1, -1) );\r
 222 \r
 223                         // Add space for new vertices and triangles in the default solver for now\r
 224                         // TODO: Include space here for tearing too later\r
 225                         int firstVertex = getVertexData().getNumVertices();\r
 226                         int numVertices = softBody->m_nodes.size();\r
 227                         // Round maxVertices to a multiple of the workgroup size so we know we're safe to run over in a given group\r
 228                         // maxVertices can be increased to allow tearing, but should be used sparingly because these extra verts will always be processed\r
 229                         int maxVertices = GROUP_SIZE*((numVertices+GROUP_SIZE)/GROUP_SIZE);\r
 230                         // Allocate space for new vertices in all the vertex arrays\r
 231                         getVertexData().createVertices( numVertices, softBodyIndex, maxVertices );\r
 232 \r
 233 \r
 234                         int firstTriangle = getTriangleData().getNumTriangles();\r
 235                         int numTriangles = softBody->m_faces.size();\r
 236                         int maxTriangles = numTriangles;\r
 237                         getTriangleData().createTriangles( maxTriangles );\r
 238 \r
 239                         // Copy vertices from softbody into the solver\r
 240                         for( int vertex = 0; vertex < numVertices; ++vertex )\r
 241                         {\r
 242                                 Point3 multPoint(softBody->m_nodes[vertex].m_x.getX(), softBody->m_nodes[vertex].m_x.getY(), softBody->m_nodes[vertex].m_x.getZ());\r
 243                                 btSoftBodyVertexData::VertexDescription desc;\r
 244 \r
 245                                 // TODO: Position in the softbody might be pre-transformed\r
 246                                 // or we may need to adapt for the pose.\r
 247                                 //desc.setPosition( cloth.getMeshTransform()*multPoint );\r
 248                                 desc.setPosition( multPoint );\r
 249 \r
 250                                 float vertexInverseMass = softBody->m_nodes[vertex].m_im;\r
 251                                 desc.setInverseMass(vertexInverseMass);\r
 252                                 getVertexData().setVertexAt( desc, firstVertex + vertex );\r
 253 \r
 254                                 m_anchorIndex.push_back(-1);\r
 255                         }\r
 256                         for( int vertex = numVertices; vertex < maxVertices; ++vertex )\r
 257                         {\r
 258                                 m_anchorIndex.push_back(-1.0);\r
 259                         }\r
 260 \r
 261                         // Copy triangles similarly\r
 262                         // We're assuming here that vertex indices are based on the firstVertex rather than the entire scene\r
 263                         for( int triangle = 0; triangle < numTriangles; ++triangle )\r
 264                         {\r
 265                                 // Note that large array storage is relative to the array not to the cloth\r
 266                                 // So we need to add firstVertex to each value\r
 267                                 int vertexIndex0 = (softBody->m_faces[triangle].m_n[0] - &(softBody->m_nodes[0]));\r
 268                                 int vertexIndex1 = (softBody->m_faces[triangle].m_n[1] - &(softBody->m_nodes[0]));\r
 269                                 int vertexIndex2 = (softBody->m_faces[triangle].m_n[2] - &(softBody->m_nodes[0]));\r
 270                                 btSoftBodyTriangleData::TriangleDescription newTriangle(vertexIndex0 + firstVertex, vertexIndex1 + firstVertex, vertexIndex2 + firstVertex);\r
 271                                 getTriangleData().setTriangleAt( newTriangle, firstTriangle + triangle );\r
 272                                 \r
 273                                 // Increase vertex triangle counts for this triangle            \r
 274                                 getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex0)++;\r
 275                                 getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex1)++;\r
 276                                 getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex2)++;\r
 277                         }\r
 278 \r
 279                         int firstLink = getLinkData().getNumLinks();\r
 280                         int numLinks = softBody->m_links.size();\r
 281                         int maxLinks = numLinks;\r
 282                         \r
 283                         // Allocate space for the links\r
 284                         getLinkData().createLinks( numLinks );\r
 285 \r
 286                         // Add the links\r
 287                         for( int link = 0; link < numLinks; ++link )\r
 288                         {\r
 289                                 int vertexIndex0 = softBody->m_links[link].m_n[0] - &(softBody->m_nodes[0]);\r
 290                                 int vertexIndex1 = softBody->m_links[link].m_n[1] - &(softBody->m_nodes[0]);\r
 291 \r
 292                                 btSoftBodyLinkData::LinkDescription newLink(vertexIndex0 + firstVertex, vertexIndex1 + firstVertex, softBody->m_links[link].m_material->m_kLST);\r
 293                                 newLink.setLinkStrength(1.f);\r
 294                                 getLinkData().setLinkAt(newLink, firstLink + link);\r
 295                         }\r
 296                         \r
 297                         newSoftBody->setFirstVertex( firstVertex );\r
 298                         newSoftBody->setFirstTriangle( firstTriangle );\r
 299                         newSoftBody->setNumVertices( numVertices );\r
 300                         newSoftBody->setMaxVertices( maxVertices );\r
 301                         newSoftBody->setNumTriangles( numTriangles );\r
 302                         newSoftBody->setMaxTriangles( maxTriangles );\r
 303                         newSoftBody->setFirstLink( firstLink );\r
 304                         newSoftBody->setNumLinks( numLinks );\r
 305 \r
 306                         // Find maximum piterations and viterations\r
 307                         int piterations = softBody->m_cfg.piterations;\r
 308 \r
 309             if ( piterations > maxPiterations )\r
 310                   maxPiterations = piterations;\r
 311 \r
 312             int viterations = softBody->m_cfg.viterations;\r
 313 \r
 314                         if ( viterations > maxViterations )\r
 315                   maxViterations = viterations;\r
 316 \r
 317                         // zero mass\r
 318                         for( int vertex = 0; vertex < numVertices; ++vertex )\r
 319                         {\r
 320                                 if ( softBody->m_nodes[vertex].m_im == 0 )\r
 321                                 {\r
 322                                         AnchorNodeInfoCL nodeInfo;\r
 323                                         nodeInfo.clVertexIndex = firstVertex + vertex;\r
 324                                         nodeInfo.pNode = &softBody->m_nodes[vertex];\r
 325 \r
 326                                         m_anchorNodeInfoArray.push_back(nodeInfo);\r
 327                                 }\r
 328                         }                       \r
 329 \r
 330                         // anchor position\r
 331                         if ( numVertices > 0 )\r
 332                         {\r
 333                                 for ( int anchorIndex = 0; anchorIndex < softBody->m_anchors.size(); anchorIndex++ )\r
 334                                 {\r
 335                                         btSoftBody::Node* anchorNode = softBody->m_anchors[anchorIndex].m_node;\r
 336                                         btSoftBody::Node* firstNode = &softBody->m_nodes[0];\r
 337 \r
 338                                         AnchorNodeInfoCL nodeInfo;\r
 339                                         nodeInfo.clVertexIndex = firstVertex + (int)(anchorNode - firstNode);\r
 340                                         nodeInfo.pNode = anchorNode;\r
 341 \r
 342                                         m_anchorNodeInfoArray.push_back(nodeInfo);\r
 343                                 }\r
 344                         }                       \r
 345                 }\r
 346 \r
 347                 m_anchorPosition.clear();               \r
 348                 m_anchorPosition.resize(m_anchorNodeInfoArray.size());\r
 349 \r
 350                 for ( int anchorNode = 0; anchorNode < m_anchorNodeInfoArray.size(); anchorNode++ )\r
 351                 {\r
 352                         const AnchorNodeInfoCL& anchorNodeInfo = m_anchorNodeInfoArray[anchorNode];\r
 353                         m_anchorIndex[anchorNodeInfo.clVertexIndex] = anchorNode;\r
 354                         getVertexData().getInverseMass(anchorNodeInfo.clVertexIndex) = 0.0f;\r
 355                 }\r
 356                 \r
 357                 updateConstants(0.f);\r
 358 \r
 359                 // set position and velocity iterations\r
 360                 setNumberOfPositionIterations(maxPiterations);\r
 361                 setNumberOfVelocityIterations(maxViterations);\r
 362 \r
 363                 // set wind velocity\r
 364                 m_perClothWindVelocity.resize( m_softBodySet.size() );\r
 365                 for( int softBodyIndex = 0; softBodyIndex < m_softBodySet.size(); ++softBodyIndex )\r
 366                 {\r
 367                         btSoftBody *softBody = m_softBodySet[softBodyIndex]->getSoftBody();                     \r
 368                         m_perClothWindVelocity[softBodyIndex] = toVector3(softBody->getWindVelocity());\r
 369                 }\r
 370 \r
 371                 m_clPerClothWindVelocity.changedOnCPU();\r
 372 \r
 373                 // generate batches\r
 374                 m_linkData.generateBatches();           \r
 375                 m_triangleData.generateBatches();\r
 376 \r
 377                 // Build the shaders to match the batching parameters\r
 378                 buildShaders();\r
 379         }\r
 380 }\r
 381 \r
 382 \r
 383 btSoftBodyLinkData &btOpenCLSoftBodySolverSIMDAware::getLinkData()\r
 384 {\r
 385         // TODO: Consider setting link data to "changed" here\r
 386         return m_linkData;\r
 387 }\r
 388 \r
 389 \r
 390 \r
 391 \r
 392 void btOpenCLSoftBodySolverSIMDAware::updateConstants( float timeStep )\r
 393 {                       \r
 394 \r
 395         using namespace Vectormath::Aos;\r
 396 \r
 397         if( m_updateSolverConstants )\r
 398         {\r
 399                 m_updateSolverConstants = false;\r
 400 \r
 401                 // Will have to redo this if we change the structure (tear, maybe) or various other possible changes\r
 402 \r
 403                 // Initialise link constants\r
 404                 const int numLinks = m_linkData.getNumLinks();\r
 405                 for( int linkIndex = 0; linkIndex < numLinks; ++linkIndex )\r
 406                 {\r
 407                         btSoftBodyLinkData::LinkNodePair &vertices( m_linkData.getVertexPair(linkIndex) );\r
 408                         m_linkData.getRestLength(linkIndex) = length((m_vertexData.getPosition( vertices.vertex0 ) - m_vertexData.getPosition( vertices.vertex1 )));\r
 409                         float invMass0 = m_vertexData.getInverseMass(vertices.vertex0);\r
 410                         float invMass1 = m_vertexData.getInverseMass(vertices.vertex1);\r
 411                         float linearStiffness = m_linkData.getLinearStiffnessCoefficient(linkIndex);\r
 412                         float massLSC = (invMass0 + invMass1)/linearStiffness;\r
 413                         m_linkData.getMassLSC(linkIndex) = massLSC;\r
 414                         float restLength = m_linkData.getRestLength(linkIndex);\r
 415                         float restLengthSquared = restLength*restLength;\r
 416                         m_linkData.getRestLengthSquared(linkIndex) = restLengthSquared;\r
 417                 }\r
 418         }\r
 419 \r
 420 }\r
 421 \r
 422 \r
 423 \r
 424 void btOpenCLSoftBodySolverSIMDAware::solveConstraints( float solverdt )\r
 425 {\r
 426 \r
 427         using Vectormath::Aos::Vector3;\r
 428         using Vectormath::Aos::Point3;\r
 429         using Vectormath::Aos::lengthSqr;\r
 430         using Vectormath::Aos::dot;\r
 431 \r
 432         // Prepare links\r
 433         int numLinks = m_linkData.getNumLinks();\r
 434         int numVertices = m_vertexData.getNumVertices();\r
 435 \r
 436         float kst = 1.f;\r
 437         float ti = 0.f;\r
 438 \r
 439 \r
 440         m_clPerClothDampingFactor.moveToGPU();\r
 441         m_clPerClothVelocityCorrectionCoefficient.moveToGPU();\r
 442 \r
 443 \r
 444         // Ensure data is on accelerator\r
 445         m_linkData.moveToAccelerator();\r
 446         m_vertexData.moveToAccelerator();\r
 447 \r
 448         \r
 449         //prepareLinks();       \r
 450 \r
 451         prepareCollisionConstraints();\r
 452 \r
 453         // Solve drift\r
 454         for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )\r
 455         {\r
 456 \r
 457                 for( int i = 0; i < m_linkData.m_wavefrontBatchStartLengths.size(); ++i )\r
 458                 {\r
 459                         int startWave = m_linkData.m_wavefrontBatchStartLengths[i].start;\r
 460                         int numWaves = m_linkData.m_wavefrontBatchStartLengths[i].length;\r
 461                         solveLinksForPosition( startWave, numWaves, kst, ti );\r
 462                 }\r
 463         } // for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )\r
 464 \r
 465         \r
 466         // At this point assume that the force array is blank - we will overwrite it\r
 467         solveCollisionsAndUpdateVelocities( 1.f/solverdt );\r
 468 }\r
 469 \r
 470 \r
 471 //////////////////////////////////////\r
 472 // Kernel dispatches\r
 473 \r
 474 \r
 475 void btOpenCLSoftBodySolverSIMDAware::solveLinksForPosition( int startWave, int numWaves, float kst, float ti )\r
 476 {\r
 477         cl_int ciErrNum;\r
 478         ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,0, sizeof(int), &startWave);\r
 479         ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,1, sizeof(int), &numWaves);\r
 480         ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,2, sizeof(float), &kst);\r
 481         ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,3, sizeof(float), &ti);\r
 482         \r
 483         \r
 484         ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,4, sizeof(cl_mem), &m_linkData.m_clNumBatchesAndVerticesWithinWaves.m_buffer);\r
 485         ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,5, sizeof(cl_mem), &m_linkData.m_clWavefrontVerticesGlobalAddresses.m_buffer);\r
 486         ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,6, sizeof(cl_mem), &m_linkData.m_clLinkVerticesLocalAddresses.m_buffer);\r
 487         ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,7, sizeof(cl_mem), &m_linkData.m_clLinksMassLSC.m_buffer);\r
 488 \r
 489         ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,8, sizeof(cl_mem), &m_linkData.m_clLinksRestLengthSquared.m_buffer);\r
 490         ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,9, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);\r
 491         ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,10, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);\r
 492 \r
 493         ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,11, WAVEFRONT_BLOCK_MULTIPLIER*sizeof(cl_int2), 0);\r
 494         ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,12, m_linkData.getMaxVerticesPerWavefront()*WAVEFRONT_BLOCK_MULTIPLIER*sizeof(cl_float4), 0);\r
 495         ciErrNum = clSetKernelArg(m_solvePositionsFromLinksKernel,13, m_linkData.getMaxVerticesPerWavefront()*WAVEFRONT_BLOCK_MULTIPLIER*sizeof(cl_float), 0);\r
 496 \r
 497         size_t  numWorkItems = workGroupSize*((numWaves*WAVEFRONT_SIZE + (workGroupSize-1)) / workGroupSize);\r
 498         \r
 499         ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,m_solvePositionsFromLinksKernel,1,NULL,&numWorkItems,&workGroupSize,0,0,0);\r
 500         \r
 501         if( ciErrNum!= CL_SUCCESS ) \r
 502         {\r
 503                 btAssert( 0 &&  "enqueueNDRangeKernel(m_solvePositionsFromLinksKernel)");\r
 504         }\r
 505 \r
 506 } // solveLinksForPosition\r
 507 \r
 508 void btOpenCLSoftBodySolverSIMDAware::solveCollisionsAndUpdateVelocities( float isolverdt )\r
 509 {\r
 510         // Copy kernel parameters to GPU\r
 511         m_vertexData.moveToAccelerator();\r
 512         m_clPerClothFriction.moveToGPU();\r
 513         m_clPerClothDampingFactor.moveToGPU();\r
 514         m_clPerClothCollisionObjects.moveToGPU();\r
 515         m_clCollisionObjectDetails.moveToGPU();\r
 516         \r
 517         cl_int ciErrNum;\r
 518         int numVerts = m_vertexData.getNumVertices();\r
 519         ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 0, sizeof(int), &numVerts);\r
 520         ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 1, sizeof(int), &isolverdt);\r
 521         ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 2, sizeof(cl_mem),&m_vertexData.m_clClothIdentifier.m_buffer);\r
 522         ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 3, sizeof(cl_mem),&m_vertexData.m_clVertexPreviousPosition.m_buffer);\r
 523         ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 4, sizeof(cl_mem),&m_clPerClothFriction.m_buffer);\r
 524         ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 5, sizeof(cl_mem),&m_clPerClothDampingFactor.m_buffer);\r
 525         ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 6, sizeof(cl_mem),&m_clPerClothCollisionObjects.m_buffer);\r
 526         ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 7, sizeof(cl_mem),&m_clCollisionObjectDetails.m_buffer);\r
 527         ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 8, sizeof(cl_mem),&m_vertexData.m_clVertexForceAccumulator.m_buffer);\r
 528         ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 9, sizeof(cl_mem),&m_vertexData.m_clVertexVelocity.m_buffer);\r
 529         ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 10, sizeof(cl_mem),&m_vertexData.m_clVertexPosition.m_buffer);\r
 530         ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 11, sizeof(CollisionShapeDescription)*16,0);\r
 531         ciErrNum = clSetKernelArg(m_solveCollisionsAndUpdateVelocitiesKernel, 12, sizeof(cl_mem),&m_vertexData.m_clVertexInverseMass.m_buffer);\r
 532         size_t  numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);\r
 533         \r
 534         if (numWorkItems)\r
 535         {\r
 536                 ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,m_solveCollisionsAndUpdateVelocitiesKernel, 1, NULL, &numWorkItems, &workGroupSize,0,0,0);\r
 537                 \r
 538                 if( ciErrNum != CL_SUCCESS ) \r
 539                 {\r
 540                         btAssert( 0 &&  "enqueueNDRangeKernel(m_solveCollisionsAndUpdateVelocitiesKernel)");\r
 541                 }\r
 542         }\r
 543 \r
 544 } // btOpenCLSoftBodySolverSIMDAware::updateVelocitiesFromPositionsWithoutVelocities\r
 545 \r
 546 // End kernel dispatches\r
 547 /////////////////////////////////////\r
 548 \r
 549 \r
 550 \r
 551 bool btOpenCLSoftBodySolverSIMDAware::buildShaders()\r
 552 {\r
 553         releaseKernels();\r
 554 \r
 555         if( m_shadersInitialized )\r
 556                 return true;\r
 557 \r
 558         const char* additionalMacros="";\r
 559 \r
 560         m_currentCLFunctions->clearKernelCompilationFailures();\r
 561 \r
 562         char *wavefrontMacros = new char[256];\r
 563 \r
 564         sprintf(\r
 565                 wavefrontMacros, \r
 566                 "-DMAX_NUM_VERTICES_PER_WAVE=%d -DMAX_BATCHES_PER_WAVE=%d -DWAVEFRONT_SIZE=%d -DWAVEFRONT_BLOCK_MULTIPLIER=%d -DBLOCK_SIZE=%d", \r
 567                 m_linkData.getMaxVerticesPerWavefront(),\r
 568                 m_linkData.getMaxBatchesPerWavefront(),\r
 569                 m_linkData.getWavefrontSize(),\r
 570                 WAVEFRONT_BLOCK_MULTIPLIER,\r
 571                 WAVEFRONT_BLOCK_MULTIPLIER*m_linkData.getWavefrontSize());\r
 572         \r
 573         m_updatePositionsFromVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdatePositionsFromVelocitiesCLString, "UpdatePositionsFromVelocitiesKernel", additionalMacros,"OpenCLC10/UpdatePositionsFromVelocities.cl");\r
 574         m_solvePositionsFromLinksKernel = m_currentCLFunctions->compileCLKernelFromString( SolvePositionsCLString, "SolvePositionsFromLinksKernel", wavefrontMacros ,"OpenCLC10/SolvePositionsSIMDBatched.cl");\r
 575         m_updateVelocitiesFromPositionsWithVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNodesCLString, "updateVelocitiesFromPositionsWithVelocitiesKernel", additionalMacros ,"OpenCLC10/UpdateNodes.cl");\r
 576         m_updateVelocitiesFromPositionsWithoutVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdatePositionsCLString, "updateVelocitiesFromPositionsWithoutVelocitiesKernel", additionalMacros,"OpenCLC10/UpdatePositions.cl");\r
 577         m_integrateKernel = m_currentCLFunctions->compileCLKernelFromString( IntegrateCLString, "IntegrateKernel", additionalMacros ,"OpenCLC10/Integrate.cl");\r
 578         m_applyForcesKernel = m_currentCLFunctions->compileCLKernelFromString( ApplyForcesCLString, "ApplyForcesKernel", additionalMacros,"OpenCLC10/ApplyForces.cl" );\r
 579         m_updateFixedVertexPositionsKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateFixedVertexPositionsCLString, "UpdateFixedVertexPositions" ,additionalMacros,"OpenCLC10/UpdateFixedVertexPositions.cl");\r
 580         m_solveCollisionsAndUpdateVelocitiesKernel = m_currentCLFunctions->compileCLKernelFromString( SolveCollisionsAndUpdateVelocitiesCLString, "SolveCollisionsAndUpdateVelocitiesKernel", additionalMacros ,"OpenCLC10/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl");\r
 581 \r
 582         // TODO: Rename to UpdateSoftBodies\r
 583         m_resetNormalsAndAreasKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "ResetNormalsAndAreasKernel", additionalMacros ,"OpenCLC10/UpdateNormals.cl");\r
 584         m_normalizeNormalsAndAreasKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "NormalizeNormalsAndAreasKernel", additionalMacros ,"OpenCLC10/UpdateNormals.cl");\r
 585         m_updateSoftBodiesKernel = m_currentCLFunctions->compileCLKernelFromString( UpdateNormalsCLString, "UpdateSoftBodiesKernel", additionalMacros ,"OpenCLC10/UpdateNormals.cl");\r
 586 \r
 587         delete [] wavefrontMacros;\r
 588 \r
 589         if( m_currentCLFunctions->getKernelCompilationFailures()==0)\r
 590         {\r
 591                 m_shadersInitialized = true;\r
 592         }\r
 593 \r
 594         return m_shadersInitialized;\r
 595 }\r
 596 \r
 597 \r
 598 \r
 599 \r
 600 static Vectormath::Aos::Transform3 toTransform3( const btTransform &transform )\r
 601 {\r
 602         Vectormath::Aos::Transform3 outTransform;\r
 603         outTransform.setCol(0, toVector3(transform.getBasis().getColumn(0)));\r
 604         outTransform.setCol(1, toVector3(transform.getBasis().getColumn(1)));\r
 605         outTransform.setCol(2, toVector3(transform.getBasis().getColumn(2)));\r
 606         outTransform.setCol(3, toVector3(transform.getOrigin()));\r
 607         return outTransform;    \r
 608 }\r
 609 \r
 610 \r
 611 static void generateBatchesOfWavefronts( btAlignedObjectArray < btAlignedObjectArray <int> > &linksForWavefronts, btSoftBodyLinkData &linkData, int numVertices, btAlignedObjectArray < btAlignedObjectArray <int> > &wavefrontBatches )\r
 612 {\r
 613         // A per-batch map of truth values stating whether a given vertex is in that batch\r
 614         // This allows us to significantly optimize the batching\r
 615         btAlignedObjectArray <btAlignedObjectArray<bool> > mapOfVerticesInBatches;\r
 616 \r
 617         for( int waveIndex = 0; waveIndex < linksForWavefronts.size(); ++waveIndex )\r
 618         {\r
 619                 btAlignedObjectArray <int> &wavefront( linksForWavefronts[waveIndex] );\r
 620 \r
 621                 int batch = 0;\r
 622                 bool placed = false;\r
 623                 while( batch < wavefrontBatches.size() && !placed )\r
 624                 {\r
 625                         // Test the current batch, see if this wave shares any vertex with the waves in the batch\r
 626                         bool foundSharedVertex = false;\r
 627                         for( int link = 0; link < wavefront.size(); ++link )\r
 628                         {\r
 629                                 btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] );\r
 630                                 if( (mapOfVerticesInBatches[batch])[vertices.vertex0] || (mapOfVerticesInBatches[batch])[vertices.vertex1] )\r
 631                                 {\r
 632                                         foundSharedVertex = true;\r
 633                                 }\r
 634                         }\r
 635 \r
 636                         if( !foundSharedVertex )\r
 637                         {\r
 638                                 wavefrontBatches[batch].push_back( waveIndex ); \r
 639                                 // Insert vertices into this batch too\r
 640                                 for( int link = 0; link < wavefront.size(); ++link )\r
 641                                 {\r
 642                                         btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] );\r
 643                                         (mapOfVerticesInBatches[batch])[vertices.vertex0] = true;\r
 644                                         (mapOfVerticesInBatches[batch])[vertices.vertex1] = true;\r
 645                                 }\r
 646                                 placed = true;\r
 647                         }\r
 648                         batch++;\r
 649                 }\r
 650                 if( batch == wavefrontBatches.size() && !placed )\r
 651                 {\r
 652                         wavefrontBatches.resize( batch + 1 );\r
 653                         wavefrontBatches[batch].push_back( waveIndex );\r
 654 \r
 655                         // And resize map as well\r
 656                         mapOfVerticesInBatches.resize( batch + 1 );\r
 657                         \r
 658                         // Resize maps with total number of vertices\r
 659                         mapOfVerticesInBatches[batch].resize( numVertices+1, false );\r
 660 \r
 661                         // Insert vertices into this batch too\r
 662                         for( int link = 0; link < wavefront.size(); ++link )\r
 663                         {\r
 664                                 btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] );\r
 665                                 (mapOfVerticesInBatches[batch])[vertices.vertex0] = true;\r
 666                                 (mapOfVerticesInBatches[batch])[vertices.vertex1] = true;\r
 667                         }\r
 668                 }\r
 669         }\r
 670         mapOfVerticesInBatches.clear();\r
 671 }\r
 672 \r
 673 // Function to remove an object from a vector maintaining correct ordering of the vector\r
 674 template< typename T > static void removeFromVector( btAlignedObjectArray< T > &vectorToUpdate, int indexToRemove )\r
 675 {\r
 676         int currentSize = vectorToUpdate.size();\r
 677         for( int i = indexToRemove; i < (currentSize-1); ++i )\r
 678         {\r
 679                 vectorToUpdate[i] = vectorToUpdate[i+1];\r
 680         }\r
 681         if( currentSize > 0 )\r
 682                 vectorToUpdate.resize( currentSize - 1 );\r
 683 }\r
 684 \r
 685 /**\r
 686  * Insert element into vectorToUpdate at index index.\r
 687  */\r
 688 template< typename T > static void insertAtIndex( btAlignedObjectArray< T > &vectorToUpdate, int index, T element )\r
 689 {\r
 690         vectorToUpdate.resize( vectorToUpdate.size() + 1 );\r
 691         for( int i = (vectorToUpdate.size() - 1); i > index; --i )\r
 692         {\r
 693                 vectorToUpdate[i] = vectorToUpdate[i-1];\r
 694         }\r
 695         vectorToUpdate[index] = element;\r
 696 }\r
 697 \r
 698 /** \r
 699  * Insert into btAlignedObjectArray assuming the array is ordered and maintaining both ordering and uniqueness.\r
 700  * ie it treats vectorToUpdate as an ordered set.\r
 701  */\r
 702 template< typename T > static void insertUniqueAndOrderedIntoVector( btAlignedObjectArray<T> &vectorToUpdate, T element )\r
 703 {\r
 704         int index = 0;\r
 705         while( index < vectorToUpdate.size() && vectorToUpdate[index] < element )\r
 706         {\r
 707                 index++;\r
 708         }\r
 709         if( index == vectorToUpdate.size() || vectorToUpdate[index] != element )\r
 710                 insertAtIndex( vectorToUpdate, index, element );\r
 711 }\r
 712 \r
 713 static void generateLinksPerVertex( int numVertices, btSoftBodyLinkData &linkData, btAlignedObjectArray< int > &listOfLinksPerVertex, btAlignedObjectArray <int> &numLinksPerVertex, int &maxLinks )\r
 714 {\r
 715         for( int linkIndex = 0; linkIndex < linkData.getNumLinks(); ++linkIndex )\r
 716         {\r
 717                 btSoftBodyLinkData::LinkNodePair nodes( linkData.getVertexPair(linkIndex) );\r
 718                 numLinksPerVertex[nodes.vertex0]++;\r
 719                 numLinksPerVertex[nodes.vertex1]++;\r
 720         }\r
 721         int maxLinksPerVertex = 0;\r
 722         for( int vertexIndex = 0; vertexIndex < numVertices; ++vertexIndex )\r
 723         {\r
 724                 maxLinksPerVertex = btMax(numLinksPerVertex[vertexIndex], maxLinksPerVertex);\r
 725         }\r
 726         maxLinks = maxLinksPerVertex;\r
 727 \r
 728         btAlignedObjectArray< int > linksFoundPerVertex;\r
 729         linksFoundPerVertex.resize( numVertices, 0 );\r
 730 \r
 731         listOfLinksPerVertex.resize( maxLinksPerVertex * numVertices );\r
 732 \r
 733         for( int linkIndex = 0; linkIndex < linkData.getNumLinks(); ++linkIndex )\r
 734         {\r
 735                 btSoftBodyLinkData::LinkNodePair nodes( linkData.getVertexPair(linkIndex) );\r
 736                 {\r
 737                         // Do vertex 0\r
 738                         int vertexIndex = nodes.vertex0;\r
 739                         int linkForVertex = linksFoundPerVertex[nodes.vertex0];\r
 740                         int linkAddress = vertexIndex * maxLinksPerVertex + linkForVertex;\r
 741 \r
 742                         listOfLinksPerVertex[linkAddress] = linkIndex;\r
 743 \r
 744                         linksFoundPerVertex[nodes.vertex0] = linkForVertex + 1;\r
 745                 }\r
 746                 {\r
 747                         // Do vertex 1\r
 748                         int vertexIndex = nodes.vertex1;\r
 749                         int linkForVertex = linksFoundPerVertex[nodes.vertex1];\r
 750                         int linkAddress = vertexIndex * maxLinksPerVertex + linkForVertex;\r
 751 \r
 752                         listOfLinksPerVertex[linkAddress] = linkIndex;\r
 753 \r
 754                         linksFoundPerVertex[nodes.vertex1] = linkForVertex + 1;\r
 755                 }\r
 756         }\r
 757 }\r
 758 \r
 759 static void computeBatchingIntoWavefronts( \r
 760         btSoftBodyLinkData &linkData, \r
 761         int wavefrontSize, \r
 762         int linksPerWorkItem, \r
 763         int maxLinksPerWavefront, \r
 764         btAlignedObjectArray < btAlignedObjectArray <int> > &linksForWavefronts, \r
 765         btAlignedObjectArray< btAlignedObjectArray < btAlignedObjectArray <int> > > &batchesWithinWaves, /* wave, batch, links in batch */\r
 766         btAlignedObjectArray< btAlignedObjectArray< int > > &verticesForWavefronts /* wavefront, vertex */\r
 767         )\r
 768 {\r
 769         \r
 770 \r
 771         // Attempt generation of larger batches of links.\r
 772         btAlignedObjectArray< bool > processedLink;\r
 773         processedLink.resize( linkData.getNumLinks() );\r
 774         btAlignedObjectArray< int > listOfLinksPerVertex;\r
 775         int maxLinksPerVertex = 0;\r
 776 \r
 777         // Count num vertices\r
 778         int numVertices = 0;\r
 779         for( int linkIndex = 0; linkIndex < linkData.getNumLinks(); ++linkIndex )\r
 780         {\r
 781                 btSoftBodyLinkData::LinkNodePair nodes( linkData.getVertexPair(linkIndex) );\r
 782                 numVertices = btMax( numVertices, nodes.vertex0 + 1 );\r
 783                 numVertices = btMax( numVertices, nodes.vertex1 + 1 );\r
 784         }\r
 785 \r
 786         // Need list of links per vertex\r
 787         // Compute valence of each vertex\r
 788         btAlignedObjectArray <int> numLinksPerVertex;\r
 789         numLinksPerVertex.resize(0);\r
 790         numLinksPerVertex.resize( numVertices, 0 );\r
 791 \r
 792         generateLinksPerVertex( numVertices, linkData, listOfLinksPerVertex, numLinksPerVertex, maxLinksPerVertex );\r
 793 \r
 794         if (!numVertices)\r
 795                 return;\r
 796 \r
 797         for( int vertex = 0; vertex < 10; ++vertex )\r
 798         {\r
 799                 for( int link = 0; link < numLinksPerVertex[vertex]; ++link )\r
 800                 {\r
 801                         int linkAddress = vertex * maxLinksPerVertex + link;\r
 802                 }\r
 803         }\r
 804 \r
 805 \r
 806         // At this point we know what links we have for each vertex so we can start batching\r
 807         \r
 808         // We want a vertex to start with, let's go with 0\r
 809         int currentVertex = 0;\r
 810         int linksProcessed = 0;\r
 811 \r
 812         btAlignedObjectArray <int> verticesToProcess;\r
 813 \r
 814         while( linksProcessed < linkData.getNumLinks() )\r
 815         {\r
 816                 // Next wavefront\r
 817                 int nextWavefront = linksForWavefronts.size();\r
 818                 linksForWavefronts.resize( nextWavefront + 1 );\r
 819                 btAlignedObjectArray <int> &linksForWavefront(linksForWavefronts[nextWavefront]);\r
 820                 verticesForWavefronts.resize( nextWavefront + 1 );\r
 821                 btAlignedObjectArray<int> &vertexSet( verticesForWavefronts[nextWavefront] );\r
 822 \r
 823                 linksForWavefront.resize(0);\r
 824 \r
 825                 // Loop to find enough links to fill the wavefront\r
 826                 // Stopping if we either run out of links, or fill it\r
 827                 while( linksProcessed < linkData.getNumLinks() && linksForWavefront.size() < maxLinksPerWavefront )\r
 828                 {\r
 829                         // Go through the links for the current vertex\r
 830                         for( int link = 0; link < numLinksPerVertex[currentVertex] && linksForWavefront.size() < maxLinksPerWavefront; ++link )\r
 831                         {\r
 832                                 int linkAddress = currentVertex * maxLinksPerVertex + link;\r
 833                                 int linkIndex = listOfLinksPerVertex[linkAddress];\r
 834                                 \r
 835                                 // If we have not already processed this link, add it to the wavefront\r
 836                                 // Claim it as another processed link\r
 837                                 // Add the vertex at the far end to the list of vertices to process.\r
 838                                 if( !processedLink[linkIndex] )\r
 839                                 {\r
 840                                         linksForWavefront.push_back( linkIndex );\r
 841                                         linksProcessed++;\r
 842                                         processedLink[linkIndex] = true;\r
 843                                         int v0 = linkData.getVertexPair(linkIndex).vertex0;\r
 844                                         int v1 = linkData.getVertexPair(linkIndex).vertex1;\r
 845                                         if( v0 == currentVertex )\r
 846                                                 verticesToProcess.push_back( v1 );\r
 847                                         else\r
 848                                                 verticesToProcess.push_back( v0 );\r
 849                                 }\r
 850                         }\r
 851                         if( verticesToProcess.size() > 0 )\r
 852                         {\r
 853                                 // Get the element on the front of the queue and remove it\r
 854                                 currentVertex = verticesToProcess[0];\r
 855                                 removeFromVector( verticesToProcess, 0 );\r
 856                         } else {                \r
 857                                 // If we've not yet processed all the links, find the first unprocessed one\r
 858                                 // and select one of its vertices as the current vertex\r
 859                                 if( linksProcessed < linkData.getNumLinks() )\r
 860                                 {\r
 861                                         int searchLink = 0;\r
 862                                         while( processedLink[searchLink] )\r
 863                                                 searchLink++;\r
 864                                         currentVertex = linkData.getVertexPair(searchLink).vertex0;\r
 865                                 }       \r
 866                         }\r
 867                 }\r
 868 \r
 869                 // We have either finished or filled a wavefront\r
 870                 for( int link = 0; link < linksForWavefront.size(); ++link )\r
 871                 {\r
 872                         int v0 = linkData.getVertexPair( linksForWavefront[link] ).vertex0;\r
 873                         int v1 = linkData.getVertexPair( linksForWavefront[link] ).vertex1;\r
 874                         insertUniqueAndOrderedIntoVector( vertexSet, v0 );\r
 875                         insertUniqueAndOrderedIntoVector( vertexSet, v1 );\r
 876                 }\r
 877                 // Iterate over links mapped to the wave and batch those\r
 878                 // We can run a batch on each cycle trivially\r
 879                 \r
 880                 batchesWithinWaves.resize( batchesWithinWaves.size() + 1 );\r
 881                 btAlignedObjectArray < btAlignedObjectArray <int> > &batchesWithinWave( batchesWithinWaves[batchesWithinWaves.size()-1] );\r
 882                 \r
 883 \r
 884                 for( int link = 0; link < linksForWavefront.size(); ++link )\r
 885                 {\r
 886                         int linkIndex = linksForWavefront[link];\r
 887                         btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( linkIndex );\r
 888                         \r
 889                         int batch = 0;\r
 890                         bool placed = false;\r
 891                         while( batch < batchesWithinWave.size() && !placed )\r
 892                         {\r
 893                                 bool foundSharedVertex = false;\r
 894                                 if( batchesWithinWave[batch].size() >= wavefrontSize )\r
 895                                 {\r
 896                                         // If we have already filled this batch, move on to another\r
 897                                         foundSharedVertex = true;\r
 898                                 } else {\r
 899                                         for( int link2 = 0; link2 < batchesWithinWave[batch].size(); ++link2 )\r
 900                                         {\r
 901                                                 btSoftBodyLinkData::LinkNodePair vertices2 = linkData.getVertexPair( (batchesWithinWave[batch])[link2] );\r
 902 \r
 903                                                 if( vertices.vertex0 == vertices2.vertex0 ||\r
 904                                                         vertices.vertex1 == vertices2.vertex0 ||\r
 905                                                         vertices.vertex0 == vertices2.vertex1 ||\r
 906                                                         vertices.vertex1 == vertices2.vertex1 )\r
 907                                                 {\r
 908                                                         foundSharedVertex = true;\r
 909                                                         break;\r
 910                                                 }\r
 911                                         }\r
 912                                 }\r
 913                                 if( !foundSharedVertex )\r
 914                                 {\r
 915                                         batchesWithinWave[batch].push_back( linkIndex );\r
 916                                         placed = true;\r
 917                                 } else {\r
 918                                         ++batch;\r
 919                                 }\r
 920                         }\r
 921                         if( batch == batchesWithinWave.size() && !placed )\r
 922                         {\r
 923                                 batchesWithinWave.resize( batch + 1 );\r
 924                                 batchesWithinWave[batch].push_back( linkIndex );\r
 925                         }\r
 926                 }\r
 927                 \r
 928         }\r
 929 \r
 930 }\r
 931 \r
 932 void btSoftBodyLinkDataOpenCLSIMDAware::generateBatches()\r
 933 {\r
 934         btAlignedObjectArray < btAlignedObjectArray <int> > linksForWavefronts;\r
 935         btAlignedObjectArray < btAlignedObjectArray <int> > wavefrontBatches;\r
 936         btAlignedObjectArray< btAlignedObjectArray < btAlignedObjectArray <int> > > batchesWithinWaves;\r
 937         btAlignedObjectArray< btAlignedObjectArray< int > > verticesForWavefronts; // wavefronts, vertices in wavefront as an ordered set\r
 938 \r
 939         // Group the links into wavefronts\r
 940         computeBatchingIntoWavefronts( *this, m_wavefrontSize, m_linksPerWorkItem, m_maxLinksPerWavefront, linksForWavefronts, batchesWithinWaves, verticesForWavefronts );\r
 941 \r
 942 \r
 943         // Batch the wavefronts\r
 944         generateBatchesOfWavefronts( linksForWavefronts, *this, m_maxVertex, wavefrontBatches );\r
 945 \r
 946         m_numWavefronts = linksForWavefronts.size();\r
 947 \r
 948         // At this point we have a description of which links we need to process in each wavefront\r
 949 \r
 950         // First correctly fill the batch ranges vector\r
 951         int numBatches = wavefrontBatches.size();\r
 952         m_wavefrontBatchStartLengths.resize(0);\r
 953         int prefixSum = 0;\r
 954         for( int batchIndex = 0; batchIndex < numBatches; ++batchIndex )\r
 955         {\r
 956                 int wavesInBatch = wavefrontBatches[batchIndex].size();\r
 957                 int nextPrefixSum = prefixSum + wavesInBatch;\r
 958                 m_wavefrontBatchStartLengths.push_back( BatchPair( prefixSum, nextPrefixSum - prefixSum ) );\r
 959 \r
 960                 prefixSum += wavesInBatch;\r
 961         }\r
 962         \r
 963         // Also find max number of batches within a wave\r
 964         m_maxBatchesWithinWave = 0;\r
 965         m_maxVerticesWithinWave = 0;\r
 966         m_numBatchesAndVerticesWithinWaves.resize( m_numWavefronts );\r
 967         for( int waveIndex = 0; waveIndex < m_numWavefronts; ++waveIndex )\r
 968         {\r
 969                 // See if the number of batches in this wave is greater than the current maxium\r
 970                 int batchesInCurrentWave = batchesWithinWaves[waveIndex].size();\r
 971                 int verticesInCurrentWave = verticesForWavefronts[waveIndex].size();\r
 972                 m_maxBatchesWithinWave = btMax( batchesInCurrentWave, m_maxBatchesWithinWave );\r
 973                 m_maxVerticesWithinWave = btMax( verticesInCurrentWave, m_maxVerticesWithinWave );\r
 974         }\r
 975         \r
 976         // Add padding values both for alignment and as dudd addresses within LDS to compute junk rather than branch around\r
 977         m_maxVerticesWithinWave = 16*((m_maxVerticesWithinWave/16)+2);\r
 978 \r
 979         // Now we know the maximum number of vertices per-wave we can resize the global vertices array\r
 980         m_wavefrontVerticesGlobalAddresses.resize( m_maxVerticesWithinWave * m_numWavefronts );\r
 981 \r
 982         // Grab backup copies of all the link data arrays for the sorting process\r
 983         btAlignedObjectArray<btSoftBodyLinkData::LinkNodePair>                          m_links_Backup(m_links);\r
 984         btAlignedObjectArray<float>                                                                                     m_linkStrength_Backup(m_linkStrength);\r
 985         btAlignedObjectArray<float>                                                                                     m_linksMassLSC_Backup(m_linksMassLSC);\r
 986         btAlignedObjectArray<float>                                                                                     m_linksRestLengthSquared_Backup(m_linksRestLengthSquared);\r
 987         //btAlignedObjectArray<Vectormath::Aos::Vector3>                                                m_linksCLength_Backup(m_linksCLength);\r
 988         //btAlignedObjectArray<float>                                                                                   m_linksLengthRatio_Backup(m_linksLengthRatio);\r
 989         btAlignedObjectArray<float>                                                                                     m_linksRestLength_Backup(m_linksRestLength);\r
 990         btAlignedObjectArray<float>                                                                                     m_linksMaterialLinearStiffnessCoefficient_Backup(m_linksMaterialLinearStiffnessCoefficient);\r
 991 \r
 992         // Resize to a wavefront sized batch per batch per wave so we get perfectly coherent memory accesses.\r
 993         m_links.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );\r
 994         m_linkVerticesLocalAddresses.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );\r
 995         m_linkStrength.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );\r
 996         m_linksMassLSC.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );\r
 997         m_linksRestLengthSquared.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );\r
 998         m_linksRestLength.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );\r
 999         m_linksMaterialLinearStiffnessCoefficient.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts ); \r
1000                 \r
1001         // Then re-order links into wavefront blocks\r
1002 \r
1003         // Total number of wavefronts moved. This will decide the ordering of sorted wavefronts.\r
1004         int wavefrontCount = 0;\r
1005 \r
1006         // Iterate over batches of wavefronts, then wavefronts in the batch\r
1007         for( int batchIndex = 0; batchIndex < numBatches; ++batchIndex )\r
1008         {\r
1009                 btAlignedObjectArray <int> &batch( wavefrontBatches[batchIndex] );\r
1010                 int wavefrontsInBatch = batch.size();\r
1011 \r
1012                 \r
1013                 for( int wavefrontIndex = 0; wavefrontIndex < wavefrontsInBatch; ++wavefrontIndex )\r
1014                 {       \r
1015 \r
1016                         int originalWavefrontIndex = batch[wavefrontIndex];\r
1017                         btAlignedObjectArray< int > &wavefrontVertices( verticesForWavefronts[originalWavefrontIndex] );\r
1018                         int verticesUsedByWavefront = wavefrontVertices.size();\r
1019 \r
1020                         // Copy the set of vertices into the correctly structured array for use on the device\r
1021                         // Fill the non-vertices with -1s\r
1022                         // so we can mask out those reads\r
1023                         for( int vertex = 0; vertex < verticesUsedByWavefront; ++vertex )\r
1024                         {\r
1025                                 m_wavefrontVerticesGlobalAddresses[m_maxVerticesWithinWave * wavefrontCount + vertex] = wavefrontVertices[vertex];\r
1026                         }\r
1027                         for( int vertex = verticesUsedByWavefront; vertex < m_maxVerticesWithinWave; ++vertex )\r
1028                         {\r
1029                                 m_wavefrontVerticesGlobalAddresses[m_maxVerticesWithinWave * wavefrontCount + vertex] = -1;\r
1030                         }\r
1031 \r
1032                         // Obtain the set of batches within the current wavefront\r
1033                         btAlignedObjectArray < btAlignedObjectArray <int> > &batchesWithinWavefront( batchesWithinWaves[originalWavefrontIndex] );\r
1034                         // Set the size of the batches for use in the solver, correctly ordered\r
1035                         NumBatchesVerticesPair batchesAndVertices;\r
1036                         batchesAndVertices.numBatches = batchesWithinWavefront.size();\r
1037                         batchesAndVertices.numVertices = verticesUsedByWavefront;\r
1038                         m_numBatchesAndVerticesWithinWaves[wavefrontCount] = batchesAndVertices;\r
1039                         \r
1040 \r
1041                         // Now iterate over batches within the wavefront to structure the links correctly\r
1042                         for( int wavefrontBatch = 0; wavefrontBatch < batchesWithinWavefront.size(); ++wavefrontBatch )\r
1043                         {\r
1044                                 btAlignedObjectArray <int> &linksInBatch( batchesWithinWavefront[wavefrontBatch] );\r
1045                                 int wavefrontBatchSize = linksInBatch.size();\r
1046 \r
1047                                 int batchAddressInTarget = m_maxBatchesWithinWave * m_wavefrontSize * wavefrontCount + m_wavefrontSize * wavefrontBatch;\r
1048 \r
1049                                 for( int linkIndex = 0; linkIndex < wavefrontBatchSize; ++linkIndex )\r
1050                                 {\r
1051                                         int originalLinkAddress = linksInBatch[linkIndex];\r
1052                                         // Reorder simple arrays trivially\r
1053                                         m_links[batchAddressInTarget + linkIndex] = m_links_Backup[originalLinkAddress];\r
1054                                         m_linkStrength[batchAddressInTarget + linkIndex] = m_linkStrength_Backup[originalLinkAddress];\r
1055                                         m_linksMassLSC[batchAddressInTarget + linkIndex] = m_linksMassLSC_Backup[originalLinkAddress];\r
1056                                         m_linksRestLengthSquared[batchAddressInTarget + linkIndex] = m_linksRestLengthSquared_Backup[originalLinkAddress];\r
1057                                         m_linksRestLength[batchAddressInTarget + linkIndex] = m_linksRestLength_Backup[originalLinkAddress];\r
1058                                         m_linksMaterialLinearStiffnessCoefficient[batchAddressInTarget + linkIndex] = m_linksMaterialLinearStiffnessCoefficient_Backup[originalLinkAddress];\r
1059 \r
1060                                         // The local address is more complicated. We need to work out where a given vertex will end up\r
1061                                         // by searching the set of vertices for this link and using the index as the local address\r
1062                                         btSoftBodyLinkData::LinkNodePair localPair;\r
1063                                         btSoftBodyLinkData::LinkNodePair globalPair = m_links[batchAddressInTarget + linkIndex];\r
1064                                         localPair.vertex0 = wavefrontVertices.findLinearSearch( globalPair.vertex0 );\r
1065                                         localPair.vertex1 = wavefrontVertices.findLinearSearch( globalPair.vertex1 );\r
1066                                         m_linkVerticesLocalAddresses[batchAddressInTarget + linkIndex] = localPair;\r
1067                                 }\r
1068                                 for( int linkIndex = wavefrontBatchSize; linkIndex < m_wavefrontSize; ++linkIndex )\r
1069                                 {\r
1070                                         // Put 0s into these arrays for padding for cleanliness\r
1071                                         m_links[batchAddressInTarget + linkIndex] = btSoftBodyLinkData::LinkNodePair(0, 0);\r
1072                                         m_linkStrength[batchAddressInTarget + linkIndex] = 0.f;\r
1073                                         m_linksMassLSC[batchAddressInTarget + linkIndex] = 0.f;\r
1074                                         m_linksRestLengthSquared[batchAddressInTarget + linkIndex] = 0.f;\r
1075                                         m_linksRestLength[batchAddressInTarget + linkIndex] = 0.f;\r
1076                                         m_linksMaterialLinearStiffnessCoefficient[batchAddressInTarget + linkIndex] = 0.f;\r
1077 \r
1078 \r
1079                                         // For local addresses of junk data choose a set of addresses just above the range of valid ones \r
1080                                         // and cycling tyhrough % 16 so that we don't have bank conficts between all dud addresses\r
1081                                         // The valid addresses will do scatter and gather in the valid range, the junk ones should happily work\r
1082                                         // off the end of that range so we need no control\r
1083                                         btSoftBodyLinkData::LinkNodePair localPair;\r
1084                                         localPair.vertex0 = verticesUsedByWavefront + (linkIndex % 16);\r
1085                                         localPair.vertex1 = verticesUsedByWavefront + (linkIndex % 16);\r
1086                                         m_linkVerticesLocalAddresses[batchAddressInTarget + linkIndex] = localPair;\r
1087                                 }\r
1088 \r
1089                         }\r
1090 \r
1091                         \r
1092                         wavefrontCount++;\r
1093                 }\r
1094 \r
1095         \r
1096         }\r
1097 \r
1098 } // void btSoftBodyLinkDataDX11SIMDAware::generateBatches()\r
1099 \r
1100 \r
1101 \r