MSTRINGIFY(

/*
Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
Copyright (C) 2006 - 2009 Sony Computer Entertainment Inc. 

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose, 
including commercial applications, and to alter it and redistribute it freely, 
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/



int4 getGridPos(float4 worldPos, __global float4* pParams)
{
    int4 gridPos;
    gridPos.x = (int)floor((worldPos.x - pParams[1].x) / pParams[3].x);
    gridPos.y = (int)floor((worldPos.y - pParams[1].y) / pParams[3].y);
    gridPos.z = (int)floor((worldPos.z - pParams[1].z) / pParams[3].z);
    return gridPos;
}

unsigned int getPosHash(int4 gridPos, __global float4* pParams)
{
	int4 gridDim = *((__global int4*)(pParams + 4));
	if(gridPos.x < 0) gridPos.x = 0;
	if(gridPos.x >= gridDim.x) gridPos.x = gridDim.x - 1;
	if(gridPos.y < 0) gridPos.y = 0;
	if(gridPos.y >= gridDim.y) gridPos.y = gridDim.y - 1;
	if(gridPos.z < 0) gridPos.z = 0;
	if(gridPos.z >= gridDim.z) gridPos.z = gridDim.z - 1;
	unsigned int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;
	return hash;
} 


__kernel void kComputeCellId(	int numParticles, 
								__global float4* pPos, 
								__global int2* pPosHash,
								__global float4* pParams GUID_ARG)
{
    int index = get_global_id(0);
    if(index >= numParticles)
    {
		return;
    }
	float4 pos = pPos[index];
	int4 gridPos = getGridPos(pos, pParams);
	unsigned int hash = getPosHash(gridPos, pParams);
	pPosHash[index].x = hash;
	pPosHash[index].y = index;
}

__kernel void kClearCellStart(	int numCells, 
								__global int* pCellStart GUID_ARG)
{
    int index = get_global_id(0);
    if(index >= numCells)
	{
		return;
	}
	pCellStart[index] = -1;
}

__kernel void kFindCellStart(	int numParticles, 
								__global int2* pHash, 
								__global int* cellStart,
								__global float4* pPos,
								__global float4* pVel,
								__global float4* pSortedPos,
								__global float4* pSortedVel GUID_ARG)
{
    int index = get_global_id(0);
	__local int sharedHash[1025];//maximum workgroup size 1024
	int2 sortedData;
	
    if(index < numParticles)
	{

		sortedData = pHash[index];
		// Load hash data into shared memory so that we can look 
		// at neighboring body's hash value without loading
		// two hash values per thread
		sharedHash[get_local_id(0) + 1] = sortedData.x;
		if((index > 0) && (get_local_id(0) == 0))
		{
			// first thread in block must load neighbor body hash
			sharedHash[0] = pHash[index-1].x;
		}
		
	}
    barrier(CLK_LOCAL_MEM_FENCE);
	
	if(index < numParticles)
	{
		if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))
		{
			cellStart[sortedData.x] = index;
		}
		int unsortedIndex = sortedData.y;
		float4 pos = pPos[unsortedIndex];
		float4 vel = pVel[unsortedIndex];
		pSortedPos[index] = pos;
		pSortedVel[index] = vel;
	}
}

__kernel void kIntegrateMotion(	int numParticles,
								__global float4* pPos, 
								__global float4* pVel, 
								__global float4* pParams, 
								float timeStep GUID_ARG)
{
    int index = get_global_id(0);
    if(index >= numParticles)
    {
		return;
	}
    float4 pos = pPos[index];
    float4 vel = pVel[index];
    pos.w = 1.0f;
    vel.w = 0.0f;
    // apply gravity
    float4 gravity = *((__global float4*)(pParams + 0));
    float particleRad = pParams[5].x;
    float globalDamping = pParams[5].y;
    float boundaryDamping = pParams[5].z;
    vel += gravity * timeStep;
    vel *= globalDamping;
    // integrate position
    pos += vel * timeStep;
    // collide with world boundaries
    float4 worldMin = *((__global float4*)(pParams + 1));
    float4 worldMax = *((__global float4*)(pParams + 2));
    
    
    if(pos.x < (worldMin.x + 2*particleRad))
    {
        pos.x = worldMin.x + 2*particleRad;
        vel.x *= boundaryDamping;
    }
    if(pos.x > (worldMax.x - 2*particleRad))
    {
        pos.x = worldMax.x - 2*particleRad;
        vel.x *= boundaryDamping;
    }
    if(pos.y < (worldMin.y + 2*particleRad))
    {
        pos.y = worldMin.y + 2*particleRad;
        vel.y *= boundaryDamping;
    }
    if(pos.y > (worldMax.y - 2*particleRad))
    {
        pos.y = worldMax.y - 2*particleRad;
        vel.y *= boundaryDamping;
    }
    if(pos.z < (worldMin.z + 2*particleRad))
    {
        pos.z = worldMin.z + 2*particleRad;
        vel.z *= boundaryDamping;
    }
    if(pos.z > (worldMax.z - 2*particleRad))
    {
        pos.z = worldMax.z - 2*particleRad;
        vel.z *= boundaryDamping;
    }
    // write back position and velocity
    pPos[index] = pos;
    pVel[index] = vel;
}


float4 collideTwoParticles(
    float4 posA,
    float4 posB,
    float4 velA,
    float4 velB,
    float radiusA,
    float radiusB,
    float spring,
    float damping,
    float shear,
    float attraction
)
{
    //Calculate relative position
    float4     relPos = posB - posA; relPos.w = 0.f;
    float        dist = sqrt(relPos.x * relPos.x + relPos.y * relPos.y + relPos.z * relPos.z);
    float collideDist = radiusA + radiusB;

    float4 force = (float4)0.f;
    if(dist < collideDist){
        float4 norm = relPos * (1.f / dist); norm.w = 0.f;

        //Relative velocity
        float4 relVel = velB - velA; relVel.w = 0.f;

        //Relative tangential velocity
        float relVelDotNorm = relVel.x * norm.x + relVel.y * norm.y + relVel.z * norm.z;
        float4 tanVel = relVel - norm * relVelDotNorm;  tanVel.w = 0.f;

        //Spring force (potential)
        float springFactor = -spring * (collideDist - dist);
        force = springFactor * norm + damping * relVel + shear * tanVel + attraction * relPos;
        force.w = 0.f;
    }
    return force;
}


__kernel void kCollideParticles(int numParticles,
								__global float4*		pVel,          //output: new velocity
								__global const float4* pSortedPos, //input: reordered positions
								__global const float4* pSortedVel, //input: reordered velocities
								__global const int2   *pPosHash,        //input: reordered particle indices
								__global const int   *pCellStart,    //input: cell boundaries
								__global float4* pParams GUID_ARG)
{
    int index = get_global_id(0);
    if(index >= numParticles)
	{
        return;
	}

    float4   posA = pSortedPos[index];
    float4   velA = pSortedVel[index];
    float4 force = (float4)0.f;
    float particleRad = pParams[5].x;
    float collisionDamping = pParams[5].w;
    float spring = pParams[6].x;
    float shear = pParams[6].y;
    float attraction = pParams[6].z;
    int unsortedIndex = pPosHash[index].y;

    //Get address in grid
    int4 gridPosA = getGridPos(posA, pParams);

    //Accumulate surrounding cells
    int4 gridPosB; 
    for(int z = -1; z <= 1; z++)
	{
		gridPosB.z = gridPosA.z + z;
        for(int y = -1; y <= 1; y++)
		{
			gridPosB.y = gridPosA.y + y;
            for(int x = -1; x <= 1; x++)
            {
				gridPosB.x = gridPosA.x + x;
                //Get start particle index for this cell
                uint hashB = getPosHash(gridPosB, pParams);
                int startI = pCellStart[hashB];
                //Skip empty cell
                if(startI < 0)
                {
                    continue;
                }
               //Iterate over particles in this cell
                int endI = startI + 32;
                if(endI >= numParticles) 
					endI = numParticles ;
					
                for(int j = startI; j < endI; j++)
                {
					uint hashC = pPosHash[j].x;
					if(hashC != hashB)
					{
						break;
					}
					if(j == index)
					{
						continue;
					}
                    float4 posB = pSortedPos[j];
                    float4 velB = pSortedVel[j];
                    //Collide two spheres
                    force += collideTwoParticles(	posA, posB, velA, velB, particleRad, particleRad, 
													spring, collisionDamping, shear, attraction);
                }
            }
		}
	}     
    //Write new velocity back to original unsorted location
    pVel[unsortedIndex] = velA + force;
}






/*
 * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and 
 * proprietary rights in and to this software and related documentation. 
 * Any use, reproduction, disclosure, or distribution of this software 
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA) 
 * associated with this source code for terms and conditions that govern 
 * your use of this NVIDIA software.
 * 
 */



inline void ComparatorPrivate(int2* keyA, int2* keyB, uint dir)
{
    if((keyA[0].x > keyB[0].x) == dir)
    {
		int2 tmp = *keyA;
		*keyA = *keyB;
		*keyB = tmp;
    }
}

inline void ComparatorLocal(__local int2* keyA, __local int2* keyB, uint dir)
{
    if((keyA[0].x > keyB[0].x) == dir)
    {
		int2 tmp = *keyA;
		*keyA = *keyB;
		*keyB = tmp;
    }
}

////////////////////////////////////////////////////////////////////////////////
// Monolithic bitonic sort kernel for short arrays fitting into local memory
////////////////////////////////////////////////////////////////////////////////
__kernel void kBitonicSortCellIdLocal(__global int2* pKey, uint arrayLength, uint dir GUID_ARG)
{
    __local int2 l_key[LOCAL_SIZE_MAX];
    int localSizeLimit = get_local_size(0) * 2;

    //Offset to the beginning of subbatch and load data
    pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
    l_key[get_local_id(0) +                    0] = pKey[                   0];
    l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];

    for(uint size = 2; size < arrayLength; size <<= 1)
    {
        //Bitonic merge
        uint ddd = dir ^ ( (get_local_id(0) & (size / 2)) != 0 );
        for(uint stride = size / 2; stride > 0; stride >>= 1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
            ComparatorLocal(&l_key[pos +      0], &l_key[pos + stride], ddd);
        }
    }

    //ddd == dir for the last bitonic merge step
    {
        for(uint stride = arrayLength / 2; stride > 0; stride >>= 1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
            ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], dir);
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);
    pKey[                   0] = l_key[get_local_id(0) +                    0];
    pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
}

////////////////////////////////////////////////////////////////////////////////
// Bitonic sort kernel for large arrays (not fitting into local memory)
////////////////////////////////////////////////////////////////////////////////
//Bottom-level bitonic sort
//Almost the same as bitonicSortLocal with the only exception
//of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being
//sorted in opposite directions
__kernel void kBitonicSortCellIdLocal1(__global int2* pKey GUID_ARG)
{
    __local int2 l_key[LOCAL_SIZE_MAX];
    uint localSizeLimit = get_local_size(0) * 2;

    //Offset to the beginning of subarray and load data
    pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
    l_key[get_local_id(0) +                    0] = pKey[                   0];
    l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];

    uint comparatorI = get_global_id(0) & ((localSizeLimit / 2) - 1);

    for(uint size = 2; size < localSizeLimit; size <<= 1)
    {
        //Bitonic merge
        uint ddd = (comparatorI & (size / 2)) != 0;
        for(uint stride = size / 2; stride > 0; stride >>= 1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
            ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
        }
    }

    //Odd / even arrays of localSizeLimit elements
    //sorted in opposite directions
    {
        uint ddd = (get_group_id(0) & 1);
        for(uint stride = localSizeLimit / 2; stride > 0; stride >>= 1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
            ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);
    pKey[                   0] = l_key[get_local_id(0) +                    0];
    pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
}

//Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT
__kernel void kBitonicSortCellIdMergeGlobal(__global int2* pKey, uint arrayLength, uint size, uint stride, uint dir GUID_ARG)
{
    uint global_comparatorI = get_global_id(0);
    uint        comparatorI = global_comparatorI & (arrayLength / 2 - 1);

    //Bitonic merge
    uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
    uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));

    int2 keyA = pKey[pos +      0];
    int2 keyB = pKey[pos + stride];

    ComparatorPrivate(&keyA, &keyB, ddd);

    pKey[pos +      0] = keyA;
    pKey[pos + stride] = keyB;
}

//Combined bitonic merge steps for
//'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]
__kernel void kBitonicSortCellIdMergeLocal(__global int2* pKey, uint arrayLength, uint stride, uint size, uint dir GUID_ARG)
{
    __local int2 l_key[LOCAL_SIZE_MAX];
    int localSizeLimit = get_local_size(0) * 2;

    pKey += get_group_id(0) * localSizeLimit + get_local_id(0);
    l_key[get_local_id(0) +                    0] = pKey[                   0];
    l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];

    //Bitonic merge
    uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);
    uint         ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );
    for(; stride > 0; stride >>= 1)
    {
        barrier(CLK_LOCAL_MEM_FENCE);
        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
        ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);
    }

    barrier(CLK_LOCAL_MEM_FENCE);
    pKey[                   0] = l_key[get_local_id(0) +                    0];
    pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];
}

);