/*
 * Review notes for this file (HLSL compute-shader source: cbuffer, groupshared,
 * [numthreads], SV_* semantics).
 *
 * NOTE(review): the text below looks damaged. All line breaks are gone, so the
 * #define directives and the "//" comments now sit in the middle of a line, and
 * several spans that began at a '<' character appear to be missing (for example
 * "RWStructuredBuffer sortDataIn" carries no element type, and "(1<>m_startBit"
 * is not a valid expression). Restore the file from the upstream original
 * before trying to compile it; the two lines below are kept exactly as found.
 *
 * What is still readable:
 *
 *  - Constants: WG_SIZE = 128 threads per group, NUM_PER_WI = 4,
 *    BITS_PER_PASS = 4. SortData is a { m_key, m_value } pair of u32. The
 *    SortCB constant buffer carries m_startBit and m_numGroups plus two
 *    padding words.
 *
 *  - SELECT_UINT4(b, a, condition): per component, returns a where the
 *    matching component of condition is non-zero, otherwise b.
 *
 *  - prefixScanVector(data): returns the inclusive running sum of the four
 *    components, i.e. (x, x+y, x+y+z, x+y+z+w).
 *
 *  - prefixScanVectorEx(data): computes the same running sum, returns the
 *    grand total (the .w of the inclusive sum), then subtracts the original
 *    values so that the inout argument holds the exclusive running sum
 *    (0, x, x+y, x+y+z).
 *
 *  - localPrefixSum128V(pData, lIdx, totalSum):
 *      1. Each thread writes 0 to ldsSortData[lIdx] and its four-component
 *         total to ldsSortData[lIdx + WG_SIZE]. The zeroed lower half is what
 *         the scan reads when it reaches below index WG_SIZE.
 *      2. After a group barrier, threads with lIdx < 64 each take
 *         idx = 2*lIdx + WG_SIZE + 1 and accumulate ldsSortData[idx - 1],
 *         [idx - 2], [idx - 4], ... [idx - 64] into ldsSortData[idx], then add
 *         ldsSortData[idx - 2] into ldsSortData[idx - 1].
 *      3. After a second barrier, totalSum is read from
 *         ldsSortData[WG_SIZE*2 - 1], and the value at ldsSortData[lIdx + 127]
 *         (the slot immediately before this thread's own) is added to every
 *         component of pData, which is returned.
 *    NOTE(review): the literals 64 and 127 are tied to WG_SIZE == 128;
 *    changing WG_SIZE alone would break this function.
 *    NOTE(review): the scan steps are separated only by GROUP_MEM_FENCE, which
 *    appears to be defined as empty in this file. Presumably the 64
 *    participating threads are expected to run in lockstep -- confirm this
 *    holds on the target hardware.
 *
 *  - generateHistogram and the kernels that follow it are only partly present.
 *    The surviving tail of the scatter step computes, for each of a thread's
 *    four elements,
 *        dstAddr = ldsGlobalHistogram[radix] + (localAddr - ldsLocalHistogram[radix])
 *    and stores sortData[0..3] to dst at those addresses.
 *
 *  - CopyKernel: each thread copies src[GET_GLOBAL_IDX] to dst[GET_GLOBAL_IDX];
 *    no explicit bounds check is made.
 */
/* 2011 Takahiro Harada */ typedef uint u32; #define GET_GROUP_IDX groupIdx.x #define GET_LOCAL_IDX localIdx.x #define GET_GLOBAL_IDX globalIdx.x #define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync() #define GROUP_MEM_FENCE #define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID #define AtomInc(x) InterlockedAdd(x, 1) #define AtomInc1(x, out) InterlockedAdd(x, 1, out) #define make_uint4 uint4 #define make_uint2 uint2 uint4 SELECT_UINT4(uint4 b,uint4 a,uint4 condition ){ return make_uint4( ((condition).x)?a.x:b.x, ((condition).y)?a.y:b.y, ((condition).z)?a.z:b.z, ((condition).w)?a.w:b.w ); } // takahiro end #define WG_SIZE 128 #define NUM_PER_WI 4 #define GET_GROUP_SIZE WG_SIZE typedef struct { u32 m_key; u32 m_value; }SortData; cbuffer SortCB : register( b0 ) { u32 m_startBit; u32 m_numGroups; u32 m_padding[2]; }; #define BITS_PER_PASS 4 uint4 prefixScanVector( uint4 data ) { data.y += data.x; data.w += data.z; data.z += data.y; data.w += data.y; return data; } uint prefixScanVectorEx( inout uint4 data ) { uint4 backup = data; data.y += data.x; data.w += data.z; data.z += data.y; data.w += data.y; uint sum = data.w; data -= backup; return sum; } RWStructuredBuffer sortDataIn : register( u0 ); RWStructuredBuffer ldsHistogramOut0 : register( u1 ); RWStructuredBuffer ldsHistogramOut1 : register( u2 ); groupshared u32 ldsSortData[ WG_SIZE*NUM_PER_WI + 16 ]; uint4 localPrefixSum128V( uint4 pData, uint lIdx, inout uint totalSum ) { { // Set data ldsSortData[lIdx] = 0; ldsSortData[lIdx+WG_SIZE] = prefixScanVectorEx( pData ); } GROUP_LDS_BARRIER; { // Prefix sum int idx = 2*lIdx + (WG_SIZE+1); if( lIdx < 64 ) { ldsSortData[idx] += ldsSortData[idx-1]; GROUP_MEM_FENCE; ldsSortData[idx] += ldsSortData[idx-2]; GROUP_MEM_FENCE; ldsSortData[idx] += ldsSortData[idx-4]; GROUP_MEM_FENCE; ldsSortData[idx] += ldsSortData[idx-8]; GROUP_MEM_FENCE; ldsSortData[idx] += ldsSortData[idx-16]; GROUP_MEM_FENCE; 
ldsSortData[idx] += ldsSortData[idx-32]; GROUP_MEM_FENCE; ldsSortData[idx] += ldsSortData[idx-64]; GROUP_MEM_FENCE; ldsSortData[idx-1] += ldsSortData[idx-2]; GROUP_MEM_FENCE; } } GROUP_LDS_BARRIER; totalSum = ldsSortData[WG_SIZE*2-1]; uint addValue = ldsSortData[lIdx+127]; return pData + make_uint4(addValue, addValue, addValue, addValue); } void generateHistogram(u32 lIdx, u32 wgIdx, uint4 sortedData) { if( lIdx < (1<>m_startBit, sortData[1].m_key>>m_startBit, sortData[2].m_key>>m_startBit, sortData[3].m_key>>m_startBit ); generateHistogram( lIdx, wgIdx, localKeys ); GROUP_LDS_BARRIER; int nBins = (1< src : register( t0 ); StructuredBuffer histogramGlobalRadixMajor : register( t1 ); StructuredBuffer histogramLocalGroupMajor : register( t2 ); RWStructuredBuffer dst : register( u0 ); groupshared u32 ldsLocalHistogram[ 2*(1<>m_startBit)&cmpValue, (sortData[1].m_key>>m_startBit)&cmpValue, (sortData[2].m_key>>m_startBit)&cmpValue, (sortData[3].m_key>>m_startBit)&cmpValue );; // data is already sorted. So simply subtract local prefix sum uint4 dstAddr; dstAddr.x = ldsGlobalHistogram[radix.x] + (localAddr.x - ldsLocalHistogram[radix.x]); dstAddr.y = ldsGlobalHistogram[radix.y] + (localAddr.y - ldsLocalHistogram[radix.y]); dstAddr.z = ldsGlobalHistogram[radix.z] + (localAddr.z - ldsLocalHistogram[radix.z]); dstAddr.w = ldsGlobalHistogram[radix.w] + (localAddr.w - ldsLocalHistogram[radix.w]); dst[dstAddr.x] = sortData[0]; dst[dstAddr.y] = sortData[1]; dst[dstAddr.z] = sortData[2]; dst[dstAddr.w] = sortData[3]; } [numthreads(WG_SIZE, 1, 1)] void CopyKernel( DEFAULT_ARGS ) { dst[ GET_GLOBAL_IDX ] = src[ GET_GLOBAL_IDX ]; }