Initialize libbullet git in 2.0_beta.
[platform/upstream/libbullet.git] / Extras / RigidBodyGpuPipeline / opencl / primitives / AdlPrimitives / Sort / radixsortadvanced.inl
1 /*
2                 2011 Takahiro Harada
3 */
4
// Kernel file location and kernel entry-point names that
// RadixSortAdvanced<type>::allocate() passes to Device::getKernel().
#define PATH    "..\\..\\AdlPrimitives\\Sort\\RadixSortAdvancedKernels"
#define KERNEL0 "StreamCountKernel"      // fetched into Data::m_localCountKernel
#define KERNEL1 "SortAndScatterKernel1"  // fetched into Data::m_scatterKernel
#define KERNEL2 "PrefixScanKernel"       // fetched into Data::m_scanKernel
9
10 template<DeviceType type>
11 class RadixSortAdvanced : public RadixSortBase
12 {
13         public:
14                 typedef Launcher::BufferInfo BufferInfo;
15
16                 enum
17                 {
18                         WG_SIZE = 128,
19                         NUM_PER_WI = 4,
20                         MAX_NUM_WORKGROUPS = 60,
21                 };
22
23                 struct Data : public RadixSort<type>::Data
24                 {
25                         Kernel* m_localCountKernel;
26                         Kernel* m_scatterKernel;
27                         Kernel* m_scanKernel;
28
29                         Buffer<u32>* m_workBuffer0;
30                         Buffer<SortData>* m_workBuffer1;
31                         Buffer<int4>* m_constBuffer[32/4];
32                 };
33                 
34
35                 static
36                 Data* allocate(const Device* deviceData, int maxSize, Option option = SORT_NORMAL);
37
38                 static
39                 void deallocate(void* data);
40
41                 static
42                 void execute(void* data, Buffer<SortData>& inout, int n, int sortBits);
43 };
44
45 template<DeviceType type>
46 typename RadixSortAdvanced<type>::Data* RadixSortAdvanced<type>::allocate(const Device* deviceData, int maxSize, Option option)
47 {
48         ADLASSERT( type == deviceData->m_type );
49
50         const char* src[] = { 0, 0, 0 };
51
52         Data* data = new Data;
53         data->m_option = option;
54         data->m_deviceData = deviceData;
55
56         data->m_localCountKernel = deviceData->getKernel( PATH, KERNEL0, 0, src[type] );
57         data->m_scatterKernel = deviceData->getKernel( PATH, KERNEL1, 0, src[type] );
58         data->m_scanKernel = deviceData->getKernel( PATH, KERNEL2, 0, src[type] );
59
60         data->m_workBuffer0 = new Buffer<u32>( deviceData, MAX_NUM_WORKGROUPS*16 );
61         data->m_workBuffer1 = new Buffer<SortData>( deviceData, maxSize );
62         for(int i=0; i<32/4; i++)
63                 data->m_constBuffer[i] = new Buffer<int4>( deviceData, 1, BufferBase::BUFFER_CONST );
64         data->m_maxSize = maxSize;
65
66         return data;
67 }
68
69 template<DeviceType type>
70 void RadixSortAdvanced<type>::deallocate(void* rawData)
71 {
72         Data* data = (Data*)rawData;
73
74         delete data->m_workBuffer0;
75         delete data->m_workBuffer1;
76         for(int i=0; i<32/4; i++)
77                 delete data->m_constBuffer[i];
78         
79         delete data;
80 }
81
82 template<DeviceType type>
83 void RadixSortAdvanced<type>::execute(void* rawData, Buffer<SortData>& inout, int n, int sortBits)
84 {
85         Data* data = (Data*)rawData;
86
87         ADLASSERT( sortBits == 32 );
88
89         ADLASSERT( NUM_PER_WI == 4 );
90         ADLASSERT( n%(WG_SIZE*NUM_PER_WI) == 0 );
91         ADLASSERT( MAX_NUM_WORKGROUPS < 128*8/16 );
92
93         Buffer<SortData>* src = &inout;
94         Buffer<SortData>* dst = data->m_workBuffer1;
95
96         const Device* deviceData = data->m_deviceData;
97
98         int nBlocks = n/(NUM_PER_WI*WG_SIZE);
99         const int nWorkGroupsToExecute = min2((int)MAX_NUM_WORKGROUPS, nBlocks);
100         int nBlocksPerGroup = (nBlocks+nWorkGroupsToExecute-1)/nWorkGroupsToExecute;
101         ADLASSERT( nWorkGroupsToExecute <= MAX_NUM_WORKGROUPS );
102
103         int4 constBuffer = make_int4(0, nBlocks, nWorkGroupsToExecute, nBlocksPerGroup);
104
105         int iPass = 0;
106         int startBit = 0;
107         for(int startBit=0; startBit<32; startBit+=4, iPass++)
108         {
109                 constBuffer.x = startBit;
110
111                 {
112                         BufferInfo bInfo[] = { BufferInfo( src, true ), BufferInfo( data->m_workBuffer0 ) };
113
114                         Launcher launcher( deviceData, data->m_localCountKernel );
115                         launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
116                         launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
117                         launcher.launch1D( WG_SIZE* nWorkGroupsToExecute, WG_SIZE );
118                 }
119
120
121                 {
122                         BufferInfo bInfo[] = { BufferInfo( data->m_workBuffer0 ) };
123
124                         Launcher launcher( deviceData, data->m_scanKernel );
125                         launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
126                         launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
127                         launcher.launch1D( WG_SIZE, WG_SIZE );
128                 }
129
130                 {
131                         BufferInfo bInfo[] = { BufferInfo( data->m_workBuffer0, true ), BufferInfo( src ), BufferInfo( dst ) };
132
133                         Launcher launcher( deviceData, data->m_scatterKernel );
134                         launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
135                         launcher.setConst( *data->m_constBuffer[iPass], constBuffer );
136                         launcher.launch1D( WG_SIZE*nWorkGroupsToExecute, WG_SIZE );
137                 }
138
139                 swap2( src, dst );
140         }
141 }
142
// Remove the helper macros defined at the top of this file so they do not
// leak into code that includes it.
#undef PATH
#undef KERNEL0
#undef KERNEL1
#undef KERNEL2