[dali_2.3.21] Merge branch 'devel/master'
[platform/core/uifw/dali-toolkit.git] / dali-physics / third-party / bullet3 / src / Bullet3OpenCL / ParallelPrimitives / b3RadixSort32CL.h
1
2 #ifndef B3_RADIXSORT32_H
3 #define B3_RADIXSORT32_H
4
5 #include "b3OpenCLArray.h"
6
// Element type for key/value sorting: two unsigned ints, each reachable under
// two names through an anonymous union (m_key/x and m_value/y).
7 struct b3SortData
8 {
          // First field: m_key and x name the same unsigned int storage.
9         union {
10                 unsigned int m_key;
11                 unsigned int x;
12         };
13
          // Second field: m_value and y name the same unsigned int storage.
14         union {
15                 unsigned int m_value;
16                 unsigned int y;
17         };
18 };
19 #include "b3BufferInfoCL.h"
20
// OpenCL sorter for unsigned int keys and b3SortData key/value pairs (a radix
// sort, going by the class name and the BITS_PER_PASS / NUM_BUCKET constants).
// Only the declaration is in this header; the member functions are defined
// elsewhere.
21 class b3RadixSort32CL
22 {
          // Internal OpenCL arrays. NOTE(review): presumably scratch storage for
          // the sort passes; how each one is used is decided in the
          // implementation file, which is not visible here -- confirm there.
23         b3OpenCLArray<unsigned int>* m_workBuffer1;
24         b3OpenCLArray<unsigned int>* m_workBuffer2;
25
26         b3OpenCLArray<b3SortData>* m_workBuffer3;
27         b3OpenCLArray<b3SortData>* m_workBuffer4;
28
29         b3OpenCLArray<unsigned int>* m_workBuffer3a;
30         b3OpenCLArray<unsigned int>* m_workBuffer4a;
31
          // OpenCL command queue handle. NOTE(review): presumably the queue
          // passed to the constructor -- confirm in the implementation.
32         cl_command_queue m_commandQueue;
33
          // OpenCL kernel handles. Names suggest count / prefix-scan / scatter
          // stages, with separate variants for b3SortData and for plain keys
          // -- TODO confirm against the kernel sources.
34         cl_kernel m_streamCountSortDataKernel;
35         cl_kernel m_streamCountKernel;
36
37         cl_kernel m_prefixScanKernel;
38         cl_kernel m_sortAndScatterSortDataKernel;
39         cl_kernel m_sortAndScatterKernel;
40
          // NOTE(review): presumably true when the target OpenCL device is a
          // CPU -- confirm where it is set.
41         bool m_deviceCPU;
42
          // Helper objects, forward-declared in place via `class X*` so this
          // header does not need their definitions.
43         class b3PrefixScanCL* m_scan;
44         class b3FillCL* m_fill;
45
46 public:
          // Four-int parameter record. NOTE(review): presumably handed to the
          // kernels as per-launch constants -- confirm in the implementation.
47         struct b3ConstData
48         {
49                 int m_n;
50                 int m_nWGs;
51                 int m_startBit;
52                 int m_nBlocksPerWG;
53         };
          // Compile-time tuning constants. Derived values:
          //   ELEMENTS_PER_WORK_ITEM = 256 / 64 = 4
          //   NUM_BUCKET             = 1 << 4   = 16
          //   NUM_WGS                = 20 * 6   = 120
          // The trailing "cypress" / "cayman" / "nv" tags presumably name the
          // GPU families each NUM_WGS value was chosen for -- TODO confirm.
54         enum
55         {
56                 DATA_ALIGNMENT = 256,
57                 WG_SIZE = 64,
58                 BLOCK_SIZE = 256,
59                 ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE / WG_SIZE),
60                 BITS_PER_PASS = 4,
61                 NUM_BUCKET = (1 << BITS_PER_PASS),
62                 //      if you change this, change nPerWI in kernel as well
63                 NUM_WGS = 20 * 6,  //   cypress
64                                                    //                   NUM_WGS = 24*6, //      cayman
65                                                    //                   NUM_WGS = 32*4, //      nv
66         };
67
          // The private section below is empty; access switches straight back
          // to public, so everything that follows is public.
68 private:
69 public:
          // Constructs a sorter from an OpenCL context, device and command
          // queue. initialCapacity defaults to 0; NOTE(review): presumably a
          // pre-allocation hint for the work buffers -- confirm.
70         b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity = 0);
71
72         virtual ~b3RadixSort32CL();
73
          // Separate key and value arrays with distinct input and output
          // arrays; n is an element count and sortBits defaults to 32.
          // NOTE(review): presumably sortBits is how many key bits take part
          // in the sort -- confirm in the implementation.
74         void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
75                                  b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
76
77         ///keys only
78         void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32);
79
          // Overloads taking b3SortData key/value pairs through a single
          // in/out array. NOTE(review): the executeHost variants presumably
          // run the sort on the host CPU; one takes an OpenCL array, the other
          // a b3AlignedObjectArray -- confirm in the implementation.
80         void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
81         void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
82         void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32);
83 };
84 #endif  //B3_RADIXSORT32_H