using namespace cv;
using namespace cv::ocl;
+// currently sort procedure on the host is more efficient
static bool use_cpu_sorter = true;
-namespace
+// compact structure for corners
+struct DefCorner
{
-enum SortMethod
+ float eig; //eigenvalue of corner
+ short x; //x coordinate of corner point
+ short y; //y coordinate of corner point
+} ;
+
+// compare procedure for corner
+//it is used for sort on the host side
+struct DefCornerCompare
{
- CPU_STL,
- BITONIC,
- SELECTION
-};
-
-const int GROUP_SIZE = 256;
-
-template<SortMethod method>
-struct Sorter
-{
- //typedef EigType;
-};
-
-//TODO(pengx): optimize GPU sorter's performance thus CPU sorter is removed.
-template<>
-struct Sorter<CPU_STL>
-{
- typedef oclMat EigType;
- static cv::Mutex cs;
- static Mat mat_eig;
-
- //prototype
- static int clfloat2Gt(cl_float2 pt1, cl_float2 pt2)
- {
- float v1 = mat_eig.at<float>(cvRound(pt1.s[1]), cvRound(pt1.s[0]));
- float v2 = mat_eig.at<float>(cvRound(pt2.s[1]), cvRound(pt2.s[0]));
- return v1 > v2;
- }
- static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
+ bool operator()(const DefCorner a, const DefCorner b) const
{
- cv::AutoLock lock(cs);
- //temporarily use STL's sort function
- Mat mat_corners = corners;
- mat_eig = eig_tex;
- std::sort(mat_corners.begin<cl_float2>(), mat_corners.begin<cl_float2>() + count, clfloat2Gt);
- corners = mat_corners;
+ return a.eig > b.eig;
}
};
-cv::Mutex Sorter<CPU_STL>::cs;
-cv::Mat Sorter<CPU_STL>::mat_eig;
-template<>
-struct Sorter<BITONIC>
+// sort corner point using opencl bitonicosrt implementation
+static void sortCorners_caller(oclMat& corners, const int count)
{
- typedef TextureCL EigType;
-
- static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
+ Context * cxt = Context::getContext();
+ int GS = count/2;
+ int LS = min(255,GS);
+ size_t globalThreads[3] = {GS, 1, 1};
+ size_t localThreads[3] = {LS, 1, 1};
+
+ // 2^numStages should be equal to count or the output is invalid
+ int numStages = 0;
+ for(int i = count; i > 1; i >>= 1)
{
- Context * cxt = Context::getContext();
- size_t globalThreads[3] = {count / 2, 1, 1};
- size_t localThreads[3] = {GROUP_SIZE, 1, 1};
-
- // 2^numStages should be equal to count or the output is invalid
- int numStages = 0;
- for(int i = count; i > 1; i >>= 1)
- {
- ++numStages;
- }
- const int argc = 5;
- std::vector< std::pair<size_t, const void *> > args(argc);
- std::string kernelname = "sortCorners_bitonicSort";
- args[0] = std::make_pair(sizeof(cl_mem), (void *)&eig_tex);
- args[1] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
- args[2] = std::make_pair(sizeof(cl_int), (void *)&count);
- for(int stage = 0; stage < numStages; ++stage)
- {
- args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
- for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
- {
- args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
- openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
- }
- }
+ ++numStages;
}
-};
-
-template<>
-struct Sorter<SELECTION>
-{
- typedef TextureCL EigType;
-
- static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
+ const int argc = 4;
+ std::vector< std::pair<size_t, const void *> > args(argc);
+ std::string kernelname = "sortCorners_bitonicSort";
+ args[0] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
+ args[1] = std::make_pair(sizeof(cl_int), (void *)&count);
+ for(int stage = 0; stage < numStages; ++stage)
{
- Context * cxt = Context::getContext();
-
- size_t globalThreads[3] = {count, 1, 1};
- size_t localThreads[3] = {GROUP_SIZE, 1, 1};
-
- std::vector< std::pair<size_t, const void *> > args;
- //local
- std::string kernelname = "sortCorners_selectionSortLocal";
- int lds_size = GROUP_SIZE * sizeof(cl_float2);
- args.push_back( std::make_pair( sizeof(cl_mem), (void*)&eig_tex) );
- args.push_back( std::make_pair( sizeof(cl_mem), (void*)&corners.data) );
- args.push_back( std::make_pair( sizeof(cl_int), (void*)&count) );
- args.push_back( std::make_pair( lds_size, (void*)NULL) );
-
- openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
-
- //final
- kernelname = "sortCorners_selectionSortFinal";
- args.pop_back();
- openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
+ args[2] = std::make_pair(sizeof(cl_int), (void *)&stage);
+ for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
+ {
+ args[3] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
+ openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
+ }
}
-};
+}
-int findCorners_caller(
- const TextureCL& eig,
- const float threshold,
- const oclMat& mask,
- oclMat& corners,
- const int max_count)
+// find corners on matrix and put it into array
+void findCorners_caller(
+ const oclMat& eig_mat, //input matrix worth eigenvalues
+ oclMat& eigMinMax, //input with min and max values of eigenvalues
+ const float qualityLevel,
+ const oclMat& mask,
+ oclMat& corners, //output array with detected corners
+ oclMat& counter) //output value with number of detected corners, have to be 0 before call
{
+ string opt;
std::vector<int> k;
Context * cxt = Context::getContext();
std::vector< std::pair<size_t, const void*> > args;
- std::string kernelname = "findCorners";
const int mask_strip = mask.step / mask.elemSize1();
- oclMat g_counter(1, 1, CV_32SC1);
- g_counter.setTo(0);
+ args.push_back(make_pair( sizeof(cl_mem), (void*)&(eig_mat.data)));
- args.push_back(make_pair( sizeof(cl_mem), (void*)&eig ));
+ int src_pitch = (int)eig_mat.step;
+ args.push_back(make_pair( sizeof(cl_int), (void*)&src_pitch ));
args.push_back(make_pair( sizeof(cl_mem), (void*)&mask.data ));
args.push_back(make_pair( sizeof(cl_mem), (void*)&corners.data ));
args.push_back(make_pair( sizeof(cl_int), (void*)&mask_strip));
- args.push_back(make_pair( sizeof(cl_float), (void*)&threshold ));
- args.push_back(make_pair( sizeof(cl_int), (void*)&eig.rows ));
- args.push_back(make_pair( sizeof(cl_int), (void*)&eig.cols ));
- args.push_back(make_pair( sizeof(cl_int), (void*)&max_count ));
- args.push_back(make_pair( sizeof(cl_mem), (void*)&g_counter.data ));
-
- size_t globalThreads[3] = {eig.cols, eig.rows, 1};
+ args.push_back(make_pair( sizeof(cl_mem), (void*)&eigMinMax.data ));
+ args.push_back(make_pair( sizeof(cl_float), (void*)&qualityLevel ));
+ args.push_back(make_pair( sizeof(cl_int), (void*)&eig_mat.rows ));
+ args.push_back(make_pair( sizeof(cl_int), (void*)&eig_mat.cols ));
+ args.push_back(make_pair( sizeof(cl_int), (void*)&corners.cols ));
+ args.push_back(make_pair( sizeof(cl_mem), (void*)&counter.data ));
+
+ size_t globalThreads[3] = {eig_mat.cols, eig_mat.rows, 1};
size_t localThreads[3] = {16, 16, 1};
+ if(!mask.empty())
+ opt += " -D WITH_MASK=1";
- const char * opt = mask.empty() ? "" : "-D WITH_MASK";
- openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1, opt);
- return std::min(Mat(g_counter).at<int>(0), max_count);
+ openCLExecuteKernel(cxt, &imgproc_gftt, "findCorners", globalThreads, localThreads, args, -1, -1, opt.c_str());
+}
+
+
+static void minMaxEig_caller(const oclMat &src, oclMat &dst, oclMat & tozero)
+{
+ size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
+ CV_Assert(groupnum != 0);
+
+ int dbsize = groupnum * 2 * src.elemSize();
+
+ ensureSizeIsEnough(1, dbsize, CV_8UC1, dst);
+
+ cl_mem dst_data = reinterpret_cast<cl_mem>(dst.data);
+
+ int all_cols = src.step / src.elemSize();
+ int pre_cols = (src.offset % src.step) / src.elemSize();
+ int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1;
+ int invalid_cols = pre_cols + sec_cols;
+ int cols = all_cols - invalid_cols , elemnum = cols * src.rows;
+ int offset = src.offset / src.elemSize();
+
+ {// first parallel pass
+ vector<pair<size_t , const void *> > args;
+ args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
+ args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_data ));
+ args.push_back( make_pair( sizeof(cl_int) , (void *)&cols ));
+ args.push_back( make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
+ args.push_back( make_pair( sizeof(cl_int) , (void *)&offset));
+ args.push_back( make_pair( sizeof(cl_int) , (void *)&elemnum));
+ args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum));
+ size_t globalThreads[3] = {groupnum * 256, 1, 1};
+ size_t localThreads[3] = {256, 1, 1};
+ openCLExecuteKernel(src.clCxt, &arithm_minMax, "arithm_op_minMax", globalThreads, localThreads,
+ args, -1, -1, "-D T=float -D DEPTH_5");
+ }
+
+ {// run final "serial" kernel to find accumulate results from threads and reset corner counter
+ vector<pair<size_t , const void *> > args;
+ args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_data ));
+ args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum ));
+ args.push_back( make_pair( sizeof(cl_mem) , (void *)&tozero.data ));
+ size_t globalThreads[3] = {1, 1, 1};
+ size_t localThreads[3] = {1, 1, 1};
+ openCLExecuteKernel(src.clCxt, &imgproc_gftt, "arithm_op_minMax_final", globalThreads, localThreads,
+ args, -1, -1);
+ }
}
-}//unnamed namespace
void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask)
{
ensureSizeIsEnough(image.size(), CV_32F, eig_);
if (useHarrisDetector)
- cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
+ cornerHarris_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
else
cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3);
- double maxVal = 0;
- minMax(eig_, NULL, &maxVal);
+ ensureSizeIsEnough(1,1, CV_32SC1, counter_);
+
+ // find max eigenvalue and reset detected counters
+ minMaxEig_caller(eig_,eig_minmax_,counter_);
- ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
+ // allocate buffer for kernels
+ int corner_array_size = std::max(1024, static_cast<int>(image.size().area() * 0.05));
- Ptr<TextureCL> eig_tex = bindTexturePtr(eig_);
- int total = findCorners_caller(
- *eig_tex,
- static_cast<float>(maxVal * qualityLevel),
+ if(!use_cpu_sorter)
+ { // round to 2^n
+ unsigned int n=1;
+ for(n=1;n<(unsigned int)corner_array_size;n<<=1);
+ corner_array_size = (int)n;
+
+ ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_);
+
+ // set to 0 to be able use bitonic sort on whole 2^n array
+ tmpCorners_.setTo(0);
+ }
+ else
+ {
+ ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_);
+ }
+
+ int total = tmpCorners_.cols; // by default the number of corner is full array
+ vector<DefCorner> tmp(tmpCorners_.cols); // input buffer with corner for HOST part of algorithm
+
+ //find points with high eigenvalue and put it into the output array
+ findCorners_caller(
+ eig_,
+ eig_minmax_,
+ static_cast<float>(qualityLevel),
mask,
tmpCorners_,
- tmpCorners_.cols);
+ counter_);
+
+ if(!use_cpu_sorter)
+ {// sort detected corners on deivce side
+ sortCorners_caller(tmpCorners_, corner_array_size);
+ }
+ else
+ {// send non-blocking request to read real non-zero number of corners to sort it on the HOST side
+ openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(counter_.clCxt), (cl_mem)counter_.data, CL_FALSE, 0,sizeof(int), &total, 0, NULL, NULL));
+ }
+
+ //blocking read whole corners array (sorted or not sorted)
+ openCLReadBuffer(tmpCorners_.clCxt,(cl_mem)tmpCorners_.data,&tmp[0],tmpCorners_.cols*sizeof(DefCorner));
if (total == 0)
- {
+ {// check for trivial case
corners.release();
return;
}
+
if(use_cpu_sorter)
- {
- Sorter<CPU_STL>::sortCorners_caller(eig_, tmpCorners_, total);
- }
- else
- {
- //if total is power of 2
- if(((total - 1) & (total)) == 0)
- {
- Sorter<BITONIC>::sortCorners_caller(*eig_tex, tmpCorners_, total);
- }
- else
- {
- Sorter<SELECTION>::sortCorners_caller(*eig_tex, tmpCorners_, total);
- }
+ {// sort detected corners on cpu side.
+ tmp.resize(total);
+ cv::sort(tmp,DefCornerCompare());
}
+ //estimate maximal size of final output array
+ int total_max = maxCorners > 0 ? std::min(maxCorners, total) : total;
+ int D2 = (int)ceil(minDistance * minDistance);
+ // allocate output buffer
+ vector<Point2f> tmp2;
+ tmp2.reserve(total_max);
+
+
if (minDistance < 1)
- {
- Rect roi_range(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1);
- tmpCorners_(roi_range).copyTo(corners);
+ {// we have not distance restriction. then just copy with conversion maximal allowed points into output array
+ for(int i=0;i<total_max && tmp[i].eig>0.0f;++i)
+ {
+ tmp2.push_back(Point2f(tmp[i].x,tmp[i].y));
+ }
}
else
- {
- vector<Point2f> tmp(total);
- downloadPoints(tmpCorners_, tmp);
-
- vector<Point2f> tmp2;
- tmp2.reserve(total);
-
+ {// we have distance restriction. then start coping to output array from the first element and check distance for each next one
const int cell_size = cvRound(minDistance);
const int grid_width = (image.cols + cell_size - 1) / cell_size;
const int grid_height = (image.rows + cell_size - 1) / cell_size;
- std::vector< std::vector<Point2f> > grid(grid_width * grid_height);
+ std::vector< std::vector<Point2i> > grid(grid_width * grid_height);
- for (int i = 0; i < total; ++i)
+ for (int i = 0; i < total ; ++i)
{
- Point2f p = tmp[i];
+ DefCorner p = tmp[i];
+
+ if(p.eig<=0.0f)
+ break; // condition to stop that is needed for GPU bitonic sort usage.
bool good = true;
{
for (int xx = x1; xx <= x2; xx++)
{
- vector<Point2f>& m = grid[yy * grid_width + xx];
-
- if (!m.empty())
+ vector<Point2i>& m = grid[yy * grid_width + xx];
+ if (m.empty())
+ continue;
+ for(size_t j = 0; j < m.size(); j++)
{
- for(size_t j = 0; j < m.size(); j++)
+ int dx = p.x - m[j].x;
+ int dy = p.y - m[j].y;
+
+ if (dx * dx + dy * dy < D2)
{
- float dx = p.x - m[j].x;
- float dy = p.y - m[j].y;
-
- if (dx * dx + dy * dy < minDistance * minDistance)
- {
- good = false;
- goto break_out;
- }
+ good = false;
+ goto break_out_;
}
}
}
}
- break_out:
+ break_out_:
if(good)
{
- grid[y_cell * grid_width + x_cell].push_back(p);
+ grid[y_cell * grid_width + x_cell].push_back(Point2i(p.x,p.y));
- tmp2.push_back(p);
+ tmp2.push_back(Point2f(p.x,p.y));
if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
break;
}
}
- corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
}
+ int final_size = static_cast<int>(tmp2.size());
+ if(final_size>0)
+ corners.upload(Mat(1, final_size, CV_32FC2, &tmp2[0]));
+ else
+ corners.release();
}
void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, vector<Point2f> &points_v)
{
#ifndef WITH_MASK
#define WITH_MASK 0
#endif
-
-__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
-
-inline float ELEM_INT2(image2d_t _eig, int _x, int _y)
-{
- return read_imagef(_eig, sampler, (int2)(_x, _y)).x;
-}
-
-inline float ELEM_FLT2(image2d_t _eig, float2 pt)
-{
- return read_imagef(_eig, sampler, pt).x;
-}
+//macro to read eigenvalue matrix
+#define GET_SRC_32F(_x, _y) ((__global const float*)(eig + (_y)*eig_pitch))[_x]
__kernel
void findCorners
(
- image2d_t eig,
- __global const char * mask,
- __global float2 * corners,
- const int mask_strip,// in pixels
- const float threshold,
- const int rows,
- const int cols,
- const int max_count,
- __global int * g_counter
+ __global const char* eig,
+ const int eig_pitch,
+ __global const char* mask,
+ __global float2* corners,
+ const int mask_strip,// in pixels
+ __global const float* pMinMax,
+ const float qualityLevel,
+ const int rows,
+ const int cols,
+ const int max_count,
+ __global int* g_counter
)
{
+ float threshold = qualityLevel*pMinMax[1];
const int j = get_global_id(0);
const int i = get_global_id(1);
#endif
)
{
- const float val = ELEM_INT2(eig, j, i);
+ const float val = GET_SRC_32F(j, i);
if (val > threshold)
{
float maxVal = val;
+ maxVal = fmax(GET_SRC_32F(j - 1, i - 1), maxVal);
+ maxVal = fmax(GET_SRC_32F(j , i - 1), maxVal);
+ maxVal = fmax(GET_SRC_32F(j + 1, i - 1), maxVal);
- maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal);
- maxVal = fmax(ELEM_INT2(eig, j , i - 1), maxVal);
- maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal);
-
- maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal);
- maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal);
+ maxVal = fmax(GET_SRC_32F(j - 1, i), maxVal);
+ maxVal = fmax(GET_SRC_32F(j + 1, i), maxVal);
- maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal);
- maxVal = fmax(ELEM_INT2(eig, j , i + 1), maxVal);
- maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal);
+ maxVal = fmax(GET_SRC_32F(j - 1, i + 1), maxVal);
+ maxVal = fmax(GET_SRC_32F(j , i + 1), maxVal);
+ maxVal = fmax(GET_SRC_32F(j + 1, i + 1), maxVal);
if (val == maxVal)
{
const int ind = atomic_inc(g_counter);
if (ind < max_count)
- corners[ind] = (float2)(j, i);
+ {// pack and store eigenvalue and its coordinates
+ corners[ind].x = val;
+ corners[ind].y = as_float(j|(i<<16));
+ }
}
}
}
}
+#undef GET_SRC_32F
+
//bitonic sort
__kernel
void sortCorners_bitonicSort
(
- image2d_t eig,
__global float2 * corners,
const int count,
const int stage,
const float2 leftPt = corners[leftId];
const float2 rightPt = corners[rightId];
- const float leftVal = ELEM_FLT2(eig, leftPt);
- const float rightVal = ELEM_FLT2(eig, rightPt);
+ const float leftVal = leftPt.x;
+ const float rightVal = rightPt.x;
const bool compareResult = leftVal > rightVal;
corners[rightId] = sortOrder ? greater : lesser;
}
-//selection sort for gfft
-//kernel is ported from Bolt library:
-//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
-// Local sort will firstly sort elements of each workgroup using selection sort
-// its performance is O(n)
-__kernel
- void sortCorners_selectionSortLocal
- (
- image2d_t eig,
- __global float2 * corners,
- const int count,
- __local float2 * scratch
- )
+// this is simple short serial kernel that makes some short reduction and initialization work
+// it makes HOST like work to avoid additional sync with HOST to do this short work
+// data - input/output float2.
+// input data are sevral (min,max) pairs
+// output data is one reduced (min,max) pair
+// g_counter - counter that have to be initialized by 0 for next findCorner call.
+__kernel void arithm_op_minMax_final(__global float * data, int groupnum,__global int * g_counter)
{
- int i = get_local_id(0); // index in workgroup
- int numOfGroups = get_num_groups(0); // index in workgroup
- int groupID = get_group_id(0);
- int wg = get_local_size(0); // workgroup size = block size
- int n; // number of elements to be processed for this work group
-
- int offset = groupID * wg;
- int same = 0;
- corners += offset;
- n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
- float2 pt1, pt2;
-
- pt1 = corners[min(i, n)];
- scratch[i] = pt1;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if(i >= n)
+ g_counter[0] = 0;
+ float minVal = data[0];
+ float maxVal = data[groupnum];
+ for(int i=1;i<groupnum;++i)
{
- return;
+ minVal = min(minVal,data[i]);
+ maxVal = max(maxVal,data[i+groupnum]);
}
-
- float val1 = ELEM_FLT2(eig, pt1);
- float val2;
-
- int pos = 0;
- for (int j=0;j<n;++j)
- {
- pt2 = scratch[j];
- val2 = ELEM_FLT2(eig, pt2);
- if(val2 > val1)
- pos++;//calculate the rank of this element in this work group
- else
- {
- if(val1 > val2)
- continue;
- else
- {
- // val1 and val2 are same
- same++;
- }
- }
- }
- for (int j=0; j< same; j++)
- corners[pos + j] = pt1;
-}
-__kernel
- void sortCorners_selectionSortFinal
- (
- image2d_t eig,
- __global float2 * corners,
- const int count
- )
-{
- const int i = get_local_id(0); // index in workgroup
- const int numOfGroups = get_num_groups(0); // index in workgroup
- const int groupID = get_group_id(0);
- const int wg = get_local_size(0); // workgroup size = block size
- int pos = 0, same = 0;
- const int offset = get_group_id(0) * wg;
- const int remainder = count - wg*(numOfGroups-1);
-
- if((offset + i ) >= count)
- return;
- float2 pt1, pt2;
- pt1 = corners[groupID*wg + i];
-
- float val1 = ELEM_FLT2(eig, pt1);
- float val2;
-
- for(int j=0; j<numOfGroups-1; j++ )
- {
- for(int k=0; k<wg; k++)
- {
- pt2 = corners[j*wg + k];
- val2 = ELEM_FLT2(eig, pt2);
- if(val1 > val2)
- break;
- else
- {
- //Increment only if the value is not the same.
- if( val2 > val1 )
- pos++;
- else
- same++;
- }
- }
- }
-
- for(int k=0; k<remainder; k++)
- {
- pt2 = corners[(numOfGroups-1)*wg + k];
- val2 = ELEM_FLT2(eig, pt2);
- if(val1 > val2)
- break;
- else
- {
- //Don't increment if the value is the same.
- //Two elements are same if (*userComp)(jData, iData) and (*userComp)(iData, jData) are both false
- if(val2 > val1)
- pos++;
- else
- same++;
- }
- }
- for (int j=0; j< same; j++)
- corners[pos + j] = pt1;
-}
+ data[0] = minVal;
+ data[1] = maxVal;
+}
\ No newline at end of file