modules/nonfree/src/opencl/surf.cl

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Peng Xiao, pengxiao@multicorewareinc.com
  19 //    Sen Liu, swjtuls1987@126.com
  20 //
  21 // Redistribution and use in source and binary forms, with or without modification,
  22 // are permitted provided that the following conditions are met:
  23 //
  24 //   * Redistribution's of source code must retain the above copyright notice,
  25 //     this list of conditions and the following disclaimer.
  26 //
  27 //   * Redistribution's in binary form must reproduce the above copyright notice,
  28 //     this list of conditions and the following disclaimer in the documentation
  29 //     and/or other materials provided with the distribution.
  30 //
  31 //   * The name of the copyright holders may not be used to endorse or promote products
  32 //     derived from this software without specific prior written permission.
  33 //
  34 // This software is provided by the copyright holders and contributors as is and
  35 // any express or implied warranties, including, but not limited to, the implied
  36 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  37 // In no event shall the Intel Corporation or contributors be liable for any direct,
  38 // indirect, incidental, special, exemplary, or consequential damages
  39 // (including, but not limited to, procurement of substitute goods or services;
  40 // loss of use, data, or profits; or business interruption) however caused
  41 // and on any theory of liability, whether in contract, strict liability,
  42 // or tort (including negligence or otherwise) arising in any way out of
  43 // the use of this software, even if advised of the possibility of such damage.
  44 //
  45 //M*/
  46
  47 // specialized for non-image2d_t supported platform, intel HD4000, for example
  48 #ifdef DISABLE_IMAGE2D
  49 #define IMAGE_INT32 __global uint  *
  50 #define IMAGE_INT8  __global uchar *
  51 #else
  52 #define IMAGE_INT32 image2d_t
  53 #define IMAGE_INT8  image2d_t
  54 #endif
  55
  56 uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols, int elemPerRow)
  57 {
  58 #ifdef DISABLE_IMAGE2D
  59     int x = clamp(coord.x, 0, cols);
  60     int y = clamp(coord.y, 0, rows);
  61     return img[elemPerRow * y + x];
  62 #else
  63     return read_imageui(img, sam, coord).x;
  64 #endif
  65 }
  66 uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow)
  67 {
  68 #ifdef DISABLE_IMAGE2D
  69     int x = clamp(convert_int_rte(coord.x), 0, cols - 1);
  70     int y = clamp(convert_int_rte(coord.y), 0, rows - 1);
  71     return img[elemPerRow * y + x];
  72 #else
  73     return (uchar)read_imageui(img, sam, coord).x;
  74 #endif
  75 }
  76
  77 // dynamically change the precision used for floating type
  78
  79 #if defined (DOUBLE_SUPPORT)
  80 #ifdef cl_khr_fp64
  81 #pragma OPENCL EXTENSION cl_khr_fp64:enable
  82 #elif defined (cl_amd_fp64)
  83 #pragma OPENCL EXTENSION cl_amd_fp64:enable
  84 #endif
  85 #define F double
  86 #else
  87 #define F float
  88 #endif
  89
  90 // Image read mode
  91 __constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
  92
  93 #ifndef FLT_EPSILON
  94 #define FLT_EPSILON (1e-15)
  95 #endif
  96
  97 #ifndef CV_PI_F
  98 #define CV_PI_F 3.14159265f
  99 #endif
 100
 101 // Use integral image to calculate haar wavelets.
 102 // N = 2
 103 // for simple haar paatern
 104 float icvCalcHaarPatternSum_2(
 105     IMAGE_INT32 sumTex,
 106     __constant float2 *src,
 107     int oldSize,
 108     int newSize,
 109     int y, int x,
 110     int rows, int cols, int elemPerRow)
 111 {
 112
 113     float ratio = (float)newSize / oldSize;
 114
 115     F d = 0;
 116
 117     int2 dx1 = convert_int2_rte(ratio * src[0]);
 118     int2 dy1 = convert_int2_rte(ratio * src[1]);
 119     int2 dx2 = convert_int2_rte(ratio * src[2]);
 120     int2 dy2 = convert_int2_rte(ratio * src[3]);
 121
 122     F t = 0;
 123     t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
 124     t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
 125     t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
 126     t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
 127     d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
 128
 129     t = 0;
 130     t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
 131     t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
 132     t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
 133     t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
 134     d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
 135
 136     return (float)d;
 137 }
 138
 139 // N = 3
 140 float icvCalcHaarPatternSum_3(
 141     IMAGE_INT32 sumTex,
 142     __constant float4 *src,
 143     int oldSize,
 144     int newSize,
 145     int y, int x,
 146     int rows, int cols, int elemPerRow)
 147 {
 148
 149     float ratio = (float)newSize / oldSize;
 150
 151     F d = 0;
 152
 153     int4 dx1 = convert_int4_rte(ratio * src[0]);
 154     int4 dy1 = convert_int4_rte(ratio * src[1]);
 155     int4 dx2 = convert_int4_rte(ratio * src[2]);
 156     int4 dy2 = convert_int4_rte(ratio * src[3]);
 157
 158     F t = 0;
 159     t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
 160     t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
 161     t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
 162     t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
 163     d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
 164
 165     t = 0;
 166     t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
 167     t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
 168     t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
 169     t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
 170     d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
 171
 172     t = 0;
 173     t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow );
 174     t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow );
 175     t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow );
 176     t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow );
 177     d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z));
 178
 179     return (float)d;
 180 }
 181
 182 // N = 4
 183 float icvCalcHaarPatternSum_4(
 184     IMAGE_INT32 sumTex,
 185     __constant float4 *src,
 186     int oldSize,
 187     int newSize,
 188     int y, int x,
 189     int rows, int cols, int elemPerRow)
 190 {
 191
 192     float ratio = (float)newSize / oldSize;
 193
 194     F d = 0;
 195
 196     int4 dx1 = convert_int4_rte(ratio * src[0]);
 197     int4 dy1 = convert_int4_rte(ratio * src[1]);
 198     int4 dx2 = convert_int4_rte(ratio * src[2]);
 199     int4 dy2 = convert_int4_rte(ratio * src[3]);
 200
 201     F t = 0;
 202     t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
 203     t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
 204     t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
 205     t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
 206     d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
 207
 208     t = 0;
 209     t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
 210     t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
 211     t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
 212     t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
 213     d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
 214
 215     t = 0;
 216     t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow );
 217     t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow );
 218     t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow );
 219     t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow );
 220     d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z));
 221
 222     t = 0;
 223     t += read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy1.w), rows, cols, elemPerRow );
 224     t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy2.w), rows, cols, elemPerRow );
 225     t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy1.w), rows, cols, elemPerRow );
 226     t += read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy2.w), rows, cols, elemPerRow );
 227     d += t * src[4].w / ((dx2.w - dx1.w) * (dy2.w - dy1.w));
 228
 229     return (float)d;
 230 }
 231
 232 ////////////////////////////////////////////////////////////////////////
 233 // Hessian
 234
 235 __constant float4 c_DX[5] = { (float4)(0, 3, 6, 0), (float4)(2, 2, 2, 0), (float4)(3, 6, 9, 0), (float4)(7, 7, 7, 0), (float4)(1, -2, 1, 0) };
 236 __constant float4 c_DY[5] = { (float4)(2, 2, 2, 0), (float4)(0, 3, 6, 0), (float4)(7, 7, 7, 0), (float4)(3, 6, 9, 0), (float4)(1, -2, 1, 0) };
 237 __constant float4 c_DXY[5] = { (float4)(1, 5, 1, 5), (float4)(1, 1, 5, 5), (float4)(4, 8, 4, 8), (float4)(4, 4, 8, 8), (float4)(1, -1, -1, 1) };// Use integral image to calculate haar wavelets.
 238
 239 __inline int calcSize(int octave, int layer)
 240 {
 241     /* Wavelet size at first layer of first octave. */
 242     const int HAAR_SIZE0 = 9;
 243
 244     /* Wavelet size increment between layers. This should be an even number,
 245     such that the wavelet sizes in an octave are either all even or all odd.
 246     This ensures that when looking for the neighbours of a sample, the layers
 247     above and below are aligned correctly. */
 248     const int HAAR_SIZE_INC = 6;
 249
 250     return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
 251 }
 252
 253
 254 //calculate targeted layer per-pixel determinant and trace with an integral image
 255 __kernel void icvCalcLayerDetAndTrace(
 256     IMAGE_INT32 sumTex, // input integral image
 257     __global float * det,      // output Determinant
 258     __global float * trace,    // output trace
 259     int det_step,     // the step of det in bytes
 260     int trace_step,   // the step of trace in bytes
 261     int c_img_rows,
 262     int c_img_cols,
 263     int c_nOctaveLayers,
 264     int c_octave,
 265     int c_layer_rows,
 266     int sumTex_step
 267 )
 268 {
 269     det_step   /= sizeof(*det);
 270     trace_step /= sizeof(*trace);
 271     sumTex_step/= sizeof(uint);
 272     // Determine the indices
 273     const int gridDim_y  = get_num_groups(1) / (c_nOctaveLayers + 2);
 274     const int blockIdx_y = get_group_id(1) % gridDim_y;
 275     const int blockIdx_z = get_group_id(1) / gridDim_y;
 276
 277     const int j = get_local_id(0) + get_group_id(0) * get_local_size(0);
 278     const int i = get_local_id(1) + blockIdx_y * get_local_size(1);
 279     const int layer = blockIdx_z;
 280
 281     const int size = calcSize(c_octave, layer);
 282
 283     const int samples_i = 1 + ((c_img_rows - size) >> c_octave);
 284     const int samples_j = 1 + ((c_img_cols - size) >> c_octave);
 285
 286     // Ignore pixels where some of the kernel is outside the image
 287     const int margin = (size >> 1) >> c_octave;
 288
 289     if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
 290     {
 291         const float dx  = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
 292         const float dy  = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
 293         const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
 294
 295         det  [j + margin + det_step   * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
 296         trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
 297     }
 298 }
 299
 300
 301 ////////////////////////////////////////////////////////////////////////
 302 // NONMAX
 303
 304 __constant float c_DM[5] = {0, 0, 9, 9, 1};
 305
 306 bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int rows, int cols, int step)
 307 {
 308     float ratio = (float)size / 9.0f;
 309
 310     float d = 0;
 311
 312     int dx1 = convert_int_rte(ratio * c_DM[0]);
 313     int dy1 = convert_int_rte(ratio * c_DM[1]);
 314     int dx2 = convert_int_rte(ratio * c_DM[2]);
 315     int dy2 = convert_int_rte(ratio * c_DM[3]);
 316
 317     float t = 0;
 318
 319     t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1), rows, cols, step);
 320     t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2), rows, cols, step);
 321     t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1), rows, cols, step);
 322     t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2), rows, cols, step);
 323
 324     d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
 325
 326     return (d >= 0.5f);
 327 }
 328
 329 // Non-maximal suppression to further filtering the candidates from previous step
 330 __kernel
 331 void icvFindMaximaInLayer_withmask(
 332     __global const float * det,
 333     __global const float * trace,
 334     __global int4 * maxPosBuffer,
 335     volatile __global int* maxCounter,
 336     int counter_offset,
 337     int det_step,     // the step of det in bytes
 338     int trace_step,   // the step of trace in bytes
 339     int c_img_rows,
 340     int c_img_cols,
 341     int c_nOctaveLayers,
 342     int c_octave,
 343     int c_layer_rows,
 344     int c_layer_cols,
 345     int c_max_candidates,
 346     float c_hessianThreshold,
 347     IMAGE_INT32 maskSumTex,
 348     int mask_step
 349 )
 350 {
 351     volatile __local  float N9[768]; // threads.x * threads.y * 3
 352
 353     det_step   /= sizeof(*det);
 354     trace_step /= sizeof(*trace);
 355     maxCounter += counter_offset;
 356     mask_step  /= sizeof(uint);
 357
 358     // Determine the indices
 359     const int gridDim_y  = get_num_groups(1) / c_nOctaveLayers;
 360     const int blockIdx_y = get_group_id(1)   % gridDim_y;
 361     const int blockIdx_z = get_group_id(1)   / gridDim_y;
 362
 363     const int layer = blockIdx_z + 1;
 364
 365     const int size = calcSize(c_octave, layer);
 366
 367     // Ignore pixels without a 3x3x3 neighbourhood in the layer above
 368     const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
 369
 370     const int j = get_local_id(0) + get_group_id(0) * (get_local_size(0) - 2) + margin - 1;
 371     const int i = get_local_id(1) + blockIdx_y * (get_local_size(1) - 2) + margin - 1;
 372
 373     // Is this thread within the hessian buffer?
 374     const int zoff = get_local_size(0) * get_local_size(1);
 375     const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
 376     N9[localLin - zoff] =
 377         det[det_step *
 378             (c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y
 379             + min(max(j, 0), c_img_cols - 1)];                            // x
 380     N9[localLin       ] =
 381         det[det_step *
 382             (c_layer_rows * (layer    ) + min(max(i, 0), c_img_rows - 1)) // y
 383             + min(max(j, 0), c_img_cols - 1)];                            // x
 384     N9[localLin + zoff] =
 385         det[det_step *
 386             (c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y
 387             + min(max(j, 0), c_img_cols - 1)];                            // x
 388
 389     barrier(CLK_LOCAL_MEM_FENCE);
 390
 391     if (i < c_layer_rows - margin
 392             && j < c_layer_cols - margin
 393             && get_local_id(0) > 0
 394             && get_local_id(0) < get_local_size(0) - 1
 395             && get_local_id(1) > 0
 396             && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
 397        )
 398     {
 399         float val0 = N9[localLin];
 400
 401         if (val0 > c_hessianThreshold)
 402         {
 403             // Coordinates for the start of the wavelet in the sum image. There
 404             // is some integer division involved, so don't try to simplify this
 405             // (cancel out sampleStep) without checking the result is the same
 406             const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
 407             const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
 408
 409             if (within_check(maskSumTex, sum_i, sum_j, size, c_img_rows, c_img_cols, mask_step))
 410             {
 411                 // Check to see if we have a max (in its 26 neighbours)
 412                 const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff]
 413                                      &&                   val0 > N9[localLin     - get_local_size(0) - zoff]
 414                                      &&                   val0 > N9[localLin + 1 - get_local_size(0) - zoff]
 415                                      &&                   val0 > N9[localLin - 1                     - zoff]
 416                                      &&                   val0 > N9[localLin                         - zoff]
 417                                      &&                   val0 > N9[localLin + 1                     - zoff]
 418                                      &&                   val0 > N9[localLin - 1 + get_local_size(0) - zoff]
 419                                      &&                   val0 > N9[localLin     + get_local_size(0) - zoff]
 420                                      &&                   val0 > N9[localLin + 1 + get_local_size(0) - zoff]
 421
 422                                      &&                   val0 > N9[localLin - 1 - get_local_size(0)]
 423                                      &&                   val0 > N9[localLin     - get_local_size(0)]
 424                                      &&                   val0 > N9[localLin + 1 - get_local_size(0)]
 425                                      &&                   val0 > N9[localLin - 1                    ]
 426                                      &&                   val0 > N9[localLin + 1                    ]
 427                                      &&                   val0 > N9[localLin - 1 + get_local_size(0)]
 428                                      &&                   val0 > N9[localLin     + get_local_size(0)]
 429                                      &&                   val0 > N9[localLin + 1 + get_local_size(0)]
 430
 431                                      &&                   val0 > N9[localLin - 1 - get_local_size(0) + zoff]
 432                                      &&                   val0 > N9[localLin     - get_local_size(0) + zoff]
 433                                      &&                   val0 > N9[localLin + 1 - get_local_size(0) + zoff]
 434                                      &&                   val0 > N9[localLin - 1                     + zoff]
 435                                      &&                   val0 > N9[localLin                         + zoff]
 436                                      &&                   val0 > N9[localLin + 1                     + zoff]
 437                                      &&                   val0 > N9[localLin - 1 + get_local_size(0) + zoff]
 438                                      &&                   val0 > N9[localLin     + get_local_size(0) + zoff]
 439                                      &&                   val0 > N9[localLin + 1 + get_local_size(0) + zoff]
 440                                      ;
 441
 442                 if(condmax)
 443                 {
 444                     int ind = atomic_inc(maxCounter);
 445
 446                     if (ind < c_max_candidates)
 447                     {
 448                         const int laplacian = (int) copysign(1.0f, trace[trace_step* (layer * c_layer_rows + i) + j]);
 449
 450                         maxPosBuffer[ind] = (int4)(j, i, layer, laplacian);
 451                     }
 452                 }
 453             }
 454         }
 455     }
 456 }
 457
 458 __kernel
 459 void icvFindMaximaInLayer(
 460     __global float * det,
 461     __global float * trace,
 462     __global int4 * maxPosBuffer,
 463     volatile __global  int* maxCounter,
 464     int counter_offset,
 465     int det_step,     // the step of det in bytes
 466     int trace_step,   // the step of trace in bytes
 467     int c_img_rows,
 468     int c_img_cols,
 469     int c_nOctaveLayers,
 470     int c_octave,
 471     int c_layer_rows,
 472     int c_layer_cols,
 473     int c_max_candidates,
 474     float c_hessianThreshold
 475 )
 476 {
 477     volatile __local  float N9[768]; // threads.x * threads.y * 3
 478
 479     det_step   /= sizeof(float);
 480     trace_step /= sizeof(float);
 481     maxCounter += counter_offset;
 482
 483     // Determine the indices
 484     const int gridDim_y  = get_num_groups(1) / c_nOctaveLayers;
 485     const int blockIdx_y = get_group_id(1)   % gridDim_y;
 486     const int blockIdx_z = get_group_id(1)   / gridDim_y;
 487
 488     const int layer = blockIdx_z + 1;
 489
 490     const int size = calcSize(c_octave, layer);
 491
 492     // Ignore pixels without a 3x3x3 neighbourhood in the layer above
 493     const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
 494
 495     const int j = get_local_id(0) + get_group_id(0) * (get_local_size(0) - 2) + margin - 1;
 496     const int i = get_local_id(1) + blockIdx_y      * (get_local_size(1) - 2) + margin - 1;
 497
 498     // Is this thread within the hessian buffer?
 499     const int zoff     = get_local_size(0) * get_local_size(1);
 500     const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
 501
 502     int l_x = min(max(j, 0), c_img_cols - 1);
 503     int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1);
 504
 505     N9[localLin - zoff] =
 506         det[det_step * (l_y - c_layer_rows) + l_x];
 507     N9[localLin       ] =
 508         det[det_step * (l_y               ) + l_x];
 509     N9[localLin + zoff] =
 510         det[det_step * (l_y + c_layer_rows) + l_x];
 511     barrier(CLK_LOCAL_MEM_FENCE);
 512
 513     if (i < c_layer_rows - margin
 514             && j < c_layer_cols - margin
 515             && get_local_id(0) > 0
 516             && get_local_id(0) < get_local_size(0) - 1
 517             && get_local_id(1) > 0
 518             && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
 519        )
 520     {
 521         float val0 = N9[localLin];
 522         if (val0 > c_hessianThreshold)
 523         {
 524             // Coordinates for the start of the wavelet in the sum image. There
 525             // is some integer division involved, so don't try to simplify this
 526             // (cancel out sampleStep) without checking the result is the same
 527
 528             // Check to see if we have a max (in its 26 neighbours)
 529             const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff]
 530                                  &&                   val0 > N9[localLin     - get_local_size(0) - zoff]
 531                                  &&                   val0 > N9[localLin + 1 - get_local_size(0) - zoff]
 532                                  &&                   val0 > N9[localLin - 1                     - zoff]
 533                                  &&                   val0 > N9[localLin                         - zoff]
 534                                  &&                   val0 > N9[localLin + 1                     - zoff]
 535                                  &&                   val0 > N9[localLin - 1 + get_local_size(0) - zoff]
 536                                  &&                   val0 > N9[localLin     + get_local_size(0) - zoff]
 537                                  &&                   val0 > N9[localLin + 1 + get_local_size(0) - zoff]
 538
 539                                  &&                   val0 > N9[localLin - 1 - get_local_size(0)]
 540                                  &&                   val0 > N9[localLin     - get_local_size(0)]
 541                                  &&                   val0 > N9[localLin + 1 - get_local_size(0)]
 542                                  &&                   val0 > N9[localLin - 1                    ]
 543                                  &&                   val0 > N9[localLin + 1                    ]
 544                                  &&                   val0 > N9[localLin - 1 + get_local_size(0)]
 545                                  &&                   val0 > N9[localLin     + get_local_size(0)]
 546                                  &&                   val0 > N9[localLin + 1 + get_local_size(0)]
 547
 548                                  &&                   val0 > N9[localLin - 1 - get_local_size(0) + zoff]
 549                                  &&                   val0 > N9[localLin     - get_local_size(0) + zoff]
 550                                  &&                   val0 > N9[localLin + 1 - get_local_size(0) + zoff]
 551                                  &&                   val0 > N9[localLin - 1                     + zoff]
 552                                  &&                   val0 > N9[localLin                         + zoff]
 553                                  &&                   val0 > N9[localLin + 1                     + zoff]
 554                                  &&                   val0 > N9[localLin - 1 + get_local_size(0) + zoff]
 555                                  &&                   val0 > N9[localLin     + get_local_size(0) + zoff]
 556                                  &&                   val0 > N9[localLin + 1 + get_local_size(0) + zoff]
 557                                  ;
 558
 559             if(condmax)
 560             {
 561                 int ind = atomic_inc(maxCounter);
 562
 563                 if (ind < c_max_candidates)
 564                 {
 565                     const int laplacian = (int) copysign(1.0f, trace[trace_step* (layer * c_layer_rows + i) + j]);
 566
 567                     maxPosBuffer[ind] = (int4)(j, i, layer, laplacian);
 568                 }
 569             }
 570         }
 571     }
 572 }
 573
 574 // solve 3x3 linear system Ax=b for floating point input
 575 inline bool solve3x3_float(volatile __local  const float4 *A, volatile __local  const float *b, volatile __local  float *x)
 576 {
 577     float det = A[0].x * (A[1].y * A[2].z - A[1].z * A[2].y)
 578                 - A[0].y * (A[1].x * A[2].z - A[1].z * A[2].x)
 579                 + A[0].z * (A[1].x * A[2].y - A[1].y * A[2].x);
 580
 581     if (det != 0)
 582     {
 583         F invdet = 1.0 / det;
 584
 585         x[0] = invdet *
 586                (b[0]    * (A[1].y * A[2].z - A[1].z * A[2].y) -
 587                 A[0].y * (b[1]    * A[2].z - A[1].z * b[2]   ) +
 588                 A[0].z * (b[1]    * A[2].y - A[1].y * b[2]   ));
 589
 590         x[1] = invdet *
 591                (A[0].x * (b[1]    * A[2].z - A[1].z * b[2]   ) -
 592                 b[0]    * (A[1].x * A[2].z - A[1].z * A[2].x) +
 593                 A[0].z * (A[1].x * b[2]    - b[1]    * A[2].x));
 594
 595         x[2] = invdet *
 596                (A[0].x * (A[1].y * b[2]    - b[1]    * A[2].y) -
 597                 A[0].y * (A[1].x * b[2]    - b[1]    * A[2].x) +
 598                 b[0]    * (A[1].x * A[2].y - A[1].y * A[2].x));
 599
 600         return true;
 601     }
 602     return false;
 603 }
 604
 605 #define X_ROW          0
 606 #define Y_ROW          1
 607 #define LAPLACIAN_ROW  2
 608 #define OCTAVE_ROW     3
 609 #define SIZE_ROW       4
 610 #define ANGLE_ROW      5
 611 #define HESSIAN_ROW    6
 612 #define ROWS_COUNT     7
 613
 614 ////////////////////////////////////////////////////////////////////////
 615 // INTERPOLATION
 616 __kernel
 617 void icvInterpolateKeypoint(
 618     __global const float * det,
 619     __global const int4 * maxPosBuffer,
 620     __global float * keypoints,
 621     volatile __global  int * featureCounter,
 622     int det_step,
 623     int keypoints_step,
 624     int c_img_rows,
 625     int c_img_cols,
 626     int c_octave,
 627     int c_layer_rows,
 628     int c_max_features
 629 )
 630 {
 631     det_step /= sizeof(*det);
 632     keypoints_step /= sizeof(*keypoints);
 633     __global float * featureX       = keypoints + X_ROW * keypoints_step;
 634     __global float * featureY       = keypoints + Y_ROW * keypoints_step;
 635     __global int * featureLaplacian = (__global int *)keypoints + LAPLACIAN_ROW * keypoints_step;
 636     __global int * featureOctave    = (__global int *)keypoints + OCTAVE_ROW * keypoints_step;
 637     __global float * featureSize    = keypoints + SIZE_ROW * keypoints_step;
 638     __global float * featureHessian = keypoints + HESSIAN_ROW * keypoints_step;
 639
 640     const int4 maxPos = maxPosBuffer[get_group_id(0)];
 641
 642     const int j = maxPos.x - 1 + get_local_id(0);
 643     const int i = maxPos.y - 1 + get_local_id(1);
 644     const int layer = maxPos.z - 1 + get_local_id(2);
 645
 646     volatile __local  float N9[3][3][3];
 647
 648     N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
 649         det[det_step * (c_layer_rows * layer + i) + j];
 650     barrier(CLK_LOCAL_MEM_FENCE);
 651
 652     if (get_local_id(0) == 0 && get_local_id(1) == 0 && get_local_id(2) == 0)
 653     {
 654         volatile __local  float dD[3];
 655
 656         //dx
 657         dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);
 658         //dy
 659         dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);
 660         //ds
 661         dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);
 662
 663         volatile __local  float4 H[3];
 664
 665         //dxx
 666         H[0].x = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
 667         //dxy
 668         H[0].y= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);
 669         //dxs
 670         H[0].z= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);
 671         //dyx = dxy
 672         H[1].x = H[0].y;
 673         //dyy
 674         H[1].y = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];
 675         //dys
 676         H[1].z= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);
 677         //dsx = dxs
 678         H[2].x = H[0].z;
 679         //dsy = dys
 680         H[2].y = H[1].z;
 681         //dss
 682         H[2].z = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
 683
 684         volatile __local  float x[3];
 685
 686         if (solve3x3_float(H, dD, x))
 687         {
 688             if (fabs(x[0]) <= 1.f && fabs(x[1]) <= 1.f && fabs(x[2]) <= 1.f)
 689             {
 690                 // if the step is within the interpolation region, perform it
 691
 692                 const int size = calcSize(c_octave, maxPos.z);
 693
 694                 const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;
 695                 const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;
 696
 697                 const float center_i = sum_i + (float)(size - 1) / 2;
 698                 const float center_j = sum_j + (float)(size - 1) / 2;
 699
 700                 const float px = center_j + x[0] * (1 << c_octave);
 701                 const float py = center_i + x[1] * (1 << c_octave);
 702
 703                 const int ds = size - calcSize(c_octave, maxPos.z - 1);
 704                 const float psize = round(size + x[2] * ds);
 705
 706                 /* The sampling intervals and wavelet sized for selecting an orientation
 707                 and building the keypoint descriptor are defined relative to 's' */
 708                 const float s = psize * 1.2f / 9.0f;
 709
 710                 /* To find the dominant orientation, the gradients in x and y are
 711                 sampled in a circle of radius 6s using wavelets of size 4s.
 712                 We ensure the gradient wavelet size is even to ensure the
 713                 wavelet pattern is balanced and symmetric around its center */
 714                 const int grad_wav_size = 2 * convert_int_rte(2.0f * s);
 715
 716                 // check when grad_wav_size is too big
 717                 if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
 718                 {
 719                     // Get a new feature index.
 720                     int ind = atomic_inc(featureCounter);
 721
 722                     if (ind < c_max_features)
 723                     {
 724                         featureX[ind] = px;
 725                         featureY[ind] = py;
 726                         featureLaplacian[ind] = maxPos.w;
 727                         featureOctave[ind] = c_octave;
 728                         featureSize[ind] = psize;
 729                         featureHessian[ind] = N9[1][1][1];
 730                     }
 731                 } // grad_wav_size check
 732             } // If the subpixel interpolation worked
 733         }
 734     } // If this is thread 0.
 735 }
 736
 737 ////////////////////////////////////////////////////////////////////////
 738 // Orientation
 739
 740 #define ORI_SEARCH_INC 5
 741 #define ORI_WIN        60
 742 #define ORI_SAMPLES    113
 743
 744 __constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
 745 __constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
 746 __constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
 747                                         0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
 748                                         0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
 749                                         0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
 750                                         0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
 751                                         0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
 752                                         0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
 753                                         0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
 754                                         0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
 755                                         0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
 756                                         0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
 757                                         0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
 758                                         0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
 759                                         0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
 760                                         0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f,
 761                                         0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
 762                                         0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
 763                                         0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
 764                                         0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f,
 765                                         0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
 766                                         0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
 767                                         0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f,
 768                                         0.001707611023448408f, 0.001455130288377404f
 769                                        };
 770
 771 __constant float2 c_NX[5] = { (float2)(0, 2), (float2)(0, 0), (float2)(2, 4), (float2)(4, 4), (float2)(-1, 1) };
 772 __constant float2 c_NY[5] = { (float2)(0, 0), (float2)(0, 2), (float2)(4, 4), (float2)(2, 4), (float2)(1, -1) };
 773
 774 void reduce_32_sum(volatile __local  float * data, volatile float* partial_reduction, int tid)
 775 {
 776 #define op(A, B) (*A)+(B)
 777     data[tid] = *partial_reduction;
 778     barrier(CLK_LOCAL_MEM_FENCE);
 779 #ifndef WAVE_SIZE
 780 #define WAVE_SIZE 1
 781 #endif
 782     if (tid < 16)
 783     {
 784         data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]);
 785 #if WAVE_SIZE < 16
 786     }
 787     barrier(CLK_LOCAL_MEM_FENCE);
 788     if (tid < 8)
 789     {
 790 #endif
 791         data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8]);
 792 #if WAVE_SIZE < 8
 793     }
 794     barrier(CLK_LOCAL_MEM_FENCE);
 795     if (tid < 4)
 796     {
 797 #endif
 798         data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4]);
 799 #if WAVE_SIZE < 4
 800     }
 801     barrier(CLK_LOCAL_MEM_FENCE);
 802     if (tid < 2)
 803     {
 804 #endif
 805         data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]);
 806 #if WAVE_SIZE < 2
 807     }
 808     barrier(CLK_LOCAL_MEM_FENCE);
 809     if (tid < 1)
 810     {
 811 #endif
 812         data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]);
 813     }
 814 #undef WAVE_SIZE
 815 #undef op
 816 }
 817
 818 __kernel
 819 void icvCalcOrientation(
 820     IMAGE_INT32 sumTex,
 821     __global float * keypoints,
 822     int keypoints_step,
 823     int c_img_rows,
 824     int c_img_cols,
 825     int sum_step
 826 )
 827 {
 828     keypoints_step /= sizeof(*keypoints);
 829     sum_step       /= sizeof(uint);
 830     __global float* featureX    = keypoints + X_ROW * keypoints_step;
 831     __global float* featureY    = keypoints + Y_ROW * keypoints_step;
 832     __global float* featureSize = keypoints + SIZE_ROW * keypoints_step;
 833     __global float* featureDir  = keypoints + ANGLE_ROW * keypoints_step;
 834
 835
 836     volatile __local  float s_X[128];
 837     volatile __local  float s_Y[128];
 838     volatile __local  float s_angle[128];
 839
 840     volatile __local  float s_sumx[32 * 4];
 841     volatile __local  float s_sumy[32 * 4];
 842
 843     /* The sampling intervals and wavelet sized for selecting an orientation
 844     and building the keypoint descriptor are defined relative to 's' */
 845     const float s = featureSize[get_group_id(0)] * 1.2f / 9.0f;
 846
 847
 848     /* To find the dominant orientation, the gradients in x and y are
 849     sampled in a circle of radius 6s using wavelets of size 4s.
 850     We ensure the gradient wavelet size is even to ensure the
 851     wavelet pattern is balanced and symmetric around its center */
 852     const int grad_wav_size = 2 * convert_int_rte(2.0f * s);
 853
 854     // check when grad_wav_size is too big
 855     if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size)
 856         return;
 857
 858     // Calc X, Y, angle and store it to shared memory
 859     const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
 860
 861     float X = 0.0f, Y = 0.0f, angle = 0.0f;
 862
 863     if (tid < ORI_SAMPLES)
 864     {
 865         const float margin = (float)(grad_wav_size - 1) / 2.0f;
 866         const int x = convert_int_rte(featureX[get_group_id(0)] + c_aptX[tid] * s - margin);
 867         const int y = convert_int_rte(featureY[get_group_id(0)] + c_aptY[tid] * s - margin);
 868
 869         if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
 870                 x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
 871         {
 872             X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
 873             Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
 874
 875             angle = atan2(Y, X);
 876
 877             if (angle < 0)
 878                 angle += 2.0f * CV_PI_F;
 879             angle *= 180.0f / CV_PI_F;
 880
 881         }
 882     }
 883     s_X[tid] = X;
 884     s_Y[tid] = Y;
 885     s_angle[tid] = angle;
 886     barrier(CLK_LOCAL_MEM_FENCE);
 887
 888     float bestx = 0, besty = 0, best_mod = 0;
 889
 890 #pragma unroll
 891     for (int i = 0; i < 18; ++i)
 892     {
 893         const int dir = (i * 4 + get_local_id(1)) * ORI_SEARCH_INC;
 894
 895         volatile float sumx = 0.0f, sumy = 0.0f;
 896         int d = abs(convert_int_rte(s_angle[get_local_id(0)]) - dir);
 897         if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
 898         {
 899             sumx = s_X[get_local_id(0)];
 900             sumy = s_Y[get_local_id(0)];
 901         }
 902         d = abs(convert_int_rte(s_angle[get_local_id(0) + 32]) - dir);
 903         if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
 904         {
 905             sumx += s_X[get_local_id(0) + 32];
 906             sumy += s_Y[get_local_id(0) + 32];
 907         }
 908         d = abs(convert_int_rte(s_angle[get_local_id(0) + 64]) - dir);
 909         if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
 910         {
 911             sumx += s_X[get_local_id(0) + 64];
 912             sumy += s_Y[get_local_id(0) + 64];
 913         }
 914         d = abs(convert_int_rte(s_angle[get_local_id(0) + 96]) - dir);
 915         if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
 916         {
 917             sumx += s_X[get_local_id(0) + 96];
 918             sumy += s_Y[get_local_id(0) + 96];
 919         }
 920         reduce_32_sum(s_sumx + get_local_id(1) * 32, &sumx, get_local_id(0));
 921         reduce_32_sum(s_sumy + get_local_id(1) * 32, &sumy, get_local_id(0));
 922
 923         const float temp_mod = sumx * sumx + sumy * sumy;
 924         if (temp_mod > best_mod)
 925         {
 926             best_mod = temp_mod;
 927             bestx = sumx;
 928             besty = sumy;
 929         }
 930         barrier(CLK_LOCAL_MEM_FENCE);
 931     }
 932     if (get_local_id(0) == 0)
 933     {
 934         s_X[get_local_id(1)] = bestx;
 935         s_Y[get_local_id(1)] = besty;
 936         s_angle[get_local_id(1)] = best_mod;
 937     }
 938     barrier(CLK_LOCAL_MEM_FENCE);
 939
 940     if (get_local_id(1) == 0 && get_local_id(0) == 0)
 941     {
 942         int bestIdx = 0;
 943
 944         if (s_angle[1] > s_angle[bestIdx])
 945             bestIdx = 1;
 946         if (s_angle[2] > s_angle[bestIdx])
 947             bestIdx = 2;
 948         if (s_angle[3] > s_angle[bestIdx])
 949             bestIdx = 3;
 950
 951         float kp_dir = atan2(s_Y[bestIdx], s_X[bestIdx]);
 952         if (kp_dir < 0)
 953             kp_dir += 2.0f * CV_PI_F;
 954         kp_dir *= 180.0f / CV_PI_F;
 955
 956         kp_dir = 360.0f - kp_dir;
 957         if (fabs(kp_dir - 360.f) < FLT_EPSILON)
 958             kp_dir = 0.f;
 959
 960         featureDir[get_group_id(0)] = kp_dir;
 961     }
 962 }
 963
 964
 965 __kernel
 966 void icvSetUpright(
 967     __global float * keypoints,
 968     int keypoints_step,
 969     int nFeatures
 970 )
 971 {
 972     keypoints_step /= sizeof(*keypoints);
 973     __global float* featureDir  = keypoints + ANGLE_ROW * keypoints_step;
 974
 975     if(get_global_id(0) <= nFeatures)
 976     {
 977         featureDir[get_global_id(0)] = 270.0f;
 978     }
 979 }
 980
 981
 982 #undef ORI_SEARCH_INC
 983 #undef ORI_WIN
 984 #undef ORI_SAMPLES
 985
 986 ////////////////////////////////////////////////////////////////////////
 987 // Descriptors
 988
 989 #define PATCH_SZ 20
 990
 991 __constant float c_DW[PATCH_SZ * PATCH_SZ] =
 992 {
 993     3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f,
 994     8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
 995     1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
 996     3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
 997     5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
 998     9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
 999     0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
1000     0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
1001     0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
1002     0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
1003     0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
1004     0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
1005     0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
1006     0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
1007     9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
1008     5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
1009     3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
1010     1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
1011     8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
1012     3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
1013 };
1014
1015 // utility for linear filter
1016 inline uchar readerGet(
1017     IMAGE_INT8 src,
1018     const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
1019     int i, int j, int rows, int cols, int elemPerRow
1020 )
1021 {
1022     float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
1023     float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
1024     return read_imgTex(src, sampler, (float2)(pixel_x, pixel_y), rows, cols, elemPerRow);
1025 }
1026
1027 inline float linearFilter(
1028     IMAGE_INT8 src,
1029     const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
1030     float y, float x, int rows, int cols, int elemPerRow
1031 )
1032 {
1033     x -= 0.5f;
1034     y -= 0.5f;
1035
1036     float out = 0.0f;
1037
1038     const int x1 = convert_int_rtn(x);
1039     const int y1 = convert_int_rtn(y);
1040     const int x2 = x1 + 1;
1041     const int y2 = y1 + 1;
1042
1043     uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1, rows, cols, elemPerRow);
1044     out = out + src_reg * ((x2 - x) * (y2 - y));
1045
1046     src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2, rows, cols, elemPerRow);
1047     out = out + src_reg * ((x - x1) * (y2 - y));
1048
1049     src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1, rows, cols, elemPerRow);
1050     out = out + src_reg * ((x2 - x) * (y - y1));
1051
1052     src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2, rows, cols, elemPerRow);
1053     out = out + src_reg * ((x - x1) * (y - y1));
1054
1055     return out;
1056 }
1057
1058 void calc_dx_dy(
1059     IMAGE_INT8 imgTex,
1060     volatile __local  float *s_dx_bin,
1061     volatile __local  float *s_dy_bin,
1062     volatile __local  float *s_PATCH,
1063     __global const float* featureX,
1064     __global const float* featureY,
1065     __global const float* featureSize,
1066     __global const float* featureDir,
1067     int rows,
1068     int cols,
1069     int elemPerRow
1070 )
1071 {
1072     const float centerX = featureX[get_group_id(0)];
1073     const float centerY = featureY[get_group_id(0)];
1074     const float size = featureSize[get_group_id(0)];
1075     float descriptor_dir = 360.0f - featureDir[get_group_id(0)];
1076     if(fabs(descriptor_dir - 360.0f) < FLT_EPSILON)
1077     {
1078         descriptor_dir = 0.0f;
1079     }
1080
1081     descriptor_dir *= (float)(CV_PI_F / 180.0f);
1082
1083     /* The sampling intervals and wavelet sized for selecting an orientation
1084     and building the keypoint descriptor are defined relative to 's' */
1085     const float s = size * 1.2f / 9.0f;
1086
1087     /* Extract a window of pixels around the keypoint of size 20s */
1088     const int win_size = (int)((PATCH_SZ + 1) * s);
1089
1090     float sin_dir;
1091     float cos_dir;
1092     sin_dir = sincos(descriptor_dir, &cos_dir);
1093
1094     /* Nearest neighbour version (faster) */
1095     const float win_offset = -(float)(win_size - 1) / 2;
1096
1097     // Compute sampling points
1098     // since grids are 2D, need to compute xBlock and yBlock indices
1099     const int xBlock = (get_group_id(1) & 3);  // get_group_id(1) % 4
1100     const int yBlock = (get_group_id(1) >> 2); // floor(get_group_id(1)/4)
1101     const int xIndex = xBlock * 5 + get_local_id(0);
1102     const int yIndex = yBlock * 5 + get_local_id(1);
1103
1104     const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
1105     const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
1106
1107     s_PATCH[get_local_id(1) * 6 + get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow);
1108
1109     barrier(CLK_LOCAL_MEM_FENCE);
1110
1111     if (get_local_id(0) < 5 && get_local_id(1) < 5)
1112     {
1113         const int tid = get_local_id(1) * 5 + get_local_id(0);
1114
1115         const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
1116
1117         const float vx = (
1118                              s_PATCH[      get_local_id(1) * 6 + get_local_id(0) + 1] -
1119                              s_PATCH[      get_local_id(1) * 6 + get_local_id(0)    ] +
1120                              s_PATCH[(get_local_id(1) + 1) * 6 + get_local_id(0) + 1] -
1121                              s_PATCH[(get_local_id(1) + 1) * 6 + get_local_id(0)    ])
1122                          * dw;
1123         const float vy = (
1124                              s_PATCH[(get_local_id(1) + 1) * 6 + get_local_id(0)    ] -
1125                              s_PATCH[      get_local_id(1) * 6 + get_local_id(0)    ] +
1126                              s_PATCH[(get_local_id(1) + 1) * 6 + get_local_id(0) + 1] -
1127                              s_PATCH[      get_local_id(1) * 6 + get_local_id(0) + 1])
1128                          * dw;
1129         s_dx_bin[tid] = vx;
1130         s_dy_bin[tid] = vy;
1131     }
1132 }
1133 void reduce_sum25(
1134     volatile __local  float* sdata1,
1135     volatile __local  float* sdata2,
1136     volatile __local  float* sdata3,
1137     volatile __local  float* sdata4,
1138     int tid
1139 )
1140 {
1141 #ifndef WAVE_SIZE
1142 #define WAVE_SIZE 1
1143 #endif
1144     // first step is to reduce from 25 to 16
1145     if (tid < 9)
1146     {
1147         sdata1[tid] += sdata1[tid + 16];
1148         sdata2[tid] += sdata2[tid + 16];
1149         sdata3[tid] += sdata3[tid + 16];
1150         sdata4[tid] += sdata4[tid + 16];
1151 #if WAVE_SIZE < 16
1152     }
1153     barrier(CLK_LOCAL_MEM_FENCE);
1154     if (tid < 8)
1155     {
1156 #endif
1157         sdata1[tid] += sdata1[tid + 8];
1158         sdata2[tid] += sdata2[tid + 8];
1159         sdata3[tid] += sdata3[tid + 8];
1160         sdata4[tid] += sdata4[tid + 8];
1161 #if WAVE_SIZE < 8
1162     }
1163     barrier(CLK_LOCAL_MEM_FENCE);
1164     if (tid < 4)
1165     {
1166 #endif
1167         sdata1[tid] += sdata1[tid + 4];
1168         sdata2[tid] += sdata2[tid + 4];
1169         sdata3[tid] += sdata3[tid + 4];
1170         sdata4[tid] += sdata4[tid + 4];
1171 #if WAVE_SIZE < 4
1172     }
1173     barrier(CLK_LOCAL_MEM_FENCE);
1174     if (tid < 2)
1175     {
1176 #endif
1177         sdata1[tid] += sdata1[tid + 2];
1178         sdata2[tid] += sdata2[tid + 2];
1179         sdata3[tid] += sdata3[tid + 2];
1180         sdata4[tid] += sdata4[tid + 2];
1181 #if WAVE_SIZE < 2
1182     }
1183     barrier(CLK_LOCAL_MEM_FENCE);
1184     if (tid < 1)
1185     {
1186 #endif
1187         sdata1[tid] += sdata1[tid + 1];
1188         sdata2[tid] += sdata2[tid + 1];
1189         sdata3[tid] += sdata3[tid + 1];
1190         sdata4[tid] += sdata4[tid + 1];
1191     }
1192 #undef WAVE_SIZE
1193 }
1194
1195 __kernel
1196 void compute_descriptors64(
1197     IMAGE_INT8 imgTex,
1198     __global float * descriptors,
1199     __global const float * keypoints,
1200     int descriptors_step,
1201     int keypoints_step,
1202     int rows,
1203     int cols,
1204     int img_step
1205 )
1206 {
1207     descriptors_step /= sizeof(float);
1208     keypoints_step   /= sizeof(float);
1209     __global const float * featureX    = keypoints + X_ROW * keypoints_step;
1210     __global const float * featureY    = keypoints + Y_ROW * keypoints_step;
1211     __global const float * featureSize = keypoints + SIZE_ROW * keypoints_step;
1212     __global const float * featureDir  = keypoints + ANGLE_ROW * keypoints_step;
1213
1214     // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
1215     volatile __local  float sdx[25];
1216     volatile __local  float sdy[25];
1217     volatile __local  float sdxabs[25];
1218     volatile __local  float sdyabs[25];
1219     volatile __local  float s_PATCH[6*6];
1220
1221     calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
1222     barrier(CLK_LOCAL_MEM_FENCE);
1223
1224     const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
1225
1226     if (tid < 25)
1227     {
1228         sdxabs[tid] = fabs(sdx[tid]); // |dx| array
1229         sdyabs[tid] = fabs(sdy[tid]); // |dy| array
1230     }
1231     barrier(CLK_LOCAL_MEM_FENCE);
1232
1233     reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
1234
1235     barrier(CLK_LOCAL_MEM_FENCE);
1236     if (tid < 25)
1237     {
1238         __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
1239
1240         // write dx, dy, |dx|, |dy|
1241         if (tid == 0)
1242         {
1243             descriptors_block[0] = sdx[0];
1244             descriptors_block[1] = sdy[0];
1245             descriptors_block[2] = sdxabs[0];
1246             descriptors_block[3] = sdyabs[0];
1247         }
1248     }
1249 }
1250 __kernel
1251 void compute_descriptors128(
1252     IMAGE_INT8 imgTex,
1253     __global float * descriptors,
1254     __global float * keypoints,
1255     int descriptors_step,
1256     int keypoints_step,
1257     int rows,
1258     int cols,
1259     int img_step
1260 )
1261 {
1262     descriptors_step /= sizeof(*descriptors);
1263     keypoints_step   /= sizeof(*keypoints);
1264
1265     __global float * featureX   = keypoints + X_ROW * keypoints_step;
1266     __global float * featureY   = keypoints + Y_ROW * keypoints_step;
1267     __global float* featureSize = keypoints + SIZE_ROW * keypoints_step;
1268     __global float* featureDir  = keypoints + ANGLE_ROW * keypoints_step;
1269
1270     // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
1271     volatile __local  float sdx[25];
1272     volatile __local  float sdy[25];
1273
1274     // sum (reduce) 5x5 area response
1275     volatile __local  float sd1[25];
1276     volatile __local  float sd2[25];
1277     volatile __local  float sdabs1[25];
1278     volatile __local  float sdabs2[25];
1279     volatile __local  float s_PATCH[6*6];
1280
1281     calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
1282     barrier(CLK_LOCAL_MEM_FENCE);
1283
1284     const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
1285
1286     if (tid < 25)
1287     {
1288         if (sdy[tid] >= 0)
1289         {
1290             sd1[tid] = sdx[tid];
1291             sdabs1[tid] = fabs(sdx[tid]);
1292             sd2[tid] = 0;
1293             sdabs2[tid] = 0;
1294         }
1295         else
1296         {
1297             sd1[tid] = 0;
1298             sdabs1[tid] = 0;
1299             sd2[tid] = sdx[tid];
1300             sdabs2[tid] = fabs(sdx[tid]);
1301         }
1302     }
1303     barrier(CLK_LOCAL_MEM_FENCE);
1304
1305     reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
1306     barrier(CLK_LOCAL_MEM_FENCE);
1307
1308     __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
1309     if (tid < 25)
1310     {
1311         // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
1312         if (tid == 0)
1313         {
1314             descriptors_block[0] = sd1[0];
1315             descriptors_block[1] = sdabs1[0];
1316             descriptors_block[2] = sd2[0];
1317             descriptors_block[3] = sdabs2[0];
1318         }
1319
1320         if (sdx[tid] >= 0)
1321         {
1322             sd1[tid] = sdy[tid];
1323             sdabs1[tid] = fabs(sdy[tid]);
1324             sd2[tid] = 0;
1325             sdabs2[tid] = 0;
1326         }
1327         else
1328         {
1329             sd1[tid] = 0;
1330             sdabs1[tid] = 0;
1331             sd2[tid] = sdy[tid];
1332             sdabs2[tid] = fabs(sdy[tid]);
1333         }
1334     }
1335     barrier(CLK_LOCAL_MEM_FENCE);
1336     reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
1337     barrier(CLK_LOCAL_MEM_FENCE);
1338
1339     if (tid < 25)
1340     {
1341         // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
1342         if (tid == 0)
1343         {
1344             descriptors_block[4] = sd1[0];
1345             descriptors_block[5] = sdabs1[0];
1346             descriptors_block[6] = sd2[0];
1347             descriptors_block[7] = sdabs2[0];
1348         }
1349     }
1350 }
1351
1352 void reduce_sum128(volatile __local  float* smem, int tid)
1353 {
1354 #ifndef WAVE_SIZE
1355 #define WAVE_SIZE 1
1356 #endif
1357
1358     if (tid < 64)
1359     {
1360         smem[tid] += smem[tid + 64];
1361 #if WAVE_SIZE < 64
1362     }
1363     barrier(CLK_LOCAL_MEM_FENCE);
1364     if (tid < 32)
1365     {
1366 #endif
1367         smem[tid] += smem[tid + 32];
1368 #if WAVE_SIZE < 32
1369     }
1370     barrier(CLK_LOCAL_MEM_FENCE);
1371     if (tid < 16)
1372     {
1373 #endif
1374         smem[tid] += smem[tid + 16];
1375 #if WAVE_SIZE < 16
1376     }
1377     barrier(CLK_LOCAL_MEM_FENCE);
1378     if (tid < 8)
1379     {
1380 #endif
1381         smem[tid] += smem[tid + 8];
1382 #if WAVE_SIZE < 8
1383     }
1384     barrier(CLK_LOCAL_MEM_FENCE);
1385     if (tid < 4)
1386     {
1387 #endif
1388         smem[tid] += smem[tid + 4];
1389 #if WAVE_SIZE < 4
1390     }
1391     barrier(CLK_LOCAL_MEM_FENCE);
1392     if (tid < 2)
1393     {
1394 #endif
1395         smem[tid] += smem[tid + 2];
1396 #if WAVE_SIZE < 2
1397     }
1398     barrier(CLK_LOCAL_MEM_FENCE);
1399     if (tid < 1)
1400     {
1401 #endif
1402         smem[tid] += smem[tid + 1];
1403     }
1404 }
1405
1406
1407 void reduce_sum64(volatile __local  float* smem, int tid)
1408 {
1409 #ifndef WAVE_SIZE
1410 #define WAVE_SIZE 1
1411 #endif
1412     if (tid < 32)
1413     {
1414         smem[tid] += smem[tid + 32];
1415 #if WAVE_SIZE < 32
1416     }
1417     barrier(CLK_LOCAL_MEM_FENCE);
1418     if (tid < 16)
1419     {
1420 #endif
1421         smem[tid] += smem[tid + 16];
1422 #if WAVE_SIZE < 16
1423     }
1424     barrier(CLK_LOCAL_MEM_FENCE);
1425     if (tid < 8)
1426     {
1427 #endif
1428         smem[tid] += smem[tid + 8];
1429 #if WAVE_SIZE < 8
1430     }
1431     barrier(CLK_LOCAL_MEM_FENCE);
1432     if (tid < 4)
1433     {
1434 #endif
1435         smem[tid] += smem[tid + 4];
1436 #if WAVE_SIZE < 4
1437     }
1438     barrier(CLK_LOCAL_MEM_FENCE);
1439     if (tid < 2)
1440     {
1441 #endif
1442         smem[tid] += smem[tid + 2];
1443 #if WAVE_SIZE < 2
1444     }
1445     barrier(CLK_LOCAL_MEM_FENCE);
1446     if (tid < 1)
1447     {
1448 #endif
1449         smem[tid] += smem[tid + 1];
1450     }
1451 }
1452
1453 __kernel
1454 void normalize_descriptors128(__global float * descriptors, int descriptors_step)
1455 {
1456     descriptors_step /= sizeof(*descriptors);
1457     // no need for thread ID
1458     __global float* descriptor_base = descriptors + descriptors_step * get_group_id(0);
1459
1460     // read in the unnormalized descriptor values (squared)
1461     volatile __local  float sqDesc[128];
1462     const float lookup = descriptor_base[get_local_id(0)];
1463     sqDesc[get_local_id(0)] = lookup * lookup;
1464     barrier(CLK_LOCAL_MEM_FENCE);
1465
1466     reduce_sum128(sqDesc, get_local_id(0));
1467     barrier(CLK_LOCAL_MEM_FENCE);
1468
1469     // compute length (square root)
1470     volatile __local  float len;
1471     if (get_local_id(0) == 0)
1472     {
1473         len = sqrt(sqDesc[0]);
1474     }
1475     barrier(CLK_LOCAL_MEM_FENCE);
1476
1477     // normalize and store in output
1478     descriptor_base[get_local_id(0)] = lookup / len;
1479 }
1480 __kernel
1481 void normalize_descriptors64(__global float * descriptors, int descriptors_step)
1482 {
1483     descriptors_step /= sizeof(*descriptors);
1484     // no need for thread ID
1485     __global float* descriptor_base = descriptors + descriptors_step * get_group_id(0);
1486
1487     // read in the unnormalized descriptor values (squared)
1488     volatile __local  float sqDesc[64];
1489     const float lookup = descriptor_base[get_local_id(0)];
1490     sqDesc[get_local_id(0)] = lookup * lookup;
1491     barrier(CLK_LOCAL_MEM_FENCE);
1492
1493     reduce_sum64(sqDesc, get_local_id(0));
1494     barrier(CLK_LOCAL_MEM_FENCE);
1495
1496     // compute length (square root)
1497     volatile __local  float len;
1498     if (get_local_id(0) == 0)
1499     {
1500         len = sqrt(sqDesc[0]);
1501     }
1502     barrier(CLK_LOCAL_MEM_FENCE);
1503
1504     // normalize and store in output
1505     descriptor_base[get_local_id(0)] = lookup / len;
1506 }