modules/ocl/src/opencl/knearest.cl

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Jin Ma, jin@multicorewareinc.com
  19 //
  20 // Redistribution and use in source and binary forms, with or without modification,
  21 // are permitted provided that the following conditions are met:
  22 //
  23 //   * Redistribution's of source code must retain the above copyright notice,
  24 //     this list of conditions and the following disclaimer.
  25 //
  26 //   * Redistribution's in binary form must reproduce the above copyright notice,
  27 //     this list of conditions and the following disclaimer in the documentation
  28 //     and/or other materials provided with the distribution.
  29 //
  30 //   * The name of the copyright holders may not be used to endorse or promote products
  31 //     derived from this software without specific prior written permission.
  32 //
  33 // This software is provided by the copyright holders and contributors as is and
  34 // any express or implied warranties, including, but not limited to, the implied
  35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  36 // In no event shall the Intel Corporation or contributors be liable for any direct,
  37 // indirect, incidental, special, exemplary, or consequential damages
  38 // (including, but not limited to, procurement of substitute goods or services;
  39 // loss of use, data, or profits; or business interruption) however caused
  40 // and on any theory of liability, whether in contract, strict liability,
  41 // or tort (including negligence or otherwise) arising in any way out of
  42 // the use of this software, even if advised of the possibility of such damage.
  43 //
  44 //M*/
  45
  46 #ifdef DOUBLE_SUPPORT
  47 #ifdef cl_amd_fp64
  48 #pragma OPENCL EXTENSION cl_amd_fp64:enable
  49 #elif defined (cl_khr_fp64)
  50 #pragma OPENCL EXTENSION cl_khr_fp64:enable
  51 #endif
  52 #define TYPE double
  53 #else
  54 #define TYPE float
  55 #endif
  56
  57 #define CV_SWAP(a,b,t) ((t) = (a), (a) = (b), (b) = (t))
  58 ///////////////////////////////////// find_nearest //////////////////////////////////////
  59 __kernel void knn_find_nearest(__global float* sample, int sample_row, int sample_col, int sample_step,
  60                                int k, __global float* samples_ocl, int sample_ocl_row, int sample_ocl_step,
  61                                __global float* _results, int _results_step, int _regression, int K1,
  62                                int sample_ocl_col, int nThreads, __local float* nr)
  63 {
  64     int k1 = 0;
  65     int k2 = 0;
  66
  67     bool regression = false;
  68
  69     if(_regression)
  70         regression = true;
  71
  72     TYPE inv_scale;
  73 #ifdef DOUBLE_SUPPORT
  74     inv_scale = 1.0/K1;
  75 #else
  76     inv_scale = 1.0f/K1;
  77 #endif
  78
  79     int y = get_global_id(1);
  80     int j, j1;
  81     int threadY = (y % nThreads);
  82     __local float* dd = nr + nThreads * k;
  83     if(y >= sample_row)
  84     {
  85         return;
  86     }
  87     for(j = 0; j < sample_ocl_row; j++)
  88     {
  89         TYPE sum;
  90 #ifdef DOUBLE_SUPPORT
  91         sum = 0.0;
  92 #else
  93         sum = 0.0f;
  94 #endif
  95         float si;
  96         int t, ii, ii1;
  97         for(t = 0; t < sample_col - 16; t += 16)
  98         {
  99             float16 t0 = vload16(0, sample + y * sample_step + t) - vload16(0, samples_ocl + j * sample_ocl_step + t);
 100             t0 *= t0;
 101             sum += t0.s0 + t0.s1 + t0.s2 + t0.s3 + t0.s4 + t0.s5 + t0.s6 + t0.s7 +
 102                 t0.s8 + t0.s9 + t0.sa + t0.sb + t0.sc + t0.sd + t0.se + t0.sf;
 103         }
 104
 105         for(; t < sample_col; t++)
 106         {
 107 #ifdef DOUBLE_SUPPORT
 108             double t0 = sample[y * sample_step + t] - samples_ocl[j * sample_ocl_step + t];
 109 #else
 110             float t0 = sample[y * sample_step + t] - samples_ocl[j * sample_ocl_step + t];
 111 #endif
 112             sum = sum + t0 * t0;
 113         }
 114
 115         si = (float)sum;
 116         for(ii = k1 - 1; ii >= 0; ii--)
 117         {
 118             if(as_int(si) > as_int(dd[ii * nThreads + threadY]))
 119                 break;
 120         }
 121         if(ii < k - 1)
 122         {
 123             for(ii1 = k2 - 1; ii1 > ii; ii1--)
 124             {
 125                 dd[(ii1 + 1) * nThreads + threadY] = dd[ii1 * nThreads + threadY];
 126                 nr[(ii1 + 1) * nThreads + threadY] = nr[ii1 * nThreads + threadY];
 127             }
 128
 129             dd[(ii + 1) * nThreads + threadY] = si;
 130             nr[(ii + 1) * nThreads + threadY] = samples_ocl[sample_col + j * sample_ocl_step];
 131         }
 132         k1 = (k1 + 1) < k ? (k1 + 1) : k;
 133         k2 = k1 < (k - 1) ? k1 : (k - 1);
 134     }
 135     /*! find_nearest_neighbor done!*/
 136     /*! write_results start!*/
 137     if (regression)
 138     {
 139         TYPE s;
 140 #ifdef DOUBLE_SUPPORT
 141         s = 0.0;
 142 #else
 143         s = 0.0f;
 144 #endif
 145         for(j = 0; j < K1; j++)
 146             s += nr[j * nThreads + threadY];
 147
 148         _results[y * _results_step] = (float)(s * inv_scale);
 149     }
 150     else
 151     {
 152         int prev_start = 0, best_count = 0, cur_count;
 153         float best_val;
 154
 155         for(j = K1 - 1; j > 0; j--)
 156         {
 157             bool swap_f1 = false;
 158             for(j1 = 0; j1 < j; j1++)
 159             {
 160                 if(nr[j1 * nThreads + threadY] > nr[(j1 + 1) * nThreads + threadY])
 161                 {
 162                     int t;
 163                     CV_SWAP(nr[j1 * nThreads + threadY], nr[(j1 + 1) * nThreads + threadY], t);
 164                     swap_f1 = true;
 165                 }
 166             }
 167             if(!swap_f1)
 168                 break;
 169         }
 170
 171         best_val = 0;
 172         for(j = 1; j <= K1; j++)
 173             if(j == K1 || nr[j * nThreads + threadY] != nr[(j - 1) * nThreads + threadY])
 174             {
 175                 cur_count = j - prev_start;
 176                 if(best_count < cur_count)
 177                 {
 178                     best_count = cur_count;
 179                     best_val = nr[(j - 1) * nThreads + threadY];
 180                 }
 181                 prev_start = j;
 182             }
 183             _results[y * _results_step] = best_val;
 184     }
 185     ///*! write_results done!*/
 186 }