From 933f6937475fb626133573892922232d6cc024c3 Mon Sep 17 00:00:00 2001 From: Chunseok Lee Date: Tue, 28 Sep 2021 12:54:11 +0900 Subject: [PATCH] Remove topkv2_radixsort.cl This resolves license isseu by deleting topkv2_radixsort.cl file Change-Id: I27da2c6d8e82f05d1c1359f997e7a0ff9c944326 Signed-off-by: Chunseok Lee --- .../ARMComputeEx/src/core/CL/CLKernelLibrary.cpp | 8 - .../src/core/CL/cl_kernels/topkv2_radixsort.cl | 269 --------------------- 2 files changed, 277 deletions(-) delete mode 100644 runtimes/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp index 8081256..6525fb6 100644 --- a/runtimes/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ b/runtimes/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -65,10 +65,6 @@ const std::map CLKernelLibraryEx::_kernel_program_map {"topkv2_find_first_negative", "topkv2.cl"}, {"topkv2_reorder_negatives", "topkv2.cl"}, {"topkv2_store", "topkv2.cl"}, - {"radixsort_histogram", "topkv2_radixsort.cl"}, - {"radixsort_scanhistograms", "topkv2_radixsort.cl"}, - {"radixsort_pastehistograms", "topkv2_radixsort.cl"}, - {"radixsort_reorder", "topkv2_radixsort.cl"}, {"topkv2_quicksort", "topkv2_quicksort.cl"}, {"space_to_batch_4d_nchw", "space_to_batch.cl"}, {"space_to_batch_4d_nhwc", "space_to_batch.cl"}, @@ -143,10 +139,6 @@ const std::map CLKernelLibraryEx::_program_source_map #include "./cl_kernels/topkv2.clembed" }, { - "topkv2_radixsort.cl", -#include "./cl_kernels/topkv2_radixsort.clembed" - }, - { "topkv2_quicksort.cl", #include "./cl_kernels/topkv2_quicksort.clembed" }, diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/runtimes/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl deleted file mode 100644 index f6830d2..0000000 --- a/runtimes/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl +++ /dev/null @@ -1,269 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// reference: -// https://code.google.com/archive/p/ocl-radix-sort/source/default/source -// OpenCL kernel sources for the CLRadixSort class -// the #include does not exist in OpenCL -// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr -// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html -// if you find this software usefull you can cite the following work in your reports or articles: -// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011. -// http://hal.archives-ouvertes.fr/hal-00596730 - -// Reference for floating point radix sort: -// http://www.codercorner.com/RadixSortRevisited.htm - -// compute the histogram for each radix and each virtual processor for the pass -__kernel void radixsort_histogram(__global float *in_key_buf, __global int *d_Histograms, - const int pass, __local int *loc_histo, const int n) -{ - int it = get_local_id(0); // i local number of the processor - int ig = get_global_id(0); // global number = i + g I - - int gr = get_group_id(0); // g group number - - int groups = get_num_groups(0); - int items = get_local_size(0); - - // set the local histograms to zero - for (int ir = 0; ir < _RADIX; ir++) - { - loc_histo[ir * items + it] = 0; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // range of keys that are analyzed by the work item - int size = n / groups / items; // size of the sub-list - int start = ig * size; // beginning of the sub-list - - unsigned int key; - int shortkey, k; - - // compute the index - // the computation depends on the transposition - for (int j = 0; j < size; j++) - { -#ifdef TRANSPOSE - k = groups * items * j + ig; -#else - k = j + start; -#endif - - key = *((__global unsigned int *)(in_key_buf + k)); - - // extract the group of _BITS bits of the pass - // the result is in the range 0.._RADIX-1 - shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1)); - - // increment the local histogram - loc_histo[shortkey * items + it]++; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // copy the local histogram to the global one - for (int ir = 0; ir < _RADIX; ir++) - { - d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it]; - } - - barrier(CLK_GLOBAL_MEM_FENCE); -} - -// initial transpose of the list for improving -// coalescent memory access -__kernel void transpose(const __global int *invect, __global int *outvect, const int nbcol, - const int nbrow, const __global int *inperm, __global int *outperm, - __local int *blockmat, __local int *blockperm, const int tilesize) -{ - - int i0 = get_global_id(0) * tilesize; // first row index - int j = get_global_id(1); // column index - - int jloc = get_local_id(1); // local column index - - // fill the cache - for (int iloc = 0; iloc < tilesize; iloc++) - { - int k = (i0 + iloc) * nbcol + j; // position in the matrix - blockmat[iloc * tilesize + jloc] = invect[k]; -#ifdef PERMUT - blockperm[iloc * tilesize + jloc] = inperm[k]; -#endif - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // first row index in the transpose - int j0 = get_group_id(1) * tilesize; - - // put the cache at the good place - for (int iloc = 0; iloc < tilesize; iloc++) - { - int kt = (j0 + iloc) * nbrow + i0 + jloc; // position in the transpose - outvect[kt] = blockmat[jloc * tilesize + iloc]; -#ifdef PERMUT - outperm[kt] = blockperm[jloc * tilesize + iloc]; -#endif - } -} - -// each virtual processor reorders its data using the scanned histogram -__kernel void radixsort_reorder(__global float *in_key, __global float *out_key, - __global int *d_Histograms, const int pass, - __global int *indices_in, __global int *indices_out, - __local int *loc_histo, const int n) -{ - - int it = get_local_id(0); - int ig = get_global_id(0); - - int gr = get_group_id(0); - int groups = get_num_groups(0); - int items = get_local_size(0); - - int start = ig * (n / groups / items); - int size = n / groups / items; - - // take the histogram in the cache - for (int ir = 0; ir < _RADIX; ir++) - { - loc_histo[ir * items + it] = d_Histograms[items * (ir * groups + gr) + it]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int newpos, shortkey, k, newpost; - unsigned int key; - - for (int j = 0; j < size; j++) - { -#ifdef TRANSPOSE - k = groups * items * j + ig; -#else - k = j + start; -#endif - float org_value = in_key[k]; - key = *(__global unsigned int *)(in_key + k); - shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1)); - - newpos = loc_histo[shortkey * items + it]; - -#ifdef TRANSPOSE - int ignew, jnew; - ignew = newpos / (n / groups / items); - jnew = newpos % (n / groups / items); - newpost = jnew * (groups * items) + ignew; -#else - newpost = newpos; -#endif - - // d_outKeys[newpost]= key; // killing line !!! - out_key[newpost] = org_value; - -#ifdef PERMUT - indices_out[newpost] = indices_in[k]; -#endif - - newpos++; - loc_histo[shortkey * items + it] = newpos; - } -} - -// perform a parallel prefix sum (a scan) on the local histograms -// (see Blelloch 1990) each workitem worries about two memories -// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html -__kernel void radixsort_scanhistograms(__global int *histo, __local int *temp, - __global int *globsum) -{ - int it = get_local_id(0); - int ig = get_global_id(0); - int decale = 1; - int n = get_local_size(0) * 2; - int gr = get_group_id(0); - - // load input into local memory - // up sweep phase - temp[2 * it] = histo[2 * ig]; - temp[2 * it + 1] = histo[2 * ig + 1]; - - // parallel prefix sum (algorithm of Blelloch 1990) - for (int d = n >> 1; d > 0; d >>= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - if (it < d) - { - int ai = decale * (2 * it + 1) - 1; - int bi = decale * (2 * it + 2) - 1; - temp[bi] += temp[ai]; - } - decale *= 2; - } - - // store the last element in the global sum vector - // (maybe used in the next step for constructing the global scan) - // clear the last element - if (it == 0) - { - globsum[gr] = temp[n - 1]; - temp[n - 1] = 0; - } - - // down sweep phase - for (int d = 1; d < n; d *= 2) - { - decale >>= 1; - barrier(CLK_LOCAL_MEM_FENCE); - - if (it < d) - { - int ai = decale * (2 * it + 1) - 1; - int bi = decale * (2 * it + 2) - 1; - - int t = temp[ai]; - temp[ai] = temp[bi]; - temp[bi] += t; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - - // write results to device memory - - histo[2 * ig] = temp[2 * it]; - histo[2 * ig + 1] = temp[2 * it + 1]; - - barrier(CLK_GLOBAL_MEM_FENCE); -} - -// use the global sum for updating the local histograms -// each work item updates two values -__kernel void radixsort_pastehistograms(__global int *histo, __global int *globsum) -{ - int ig = get_global_id(0); - int gr = get_group_id(0); - - int s; - - s = globsum[gr]; - - // write results to device memory - histo[2 * ig] += s; - histo[2 * ig + 1] += s; - - barrier(CLK_GLOBAL_MEM_FENCE); -} -- 2.7.4