//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP

#include <cmath> // std::ceil
#include <iterator>

#include <boost/utility/result_of.hpp>

#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/memory/local_buffer.hpp>

namespace boost {
namespace compute {
namespace detail {

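// Reduce the values in [first, last) in-place with the given binary
// function, leaving the result in the first element of the range. Each
// pass launches one kernel: every work-item serially reduces
// values_per_thread consecutive elements, each work-group then combines
// its per-thread sums in local memory, and one partial result per group
// is written out. The host repeats this, ping-ponging between the input
// range and a temporary buffer, until a single value remains.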
template<class Iterator, class BinaryFunction>
inline void inplace_reduce(Iterator first,
                           Iterator last,
                           BinaryFunction function,
                           command_queue &queue)
{
    typedef typename
        std::iterator_traits<Iterator>::value_type
        value_type;

    size_t input_size = iterator_range_size(first, last);
    if(input_size < 2){
        return;
    }

    const context &context = queue.get_context();

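    // Each work-item reduces values_per_thread elements, so one pass needs
    // ceil(input_size / (block_size * values_per_thread)) work-groups; the
    // increment below rounds the integer division up.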
    size_t block_size = 64;
    size_t values_per_thread = 8;
    size_t block_count = input_size / (block_size * values_per_thread);
    if(block_count * block_size * values_per_thread != input_size)
        block_count++;

    vector<value_type> output(block_count, context);

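    // Assemble the OpenCL source with meta_kernel: "input" and "output" are
    // global buffers, "input_size" is a uint, and "scratch" is a local
    // memory buffer holding one partial sum per work-item.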
    meta_kernel k("inplace_reduce");
    size_t input_arg = k.add_arg<value_type *>(memory_object::global_memory, "input");
    size_t input_size_arg = k.add_arg<const uint_>("input_size");
    size_t output_arg = k.add_arg<value_type *>(memory_object::global_memory, "output");
    size_t scratch_arg = k.add_arg<value_type *>(memory_object::local_memory, "scratch");
    k <<
        "const uint gid = get_global_id(0);\n" <<
        "const uint lid = get_local_id(0);\n" <<
        "const uint values_per_thread =\n"
            << uint_(values_per_thread) << ";\n" <<

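        // thread reduce: each work-item serially folds up to
        // values_per_thread consecutive elements into a private sum,
        // then parks the sum in local memory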
        "const uint index = gid * values_per_thread;\n" <<
        "if(index < input_size){\n" <<
            k.decl<value_type>("sum") << " = input[index];\n" <<
            "for(uint i = 1;\n" <<
                 "i < values_per_thread && (index + i) < input_size;\n" <<
                 "i++){\n" <<
            "    sum = " <<
                     function(k.var<value_type>("sum"),
                              k.var<value_type>("input[index+i]")) << ";\n" <<
            "}\n" <<
            "scratch[lid] = sum;\n" <<
        "}\n" <<

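        // local reduce: pairwise tree reduction in local memory; on pass i
        // the work-items whose id is a multiple of 2*i ((lid & mask) == 0)
        // fold in the sum of the neighbor i slots away, so scratch[0]
        // ends up holding the whole group's sum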
        "for(uint i = 1; i < get_local_size(0); i <<= 1){\n" <<
        "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        "    uint mask = (i << 1) - 1;\n" <<
        "    uint next_index = (gid + i) * values_per_thread;\n" <<
        "    if((lid & mask) == 0 && next_index < input_size){\n" <<
        "        scratch[lid] = " <<
                     function(k.var<value_type>("scratch[lid]"),
                              k.var<value_type>("scratch[lid+i]")) << ";\n" <<
        "    }\n" <<
        "}\n" <<

        // write output for block
        "if(lid == 0){\n" <<
        "    output[get_group_id(0)] = scratch[0];\n" <<
        "}\n";

    const buffer *input_buffer = &first.get_buffer();
    const buffer *output_buffer = &output.get_buffer();

    kernel kernel = k.compile(context);

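    // Launch the kernel repeatedly: each pass shrinks the problem by a
    // factor of block_size * values_per_thread and swaps the roles of the
    // two buffers, until a single value remains.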
    while(input_size > 1){
        kernel.set_arg(input_arg, *input_buffer);
        kernel.set_arg(input_size_arg, static_cast<uint_>(input_size));
        kernel.set_arg(output_arg, *output_buffer);
        kernel.set_arg(scratch_arg, local_buffer<value_type>(block_size));

        queue.enqueue_1d_range_kernel(kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);

        input_size = static_cast<size_t>(
            std::ceil(float(input_size) / (block_size * values_per_thread))
        );

        block_count = input_size / (block_size * values_per_thread);
        if(block_count * block_size * values_per_thread != input_size)
            block_count++;

        std::swap(input_buffer, output_buffer);
    }

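    // After the last pass, input_buffer (post-swap) points at the buffer
    // holding the result. If that is the temporary buffer rather than the
    // caller's range, copy the single reduced value back to first.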
    if(input_buffer != &first.get_buffer()){
        ::boost::compute::copy(output.begin(),
                               output.begin() + 1,
                               first,
                               queue);
    }
}
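
// Usage sketch (hypothetical host-side call, not part of the original
// header; assumes the system default queue and boost/compute/functional.hpp
// for boost::compute::plus):
//
//   boost::compute::command_queue queue =
//       boost::compute::system::default_queue();
//   boost::compute::vector<int> vec(8, 2, queue); // eight copies of 2
//   inplace_reduce(vec.begin(), vec.end(),
//                  boost::compute::plus<int>(), queue);
//   int result;
//   boost::compute::copy_n(vec.begin(), 1, &result, queue); // result == 16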

} // end detail namespace
} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP