compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp

   1 /*
   2  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *      http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /*
  18  * Copyright (c) 2017-2018 ARM Limited.
  19  *
  20  * SPDX-License-Identifier: MIT
  21  *
  22  * Permission is hereby granted, free of charge, to any person obtaining a copy
  23  * of this software and associated documentation files (the "Software"), to
  24  * deal in the Software without restriction, including without limitation the
  25  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  26  * sell copies of the Software, and to permit persons to whom the Software is
  27  * furnished to do so, subject to the following conditions:
  28  *
  29  * The above copyright notice and this permission notice shall be included in all
  30  * copies or substantial portions of the Software.
  31  *
  32  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  33  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  34  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  35  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  36  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  37  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  38  * SOFTWARE.
  39  */
  40
  41 #include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
  42
  43 #include "arm_compute/core/CL/CLHelpers.h"
  44 #include "arm_compute/core/CL/CLKernelLibraryEx.h"
  45 #include "arm_compute/core/CL/ICLTensor.h"
  46 #include "support/StringSupport.h"
  47
  48 using namespace arm_compute;
  49 namespace
  50 {
  51 // NOTE This is necessary because it is not guaranteed that the axis positions of input and output
  52 // are the same.
  53 const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
  54 {
  55   TensorShape out_shape{input_shape};
  56
  57   out_shape.set(axis, 1);
  58
  59   return out_shape;
  60 }
  61 } // namespace
  62
  63 namespace
  64 {
  65 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
  66                           ReduceOperation op)
  67 {
  68   ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
  69
  70   if (output->total_size() != 0)
  71   {
  72     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
  73   }
  74
  75   ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
  76                                                        DataType::F32, DataType::S32);
  77   if (op == ReduceOperation::SUM)
  78   {
  79     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8,
  80                                     "Not support QASYMM8, yet");
  81   }
  82   ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
  83
  84   ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
  85                                   "Inputs are not broadcast compatible");
  86
  87   const auto num_dimensions = input->tensor_shape().num_dimensions();
  88   ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than (input's rank).");
  89
  90   const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
  91   ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
  92                                   "output shape's size does not match axis");
  93
  94   return Status{};
  95 }
  96 } // namespace
  97
  98 CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
  99
 100 void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output,
 101                                         const uint32_t axis, ReduceOperation op)
 102 {
 103   ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 104
 105   ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
 106
 107   _input = input;
 108   _output = output;
 109   _axis = axis;
 110
 111   std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
 112   output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
 113
 114   // Construct kernel name
 115   std::string kernel_name;
 116   int op_code = 0;
 117   if (op == ReduceOperation::MAX)
 118   {
 119     kernel_name = "reduce_min_max";
 120     op_code = 1;
 121   }
 122   else if (op == ReduceOperation::MIN)
 123   {
 124     kernel_name = "reduce_min_max";
 125     op_code = 2;
 126   }
 127   else if (op == ReduceOperation::SUM)
 128   {
 129     kernel_name = "reduce_sum_mean";
 130     op_code = 3;
 131   }
 132   else if (op == ReduceOperation::MEAN)
 133   {
 134     kernel_name = "reduce_sum_mean";
 135     op_code = 4;
 136   }
 137   else
 138     throw std::runtime_error("Operation not supported, yet");
 139
 140   // Set kernel build options
 141   std::set<std::string> build_opts;
 142   build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
 143   build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
 144   build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
 145
 146   // Create kernel
 147   _kernel =
 148       static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
 149
 150   // Configure  kernel window
 151   Window win = calculate_max_window(*output_info, Steps());
 152
 153   Coordinates coord;
 154   coord.set_num_dimensions(output_info->num_dimensions());
 155   output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
 156
 157   ICLKernel::configure_internal(win);
 158 }
 159
 160 Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
 161                                          const uint32_t axis, ReduceOperation op)
 162 {
 163   ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 164   ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
 165
 166   return Status{};
 167 }
 168
 169 void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue)
 170 {
 171   ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
 172   ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 173
 174   const TensorShape &shape_in = _input->info()->tensor_shape();
 175
 176   unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
 177
 178   _kernel.setArg<cl_int>(idx++, _axis);
 179   _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
 180
 181   // Support dimensions up to 4
 182   Window slice_out = window.collapse(ICLKernel::window(), 2, 4);
 183
 184   // Setup input slice
 185   Window slice_in(slice_out);
 186   slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
 187   slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
 188   slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
 189   slice_in.set(3, Window::Dimension(0, 0, 0));
 190
 191   // Copy output's shape in order to use for recovering at end of this method
 192   // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions
 193   // of input and output are the same
 194   const TensorShape shape_out = _output->info()->tensor_shape();
 195   _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
 196
 197   idx = 0;
 198   add_4D_tensor_argument(idx, _input, slice_in);
 199   add_4D_tensor_argument(idx, _output, slice_out);
 200   enqueue(queue, *this, slice_out, lws_hint());
 201
 202   // Recover output's shape of output tensor
 203   _output->info()->set_tensor_shape(shape_out);
 204 }