compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl

   1 /*
   2  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *      http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /*
  18  * Copyright (c) 2019 ARM Limited.
  19  *
  20  * SPDX-License-Identifier: MIT
  21  *
  22  * Permission is hereby granted, free of charge, to any person obtaining a copy
  23  * of this software and associated documentation files (the "Software"), to
  24  * deal in the Software without restriction, including without limitation the
  25  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  26  * sell copies of the Software, and to permit persons to whom the Software is
  27  * furnished to do so, subject to the following conditions:
  28  *
  29  * The above copyright notice and this permission notice shall be included in all
  30  * copies or substantial portions of the Software.
  31  *
  32  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  33  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  34  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  35  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  36  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  37  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  38  * SOFTWARE.
  39  */
  40
  41 #include "helpers.h"
  42
  43 #if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \
  44     defined(DIM_Y) && defined(DIM_Z)
  45 /** This function normalizes the input 2D tensor across the first dimension with respect to mean and
  46  * standard deviation of the same dimension.
  47  *
  48  * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
  49  * -DVEC_SIZE=16
  50  * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g.
  51  * -DDATA_TYPE=float
  52  * @attention Normalization epsilon parameter should be given as a preprocessor argument with
  53  * -DEPSILON=value. e.g. -DEPSILON=0.001f
  54  * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value,
  55  * -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
  56  *
  57  * @param[in]  input_ptr                            Pointer to the first source tensor. Supported
  58  * data types: F16/F32
  59  * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension
  60  * (in bytes)
  61  * @param[in]  input_step_x                         input_stride_x * number of elements along X
  62  * processed per workitem(in bytes)
  63  * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension
  64  * (in bytes)
  65  * @param[in]  input_step_y                         input_stride_y * number of elements along Y
  66  * processed per workitem(in bytes)
  67  * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension
  68  * (in bytes)
  69  * @param[in]  input_step_z                         input_stride_z * number of elements along Z
  70  * processed per workitem(in bytes)
  71  * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first
  72  * source tensor
  73  * @param[out] output_ptr                           (Optional) Pointer to the destination tensor.
  74  * Supported data types: same as @p input_ptr
  75  * @param[in]  output_stride_x                      (Optional) Stride of the destination tensor in X
  76  * dimension (in bytes)
  77  * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements
  78  * along X processed per workitem(in bytes)
  79  * @param[in]  output_stride_y                      (Optional) Stride of the destination tensor in Y
  80  * dimension (in bytes)
  81  * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements
  82  * along Y processed per workitem(in bytes)
  83  * @param[in]  output_stride_z                      (Optional) Stride of the destination tensor in Z
  84  * dimension (in bytes)
  85  * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements
  86  * along Z processed per workitem(in bytes)
  87  * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in
  88  * the destination tensor
  89  * @param[in]  gamma_ptr                            (Optional) Pointer to the gamma tensor.
  90  * Supported data types: same as @p input_ptr
  91  * @param[in]  gamma_stride_x                       (Optional) Stride of the gamma tensor in X
  92  * dimension (in bytes)
  93  * @param[in]  gamma_step_x                         (Optional) output_stride_x * number of elements
  94  * along X processed per workitem(in bytes)
  95  * @param[in]  gamma_offset_first_element_in_bytes  (Optional) The offset of the first element in
  96  * the gamma tensor
  97  * @param[in]  beta_ptr                             (Optional) Pointer to the beta tensor. Supported
  98  * data types: same as @p input_ptr
  99  * @param[in]  beta_stride_x                        (Optional) Stride of the beta tensor in X
 100  * dimension (in bytes)
 101  * @param[in]  beta_step_x                          (Optional) output_stride_x * number of elements
 102  * along X processed per workitem(in bytes)
 103  * @param[in]  beta_offset_first_element_in_bytes   (Optional) The offset of the first element in
 104  * the beta tensor
 105  */
 106 __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input),
 107 #ifndef IN_PLACE
 108                                         TENSOR4D_DECLARATION(output)
 109 #endif /* IN_PLACE */
 110 #ifdef GAMMA
 111                                             ,
 112                                         VECTOR_DECLARATION(gamma)
 113 #endif // GAMMA
 114 #ifdef BETA
 115                                             ,
 116                                         VECTOR_DECLARATION(beta)
 117 #endif // BETA
 118                                             )
 119 {
 120   Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
 121 #ifndef IN_PLACE
 122   Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
 123 #endif /* IN_PLACE */
 124
 125   float sum = 0.f;
 126   float sum_sq = 0.f;
 127
 128 #if defined(NHWC)
 129
 130   const int ch = get_global_id(0);    // Current channel
 131   const int batch = get_global_id(2); // Current batch
 132   const int elements_plane = DIM_Y * DIM_Z;
 133
 134   for (int i_w = 0; i_w < DIM_Y; ++i_w)
 135   {
 136     for (int i_h = 0; i_h < DIM_Z; ++i_h)
 137     {
 138       float data = (float)*((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch));
 139       sum += data;
 140       sum_sq += data * data;
 141     }
 142   }
 143
 144 #else // !defined(NHWC)
 145   const int ch = get_global_id(2) % DIM_Z;    // Current channel
 146   const int batch = get_global_id(2) / DIM_Z; // Current batch
 147   const int elements_plane = DIM_X * DIM_Y;
 148
 149   VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
 150   part_sum = 0.f;
 151   VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
 152   part_sum_sq = 0.f;
 153   // Calculate partial sum
 154   for (int y = 0; y < DIM_Y; ++y)
 155   {
 156     int x = 0;
 157     for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
 158     {
 159       // Load data
 160       VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
 161       data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch));
 162       part_sum += data;
 163       part_sum_sq += data * data;
 164     }
 165     // Left-overs loop
 166     for (; x < DIM_X; ++x)
 167     {
 168       DATA_TYPE data = *((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch));
 169       part_sum.s0 += data;
 170       part_sum_sq.s0 += data * data;
 171     }
 172   }
 173 // Perform reduction
 174 #if VEC_SIZE > 8
 175   part_sum.s01234567 += part_sum.s89abcdef;
 176   part_sum_sq.s01234567 += part_sum_sq.s89abcdef;
 177 #endif // VEC_SIZE > 8
 178 #if VEC_SIZE > 4
 179   part_sum.s0123 += part_sum.s4567;
 180   part_sum_sq.s0123 += part_sum_sq.s4567;
 181 #endif // VEC_SIZE > 4
 182 #if VEC_SIZE > 2
 183   part_sum.s01 += part_sum.s23;
 184   part_sum_sq.s01 += part_sum_sq.s23;
 185 #endif // VEC_SIZE > 2
 186   part_sum.s0 += part_sum.s1;
 187   part_sum_sq.s0 += part_sum_sq.s1;
 188
 189   sum = (float)part_sum.s0;
 190   sum_sq = (float)part_sum_sq.s0;
 191
 192 #endif // defined(NHWC)
 193
 194   const float mean_float = (sum / elements_plane);
 195   const DATA_TYPE mean = (DATA_TYPE)mean_float;
 196   const float var_float = (sum_sq / elements_plane) - (mean_float * mean_float);
 197 #if defined(GAMMA)
 198   const float multip_float = *((__global DATA_TYPE *)gamma_ptr + ch) / sqrt(var_float + EPSILON);
 199   const DATA_TYPE multip = (DATA_TYPE)multip_float;
 200 #else  // !defined(GAMMA)
 201   const DATA_TYPE multip = (DATA_TYPE)0;
 202 #endif // defined(GAMMA)
 203 #if defined(BETA)
 204   const DATA_TYPE beta = *((__global DATA_TYPE *)beta_ptr + ch);
 205 #else  // !defined(BETA)
 206   const DATA_TYPE beta = 0;
 207 #endif // defined(BETA)
 208
 209 #if defined(NHWC)
 210
 211   for (int i_w = 0; i_w < DIM_Y; ++i_w)
 212   {
 213     for (int i_h = 0; i_h < DIM_Z; ++i_h)
 214     {
 215       __global DATA_TYPE *input_address =
 216           (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch);
 217 #ifdef IN_PLACE
 218       __global DATA_TYPE *output_address = input_address;
 219 #else  /* !IN_PLACE */
 220       __global DATA_TYPE *output_address =
 221           (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch);
 222 #endif /* IN_PLACE */
 223       *(output_address) = (*(input_address)-mean) * multip + beta;
 224     }
 225   }
 226
 227 #else // !defined(NHWC)
 228   for (int y = 0; y < DIM_Y; ++y)
 229   {
 230     int x = 0;
 231     for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
 232     {
 233       __global DATA_TYPE *input_address =
 234           (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
 235 #ifdef IN_PLACE
 236       __global DATA_TYPE *output_address = input_address;
 237 #else  /* !IN_PLACE */
 238       __global DATA_TYPE *output_address =
 239           (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
 240 #endif /* IN_PLACE */
 241
 242       VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
 243       data = VLOAD(VEC_SIZE)(0, input_address);
 244
 245       VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
 246       res = (data - mean) * multip + beta;
 247       VSTORE(VEC_SIZE)
 248       (res, 0, output_address);
 249     }
 250     // Left-overs loop
 251     for (; x < DIM_X; ++x)
 252     {
 253       __global DATA_TYPE *input_address =
 254           (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
 255 #ifdef IN_PLACE
 256       __global DATA_TYPE *output_address = input_address;
 257 #else  /* !IN_PLACE */
 258       __global DATA_TYPE *output_address =
 259           (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
 260 #endif /* IN_PLACE */
 261       *(output_address) = (*(input_address)-mean) * multip + beta;
 262     }
 263   }
 264 #endif // defined(NHWC)
 265 }
 266 #endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \
 267           defined(DIM_Y) && defined(DIM_Z) */