inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reducer.hpp

   1 /*******************************************************************************
   2 * Copyright 2017-2018 Intel Corporation
   3 *
   4 * Licensed under the Apache License, Version 2.0 (the "License");
   5 * you may not use this file except in compliance with the License.
   6 * You may obtain a copy of the License at
   7 *
   8 *     http://www.apache.org/licenses/LICENSE-2.0
   9 *
  10 * Unless required by applicable law or agreed to in writing, software
  11 * distributed under the License is distributed on an "AS IS" BASIS,
  12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 * See the License for the specific language governing permissions and
  14 * limitations under the License.
  15 *******************************************************************************/
  16
  17 #ifndef CPU_REDUCER_HPP
  18 #define CPU_REDUCER_HPP
  19
  20 #include <assert.h>
  21
  22 #include "c_types_map.hpp"
  23 #include "memory_tracking.hpp"
  24 #include "mkldnn_thread.hpp"
  25 #include "mkldnn_types.h"
  26 #include "nstl.hpp"
  27 #include "type_helpers.hpp"
  28
  29 #include "cpu_barrier.hpp"
  30
  31 namespace mkldnn {
  32 namespace impl {
  33 namespace cpu {
  34
  35 /** class to perform balancing over 3D array
  36  *
  37  * Conceptually the reduction happens according to the picture below:
  38  *
  39  *  <--job_size->
  40  *  +-----------+   +-----------+           +-----------+  ^
  41  *  |           |   |           |           |           |  |
  42  *  |           |   |           |           |           |  |
  43  *  |     1     |   |     2     |   . . .   |   njobs   |  | reduction_size
  44  *  |           |   |           |           |           |  |
  45  *  |           |   |           |           |           |  |
  46  *  +-----------+   +-----------+           +-----------+  v
  47  *
  48  *    |   |   |       |   |   |               |   |   |
  49  *    v   v   v       v   v   v               v   v   v
  50  *  ===================================================== vertical reduction
  51  *
  52  *  +-----------+   +-----------+   . . .   +-----------+ result
  53  *
  54  * In a simple case the result must be contiguous in memory.
  55  * @class cpu_reducer_t is an implementation.
  56  *
  57  * Threads are divided into groups. The groups are independent of each other.
  58  * Each group may work on several jobs (the distribution is not uniform, since
  59  * njobs might be not a multiple of groups). Threads within a group work on
  60  * different parts of the reduction dimension. Thread 0 in each group is called
  61  * master (@sa reduce_balancer_t::master()).
  62  *
  63  * If threading driver does not allow sync between sub-group of threads (e.g.
  64  * Intel(R) TBB) the # of thread per group is enforced to be 1.
  65  */
  66 struct reduce_balancer_t {
  67     reduce_balancer_t() { init(1, 1, 1, 1, 0); } /* trivial balance */
  68     reduce_balancer_t(int nthr, int job_size, int njobs, int reduction_size,
  69             size_t max_buffer_size)
  70     { init(nthr, job_size, njobs, reduction_size, max_buffer_size); }
  71
  72     reduce_balancer_t &init(int nthr, int job_size, int njobs,
  73             int reduction_size, size_t max_buffer_size)
  74     {
  75         syncable_ = mkldnn_thr_syncable();
  76         nthr_ = nthr;
  77         job_size_ = job_size;
  78         njobs_ = njobs;
  79         reduction_size_ = reduction_size;
  80         max_buffer_size_ = max_buffer_size;
  81         balance();
  82         return *this;
  83     }
  84
  85     bool syncable_;
  86     int nthr_;
  87     int job_size_, njobs_, reduction_size_;
  88
  89     int ngroups_; /** number of independent work (thread) groups */
  90     int nthr_per_group_; /** number of threads within a single work group */
  91     int njobs_per_group_ub_; /** the max # of jobs within a work group */
  92
  93     bool master(int ithr) const { return id_in_group(ithr) == 0; }
  94     bool idle(int ithr) const { return ithr >= nthr_per_group_ * ngroups_; }
  95
  96     int group_id(int ithr) const { return ithr / nthr_per_group_; }
  97     int id_in_group(int ithr) const { return ithr % nthr_per_group_; }
  98
  99     int grp_njobs(int grp) const {
 100         if (grp >= ngroups_) return 0;
 101         return njobs_ / ngroups_ + (grp < njobs_ % ngroups_);
 102     }
 103     int grp_job_off(int grp) const {
 104         if (grp >= ngroups_) return njobs_;
 105         return njobs_ / ngroups_ * grp + nstl::min(grp, njobs_ % ngroups_);
 106     }
 107
 108     int ithr_njobs(int ithr) const { return grp_njobs(group_id(ithr)); }
 109     int ithr_job_off(int ithr) const { return grp_job_off(group_id(ithr)); }
 110
 111 private:
 112     size_t max_buffer_size_;
 113     void balance();
 114 };
 115
 116 /** forward declaration of reduce driver */
 117 template <impl::data_type_t data_type> struct reducer_2d_driver_t;
 118
 119 /** class to perform a reduction over 3D array
 120  *
 121  * Balancing is based on @class reduce_balancer_t.
 122  * Restrictions: the result of the reduction must be contiguous in memory. *
 123  * The reduction happens according to the picture below (once more):
 124  *
 125  *  <--job_size->
 126  *  +-----------+   +-----------+           +-----------+  ^
 127  *  |           |   |           |           |           |  |
 128  *  |           |   |           |           |           |  |
 129  *  |     1     |   |     2     |   . . .   |   njobs   |  | reduction_size
 130  *  |           |   |           |           |           |  |
 131  *  |           |   |           |           |           |  |
 132  *  +-----------+   +-----------+           +-----------+  v
 133  *
 134  *    |   |   |       |   |   |               |   |   |
 135  *    v   v   v       v   v   v               v   v   v
 136  *  ===================================================== vertical reduction
 137  *
 138  *  +-----------+   +-----------+   . . .   +-----------+ (contiguous) result
 139  *
 140  * An example how work might be shared is shown below.
 141  *
 142  * In this example group 0 owns 2 (independent) jobs -- 2 big squares.
 143  * The number of threads per group is also 2 (thread 0 of group 0 and thread 1
 144  * of group 0). Master threads (i.e. threads with id 0 in corresponding group)
 145  * from each group put the partial result directly into destination memory,
 146  * while all the other threads with-in the group use workspace (on the picture
 147  * the only thread 1). Once intermediate results obtained each group reduces
 148  * corresponding part (own jobs) to the destination memory.
 149  *
 150  *  <-------   group 0   ------->
 151  *
 152  *  +-----------+   +-----------+  ^
 153  *  |           |   |           |  | thread 0 of  reduces to the dest-memory
 154  *  |           |   |           |  | group 0      +-----------+   +-----------+
 155  *  |- - - - - -|   |- - - - - -|  X
 156  *  |           |   |           |  | thread 1 of  reduces to workspace[tid=1]:
 157  *  |           |   |           |  | group 0      +-----------+   +-----------+
 158  *  +-----------+   +-----------+  v
 159  *                                                  |   |   |       |   |   |
 160  *                                                  v   v   v       v   v   v
 161  *                                   ((barrier))  =============================
 162  *
 163  *                                  dest-memory:  +-----------+   +-----------+
 164  */
 165 template <impl::data_type_t data_type>
 166 struct cpu_reducer_t {
 167     typedef typename prec_traits<data_type>::type data_t;
 168
 169     struct conf_t {
 170         conf_t() = default;
 171         conf_t &init(const reduce_balancer_t &balancer)
 172         { balancer_ = balancer; return *this; }
 173
 174         void init_scratchpad(memory_tracking::registrar_t &scratchpad) const;
 175
 176         reduce_balancer_t balancer_;
 177     };
 178
 179     cpu_reducer_t(const conf_t &conf);
 180     ~cpu_reducer_t();
 181
 182     /** initializes reducer.
 183      * Must be called from a single thread prior to actual usage */
 184     void init(const memory_tracking::grantor_t &scratchpad) const {
 185         if (balancer().nthr_per_group_ == 1) return;
 186
 187         auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
 188                 memory_tracking::names::key_reducer_space_bctx);
 189         for (int i = 0; i < balancer().ngroups_; ++i)
 190             simple_barrier::ctx_init(&bctx[i]);
 191     }
 192
 193     /** for given thread returns the pointer where to put partial results.
 194      * Reduction destination @p dst must be provided as well (master threads
 195      * from each group will use it for partial result to reduce memory
 196      * pressure).
 197      *
 198      * @note: job offset is already applied by get_local_ptr(), which means all
 199      *        threads should start writing from the very beginning of returned
 200      *        address.
 201      */
 202     data_t *get_local_ptr(int ithr, data_t *dst,
 203             const memory_tracking::grantor_t &scratchpad) const;
 204
 205     /** performs the reduction with built-in synchronization. */
 206     void reduce(int ithr, data_t *dst,
 207             const memory_tracking::grantor_t &scratchpad) const {
 208         bool redundant_reduction = balancer().nthr_per_group_ == 1
 209             || balancer().idle(ithr);
 210         if (redundant_reduction) return;
 211
 212         auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
 213                 memory_tracking::names::key_reducer_space_bctx);
 214         simple_barrier::barrier(&bctx[balancer().group_id(ithr)],
 215                 balancer().nthr_per_group_);
 216
 217         reduce_nolock(ithr, dst, scratchpad);
 218     }
 219
 220     const reduce_balancer_t &balancer() const { return conf_.balancer_; }
 221
 222 private:
 223     static size_t space_per_thread(const reduce_balancer_t &balancer)
 224     { return balancer.njobs_per_group_ub_ * balancer.job_size_; }
 225
 226     /* The scratchpad is organized as follows:
 227      *
 228      * data_t space[nthr_][njobs_per_group_ub_][jobs_size_];
 229      * simple_barrier::ctx_t barriers[groups_]; */
 230
 231     const conf_t conf_;
 232     reducer_2d_driver_t<data_type> *drv_;
 233
 234     void reduce_nolock(int ithr, data_t *dst,
 235             const memory_tracking::grantor_t &scratchpad) const;
 236 };
 237
 238 template <impl::data_type_t data_type>
 239 struct cpu_reducer_2d_t {
 240     typedef typename prec_traits<data_type>::type data_t;
 241
 242     struct conf_t {
 243         conf_t() = default;
 244         conf_t &init(const reduce_balancer_t &balancer, int job_size_x,
 245                 int job_size_y, int x_block, int dst_x, int dst_y) {
 246             balancer_ = balancer;
 247             job_size_x_ = job_size_x;
 248             job_size_y_ = job_size_y;
 249             x_block_ = x_block;
 250             dst_x_ = dst_x;
 251             dst_y_ = dst_y;
 252             return *this;
 253         }
 254
 255         void init_scratchpad(memory_tracking::registrar_t &scratchpad) const;
 256
 257         reduce_balancer_t balancer_;
 258         int job_size_x_, job_size_y_, x_block_, dst_x_, dst_y_;
 259     };
 260
 261     cpu_reducer_2d_t(const conf_t &conf);
 262     ~cpu_reducer_2d_t();
 263
 264     /** initializes reducer.
 265      * Must be called from a single thread prior to actual usage */
 266     void init(const memory_tracking::grantor_t &scratchpad) const {
 267         if (balancer().nthr_per_group_ == 1) return;
 268
 269         auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
 270                 memory_tracking::names::key_reducer_space_bctx);
 271         for (int i = 0; i < balancer().ngroups_; ++i)
 272             simple_barrier::ctx_init(&bctx[i]);
 273     }
 274
 275     /** for given thread returns the pointer where to put partial results */
 276     data_t *get_local_ptr(int ithr,
 277             const memory_tracking::grantor_t &scratchpad) const;
 278
 279     /** performs the reduction with built-in synchronization. */
 280     void reduce(int ithr, data_t *dst,
 281             const memory_tracking::grantor_t &scratchpad) const {
 282         bool redundant_reduction = balancer().nthr_per_group_ == 1
 283             || balancer().idle(ithr);
 284         if (redundant_reduction) return;
 285
 286         auto bctx = scratchpad.template get<simple_barrier::ctx_t>(
 287                 memory_tracking::names::key_reducer_space_bctx);
 288         simple_barrier::barrier(&bctx[balancer().group_id(ithr)],
 289                 balancer().nthr_per_group_);
 290
 291         reduce_nolock(ithr, dst, scratchpad);
 292     }
 293
 294     const reduce_balancer_t &balancer() const { return conf_.balancer_; }
 295
 296 private:
 297     static size_t space_per_thread(const reduce_balancer_t &balancer)
 298     { return balancer.njobs_per_group_ub_ * balancer.job_size_; }
 299
 300     /* The scratchpad is organized as follows:
 301      *
 302      * data_t space[nthr_][njobs_per_group_ub_][jobs_size_];
 303      * simple_barrier::ctx_t barriers[groups_]; */
 304
 305     const conf_t conf_;
 306     reducer_2d_driver_t<data_type> *drv_;
 307
 308     int choose_x_blocking(int nx, int ny, int nthr_per_grp) const;
 309     void reduce_block(const data_t* space_base, data_t *dst,
 310             int job, int start_y, int start_x,
 311             int ny_start, int nx_start, int ny_step, int nx_step) const;
 312     void reduce_nolock(int ithr, data_t *dst,
 313             const memory_tracking::grantor_t &scratchpad) const;
 314 };
 315
 316 /** simple 1d accumulator: y[:] += x[:] */
 317 template <impl::data_type_t data_type>
 318 struct cpu_accumulator_1d_t {
 319     typedef typename prec_traits<data_type>::type data_t;
 320
 321     cpu_accumulator_1d_t();
 322     ~cpu_accumulator_1d_t();
 323     void accumulate(data_t *dst, const data_t *src, size_t size);
 324
 325     reducer_2d_driver_t<data_type> *drv_;
 326 };
 327
 328 }
 329 }
 330 }
 331
 332 #endif
 333
 334 // vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s