inference-engine/thirdparty/mkl-dnn/src/common/memory_tracking.hpp

   1 /*******************************************************************************
   2 * Copyright 2018 Intel Corporation
   3 *
   4 * Licensed under the Apache License, Version 2.0 (the "License");
   5 * you may not use this file except in compliance with the License.
   6 * You may obtain a copy of the License at
   7 *
   8 *     http://www.apache.org/licenses/LICENSE-2.0
   9 *
  10 * Unless required by applicable law or agreed to in writing, software
  11 * distributed under the License is distributed on an "AS IS" BASIS,
  12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 * See the License for the specific language governing permissions and
  14 * limitations under the License.
  15 *******************************************************************************/
  16
  17 #ifndef MEMORY_TRACKING_HPP
  18 #define MEMORY_TRACKING_HPP
  19
  20 #include <assert.h>
  21 #include <unordered_map>
  22
  23 #include "nstl.hpp"
  24 #include "utils.hpp"
  25
  26 namespace mkldnn {
  27 namespace impl {
  28 namespace memory_tracking {
  29
  30 /* Memory tracking capabilities
  31  *
  32  * The main purpose of this header file is to provide uniform way to register
  33  * required memory for a scratchpad at a primitive descriptor creation time
  34  * and then easily access it having only the base address of the scratchpad.
  35  *
  36  * Primitives might contain multiple disjoint parts that require temporary
  37  * buffers (known as scratchpad) during their execution. A primitive descriptor
  38  * should summarize all the needs into one single number -- the buffer size
  39  * that would be requested from a user. At execution time, the corresponding
  40  * primitive will receive a base pointer to a scratchpad. It then needs to
  41  * provide each part of algorithm the corresponding piece of memory. Three main
  42  * challenges here are:
  43  * 1. Track correct offset (from the base scratchpad address) for each piece
 * 2. An algorithm might require different memory pieces to be aligned, so
 *    the scratchpad size is no longer just the sum of the sizes of the
 *    corresponding subparts.
  47  * 3. While a primitive is responsible for its scratchpad, the implementation
  48  *    might use some other basic blocks (e.g. cpu_reducer) that also require
  49  *    scratchpad memory. So there should be a simple way of passing the
 *    information back and forth between the main algorithm (a primitive) and
  51  *    auxiliary stuff that lives completely separately from it (e.g. reducer).
  52  *
  53  * To address these challenges this header file provides 3 structures:
 * 1. registry_t  -- the class that stores the information about requested
  55  *                   memory. The information includes required size and desired
  56  *                   alignment for each piece. This class is also responsible
  57  *                   for computing the right offset to a given piece using the
  58  *                   base pointer.
  59  *                   This class is basically a ledger with all entries.
  60  *                   Lives in primitive descriptors.
  61  *
  62  * 2. registrar_t -- the interface to a registry_t to book memory. Used at
  63  *                   primitive descriptor creation time only. Contains a
  64  *                   reference to the corresponding *mutable* registry.
  65  *                   Always modifiable.
  66  *                   Allows chaining (using prefixes).
  67  *
  68  * 3. grantor_t   -- the interface to a registry_t to access memory. Used at
  69  *                   primitive execution time only. Contains a reference to
  70  *                   the corresponding *constant* registry and base pointer.
  71  *                   Always constant.
  72  *                   Allows chaining (using prefixes).
  73  *
 * Both registrar_t and grantor_t allow chaining with extra prefix provided.
 * The feature is useful when a primitive offloads a part of computations to
 * some other primitives which require their own scratchpad space
 * (e.g. reducer). Prefixes are used to avoid key collision in cases when
 * multiple sub-primitives (e.g. multiple reducers) are used.
  79  *
  80  * A short example below demonstrates how to use aforementioned classes. In it
  81  * the main primitive is convolution that uses scratchpad for keeping padded
  82  * bias. It also needs a reducer, that needs its own space as well.
  83  *
  84  *  ``` c++
  85  *  struct reducer_t {
  86  *      static void init(registrar_t &scratchpad) {
  87  *          // preserve space for the reduction (one page aligned)
 *          scratchpad.book(key_reducer_space, sizeof(float) * 980 * 1024, 4096);
  89  *      }
  90  *
  91  *      void exec(const grantor_t &scratchpad) {
  92  *          // get the pointer to preserved space. scratchpad came from
  93  *          // upper primitive (convolution in this example)
  94  *          auto space = scratchpad.get<float>(key_reducer_space);
  95  *
  96  *          space[:] += ...;
  97  *      }
  98  *  };
  99  *
 100  *  struct conv_t {
 101  *      struct pd_t {
 102  *          void init() {
 103  *              registrar_t scratchpad(scratchpad_registry_);
 104  *
 105  *              // preserve a space for padded bias (using default alignment)
 106  *              scratchpad.book(key_conv_padded_bias, 128);
 107  *
 *              // create a proxy registrar for the reducer. All entries made
 109  *              // by reducer would live in convolution's registry, but would
 110  *              // have their own `prefix`, so no interference with conv's
 111  *              // buffers.
 112  *              registrar_t reducer_scratchpad(scratchpad, prefix_reducer);
 113  *
 114  *              reducer_t::init(reducer_scratchpad);
 115  *          }
 116  *
 117  *          registry_t scratchpad_registry_;
 118  *      }
 119  *
 120  *      void exec() {
 121  *          // get the base pointer to a scratchpad memory from a user
 122  *          void *scratchpad_ptr = this->input(MKLDNN_MEM_SCRATCHPAD);
 123  *
 124  *          // create a grantor to the scratchpad (and provide the base
 125  *          // pointer).
 126  *          grantor_t scratchpad(pd()->scratchpad_registry_, scratchpad_ptr);
 127  *
 128  *          // access the padded_bias (need only key name and the grantor)
 129  *          auto padded_bias = scratchpad.get<float>(key_conv_padded_bias);
 130  *
 131  *          // to give the `right` grantor to reducer we need to add the
 132  *          // corresponding prefix, so that reducer would be able to access
 133  *          // its keys. The call is very similar to the one in pd_t::init
 134  *          // with only difference in types: grantor_t vs registrar_t.
 135  *          grantor_t reducer_scratchpad(scratchpad, prefix_reducer);
 136  *          reducer->exec(reducer_scratchpad);
 137  *      }
 138  *  };
 139  *  ```
 140  */
 141
 142
/* Namespace with the common scratchpad keys and prefixes.
 *
 * Both enums below are unnamed and rely on implicit sequential numbering
 * starting from an explicit 0, so the numeric value of every identifier is
 * determined solely by its position in the list. */
namespace names {
/* Local keys that identify individual scratchpad buffers. A local key is
 * combined with a prefix by make_key() and is expected to stay below MAX_KEY.
 * key_none is 0; every following key is one greater than its predecessor. */
enum {
    key_none = 0,
    key_bnorm_tmp_mean,
    key_bnorm_tmp_var,
    key_bnorm_tmp_diff_ss,
    key_bnorm_tmp_stats,
    key_bnorm_reduction,
    key_concat_iptrs,
    key_concat_istrides,
    key_concat_nelems,
    key_concat_optrs,
    key_conv_adjusted_scales,
    key_conv_bia_reduction,
    key_conv_gemm_col,
    key_conv_int_dat_in_acc_dt,
    key_conv_padded_bias,
    key_conv_rtus_space,
    key_conv_tr_diff_dst,
    key_conv_tr_diff_dst_bctx,
    key_conv_tr_src,
    key_conv_tr_src_bctx,
    key_conv_wei_reduction,
    key_conv_wei_bia_reduction,
    key_conv_wei_bia_reduction_bctx,
    key_iprod_int_dat_in_acc_dt,
    key_reducer_space,
    key_reducer_space_bctx,
    key_reorder_wino_plain,
    key_reorder_wino_transform_space,
    key_reorder_rnn_weights_quantization,
    key_reorder_rnn_weights_reduction,
    key_rnn_space,
    key_rnn_ptrs_bia,
    key_rnn_ptrs_wei_layer,
    key_rnn_ptrs_wei_iter,
    key_softmax_reduction,
    key_wino_U,
    key_wino_V,
    key_wino_M,
    key_barrier,
    /* NOTE(review): the keys below break the alphabetical order used above;
     * presumably they were appended later -- confirm before reordering, as
     * moving an entry changes its numeric value. */
    key_dw_conv_buffer,
    key_dw_conv_padded_bias,
    key_conv_padded_compensation,
};

/* Local prefixes used when chaining registrars / grantors, so that several
 * sub-primitives (e.g. multiple reducers) can book the same local keys
 * without colliding. A local prefix is turned into a global one by
 * make_prefix() and is expected to stay below MAX_PREFIX. */
enum {
    prefix_none = 0,
    prefix_reducer_bia,
    prefix_reducer_wei,
};
} // namespace names
 196
 197 // level 0: 00 00 00 xxx
 198 // level 1: 00 00 aa xxx
 199 // level 2: 00 aa bb xxx
 200 // level 3: aa bb cc xxx
 201 // max # of levels: 3 + 1 (base_level)
 202 // here:
 203 //      xxx        : [1 ..    MAX_KEY) : key
 204 //      aa, bb, cc : [1 .. MAX_PREFIX) : prefixes for levels 1, 2, and 3
 205
 206 using key_t = uint32_t;
 207 enum { MAX_KEY = (1u << 10), MAX_PREFIX = (1u << 7), };
 208
 209 /// generates global key based on a prefix and a local key
 210 inline key_t make_key(key_t prefix, key_t key) { return prefix + key; }
 211
 212 /// generates global prefix based on the global parent and the local ones
 213 inline key_t make_prefix(key_t parent_prefix, key_t prefix)
 214 { return MAX_PREFIX * parent_prefix + MAX_KEY * prefix; }
 215
 216 struct registrar_t;
 217 struct grantor_t;
 218
 219 struct registry_t {
 220     void book(const key_t &key, size_t size, size_t alignment) {
 221         if (size == 0) return;
 222         assert(offset_map_.count(key) == 0);
 223
 224         size = utils::rnd_up(size, minimal_alignment);
 225         alignment = nstl::max<size_t>(alignment, minimal_alignment);
 226         offset_map_[key] = entry_t{size_, size, alignment};
 227
 228         size_ += size + alignment - minimal_alignment;
 229     }
 230
 231     void *get(const key_t &key, void *base_ptr) const {
 232         if (base_ptr == nullptr) { assert(size() == 0); return nullptr; }
 233         if (offset_map_.count(key) != 1) return nullptr;
 234
 235         const auto &e = offset_map_.at(key);
 236         base_ptr = utils::align_ptr<void>(base_ptr, minimal_alignment);
 237         char *ptr = (char *)base_ptr + e.offset;
 238         return utils::align_ptr<void>(ptr, e.alignment);
 239     }
 240
 241     size_t size() const
 242     { return size_ > 0 ? size_ + minimal_alignment - 1 : 0; }
 243
 244     registrar_t registrar();
 245     grantor_t grantor(void *base_ptr) const;
 246
 247 protected:
 248     enum { minimal_alignment = 64 };
 249     struct entry_t { size_t offset, size, alignment; };
 250
 251     std::unordered_map<key_t, entry_t> offset_map_;
 252     size_t size_ = 0;
 253 };
 254
 255 struct registrar_t {
 256     enum { default_alignment = 64 };
 257
 258     registrar_t(registry_t &registry): registry_(registry), prefix_(0) {}
 259     registrar_t(registrar_t &parent, const key_t &prefix)
 260         : registry_(parent.registry_)
 261         , prefix_(make_prefix(parent.prefix_, prefix)) {}
 262
 263     void book(const key_t &key, size_t size,
 264             size_t alignment = default_alignment)
 265     { registry_.book(make_key(prefix_, key), size, alignment); }
 266
 267 protected:
 268     registry_t &registry_;
 269     const key_t prefix_;
 270 };
 271
 272 struct grantor_t {
 273     grantor_t(const registry_t &registry, void *base_ptr)
 274         : registry_(registry), prefix_(0), base_ptr_(base_ptr) {}
 275     grantor_t(const grantor_t &parent, const key_t &prefix)
 276         : registry_(parent.registry_)
 277         , prefix_(make_prefix(parent.prefix_, prefix))
 278         , base_ptr_(parent.base_ptr_) {}
 279
 280     template <typename T = void> T *get(const key_t &key) const
 281     { return (T *)registry_.get(make_key(prefix_, key), base_ptr_); }
 282
 283 protected:
 284     const registry_t &registry_;
 285     const key_t prefix_;
 286     void *base_ptr_;
 287 };
 288
 289 inline registrar_t registry_t::registrar() { return registrar_t(*this); }
 290 inline grantor_t registry_t::grantor(void *base_ptr) const
 291 { return grantor_t(*this, base_ptr); }
 292
 293 }
 294 }
 295 }
 296
 297 #endif