inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_inner_product.cpp

   1 /*******************************************************************************
   2 * Copyright 2017-2018 Intel Corporation
   3 *
   4 * Licensed under the Apache License, Version 2.0 (the "License");
   5 * you may not use this file except in compliance with the License.
   6 * You may obtain a copy of the License at
   7 *
   8 *     http://www.apache.org/licenses/LICENSE-2.0
   9 *
  10 * Unless required by applicable law or agreed to in writing, software
  11 * distributed under the License is distributed on an "AS IS" BASIS,
  12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 * See the License for the specific language governing permissions and
  14 * limitations under the License.
  15 *******************************************************************************/
  16
  17 #include "c_types_map.hpp"
  18 #include "type_helpers.hpp"
  19 #include "mkldnn_thread.hpp"
  20
  21 #include "gemm/jit_avx2_gemm_f32.hpp"
  22 #include "gemm/jit_avx512_common_gemm_f32.hpp"
  23 #include "jit_uni_inner_product.hpp"
  24
  25 namespace mkldnn {
  26 namespace impl {
  27 namespace cpu {
  28
  29 using namespace mkldnn::impl::status;
  30 using namespace mkldnn::impl::prop_kind;
  31 using namespace mkldnn::impl::data_type;
  32 using namespace mkldnn::impl::memory_format;
  33 using namespace mkldnn::impl::primitive_kind;
  34
  35 template <cpu_isa_t isa>
  36 jit_uni_inner_product_fwd_t<isa>::jit_uni_inner_product_fwd_t(const pd_t *pd,
  37         const input_vector &inputs, const output_vector &outputs)
  38     : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd)
  39 {
  40     sgemm_ = new jit_uni_gemm_f32('T', 'N', 0.0, conf_.with_bias());
  41 }
  42
  43 template <cpu_isa_t isa>
  44 jit_uni_inner_product_fwd_t<isa>::~jit_uni_inner_product_fwd_t()
  45 {
  46     delete sgemm_;
  47 }
  48
  49 template <cpu_isa_t isa>
  50 void jit_uni_inner_product_fwd_t<isa>::execute_forward()
  51 {
  52     auto src = reinterpret_cast<const data_t *>(this->input_memory(0));
  53     auto weights = reinterpret_cast<const data_t *>(this->input_memory(1));
  54     auto bias = reinterpret_cast<const data_t *>(this->input_memory(2));
  55     auto dst = reinterpret_cast<data_t *>(this->memory());
  56
  57     // TODO: consistency checks
  58     int MB = conf_.MB();
  59     int OC = conf_.OC();
  60     int IC = conf_.IC_total_padded();
  61
  62     float alpha = 1.0, beta = 0.0;
  63     sgemm_->sgemm("T", "N", &OC, &MB, &IC, &alpha, weights, &IC, src, &IC, &beta,
  64             dst, &OC, bias);
  65 }
  66
  67 template <cpu_isa_t isa>
  68 jit_uni_inner_product_bwd_weights_t<isa>::jit_uni_inner_product_bwd_weights_t(const pd_t *pd,
  69         const input_vector &inputs, const output_vector &outputs)
  70     : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd)
  71 {
  72     sgemm_ = new jit_uni_gemm_f32('N', 'T', 0.0, false);
  73 }
  74
  75 template <cpu_isa_t isa>
  76 jit_uni_inner_product_bwd_weights_t<isa>::~jit_uni_inner_product_bwd_weights_t()
  77 {
  78     delete sgemm_;
  79 }
  80
  81 template <cpu_isa_t isa>
  82 void jit_uni_inner_product_bwd_weights_t<isa>::execute_backward_weights()
  83 {
  84     auto src = reinterpret_cast<const data_t *>(this->input_memory(0));
  85     auto diff_dst = reinterpret_cast<const data_t *>(this->input_memory(1));
  86     auto diff_weights = reinterpret_cast<data_t *>(this->memory(0));
  87     auto diff_bias = reinterpret_cast<data_t *>(this->memory(1));
  88
  89     const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd());
  90     const memory_desc_wrapper diff_bias_d(conf_.diff_weights_pd(1));
  91
  92     diff_dst += diff_dst_d.blocking_desc().offset_padding;
  93
  94     // TODO: consistency checks
  95     int MB = conf_.MB();
  96     int OC = conf_.OC();
  97     int IC = conf_.IC_total_padded();
  98
  99     float alpha = 1.0, beta = 0.0;
 100     sgemm_->sgemm("N", "T", &IC, &OC, &MB, &alpha, src, &IC, diff_dst, &OC, &beta,
 101             diff_weights, &IC, nullptr);
 102
 103     if (diff_bias) {
 104         diff_bias += diff_bias_d.blocking_desc().offset_padding;
 105         constexpr int blksize = 8;
 106         int OC_blocks = OC / blksize;
 107         int rem_OC = OC % blksize;
 108 #       pragma omp parallel
 109         {
 110             const int ithr = omp_get_thread_num();
 111             const int nthr = omp_get_num_threads();
 112             int oc_st{0}, oc_e{0};
 113             balance211(OC_blocks, nthr, ithr, oc_st, oc_e);
 114             oc_st = oc_st * blksize;
 115             oc_e = oc_e * blksize;
 116
 117             PRAGMA_OMP_SIMD()
 118             for (int oc = oc_st; oc < oc_e; ++oc) {
 119                 diff_bias[oc] = diff_dst[oc];
 120             }
 121
 122             for (int mb = 1; mb < MB; ++mb) {
 123                 PRAGMA_OMP_SIMD()
 124                 for (int oc = oc_st; oc < oc_e; ++oc) {
 125                     diff_bias[oc] += diff_dst[mb * OC + oc];
 126                 }
 127             }
 128
 129             if (rem_OC != 0 && ithr == nthr-1) {
 130                 for (int oc = OC_blocks * blksize; oc < OC; oc++)
 131                     diff_bias[oc] = diff_dst[oc];
 132                 for (int mb = 1; mb < MB; ++mb) {
 133                     for (int oc = OC_blocks * blksize; oc < OC; oc++) {
 134                         diff_bias[oc] += diff_dst[mb * OC + oc];
 135                     }
 136                 }
 137             }
 138         }
 139     }
 140 }
 141
 142 template <cpu_isa_t isa>
 143 jit_uni_inner_product_bwd_data_t<isa>::jit_uni_inner_product_bwd_data_t(const pd_t *pd,
 144         const input_vector &inputs, const output_vector &outputs)
 145     : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd)
 146 {
 147     sgemm_ = new jit_uni_gemm_f32('N', 'N', 0.0, false);
 148 }
 149
 150 template <cpu_isa_t isa>
 151 jit_uni_inner_product_bwd_data_t<isa>::~jit_uni_inner_product_bwd_data_t()
 152 {
 153     delete sgemm_;
 154 }
 155
 156 template <cpu_isa_t isa>
 157 void jit_uni_inner_product_bwd_data_t<isa>::execute_backward_data()
 158 {
 159     auto diff_dst = reinterpret_cast<const data_t *>(this->input_memory(0));
 160     auto weights = reinterpret_cast<const data_t *>(this->input_memory(1));
 161     auto diff_src = reinterpret_cast<data_t*>(this->memory());
 162
 163     // TODO: consistency checks
 164     int MB = conf_.MB();
 165     int OC = conf_.OC();
 166     int IC = conf_.IC_total_padded();
 167
 168     float alpha = 1.0, beta = 0.0;
 169
 170     sgemm_->sgemm("N", "N", &IC, &MB, &OC, &alpha, weights, &IC, diff_dst, &OC, &beta,
 171             diff_src, &IC, nullptr);
 172 }
 173
 174 template struct jit_uni_inner_product_bwd_data_t<avx512_common>;
 175 template struct jit_uni_inner_product_bwd_weights_t<avx512_common>;
 176 template struct jit_uni_inner_product_fwd_t<avx512_common>;
 177 template struct jit_uni_inner_product_bwd_data_t<avx2>;
 178 template struct jit_uni_inner_product_bwd_weights_t<avx2>;
 179 template struct jit_uni_inner_product_fwd_t<avx2>;
 180
 181 }
 182 }
 183 }
 184
 185 // vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s