/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "c_types_map.hpp"
#include "type_helpers.hpp"
#include "mkldnn_thread.hpp"

#include "gemm/jit_avx2_gemm_f32.hpp"
#include "gemm/jit_avx512_common_gemm_f32.hpp"
#include "jit_uni_inner_product.hpp"
29 using namespace mkldnn::impl::status;
30 using namespace mkldnn::impl::prop_kind;
31 using namespace mkldnn::impl::data_type;
32 using namespace mkldnn::impl::memory_format;
33 using namespace mkldnn::impl::primitive_kind;
35 template <cpu_isa_t isa>
36 jit_uni_inner_product_fwd_t<isa>::jit_uni_inner_product_fwd_t(const pd_t *pd,
37 const input_vector &inputs, const output_vector &outputs)
38 : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd)
40 sgemm_ = new jit_uni_gemm_f32('T', 'N', 0.0, conf_.with_bias());
43 template <cpu_isa_t isa>
44 jit_uni_inner_product_fwd_t<isa>::~jit_uni_inner_product_fwd_t()
49 template <cpu_isa_t isa>
50 void jit_uni_inner_product_fwd_t<isa>::execute_forward()
52 auto src = reinterpret_cast<const data_t *>(this->input_memory(0));
53 auto weights = reinterpret_cast<const data_t *>(this->input_memory(1));
54 auto bias = reinterpret_cast<const data_t *>(this->input_memory(2));
55 auto dst = reinterpret_cast<data_t *>(this->memory());
57 // TODO: consistency checks
60 int IC = conf_.IC_total_padded();
62 float alpha = 1.0, beta = 0.0;
63 sgemm_->sgemm("T", "N", &OC, &MB, &IC, &alpha, weights, &IC, src, &IC, &beta,
67 template <cpu_isa_t isa>
68 jit_uni_inner_product_bwd_weights_t<isa>::jit_uni_inner_product_bwd_weights_t(const pd_t *pd,
69 const input_vector &inputs, const output_vector &outputs)
70 : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd)
72 sgemm_ = new jit_uni_gemm_f32('N', 'T', 0.0, false);
75 template <cpu_isa_t isa>
76 jit_uni_inner_product_bwd_weights_t<isa>::~jit_uni_inner_product_bwd_weights_t()
81 template <cpu_isa_t isa>
82 void jit_uni_inner_product_bwd_weights_t<isa>::execute_backward_weights()
84 auto src = reinterpret_cast<const data_t *>(this->input_memory(0));
85 auto diff_dst = reinterpret_cast<const data_t *>(this->input_memory(1));
86 auto diff_weights = reinterpret_cast<data_t *>(this->memory(0));
87 auto diff_bias = reinterpret_cast<data_t *>(this->memory(1));
89 const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd());
90 const memory_desc_wrapper diff_bias_d(conf_.diff_weights_pd(1));
92 diff_dst += diff_dst_d.blocking_desc().offset_padding;
94 // TODO: consistency checks
97 int IC = conf_.IC_total_padded();
99 float alpha = 1.0, beta = 0.0;
100 sgemm_->sgemm("N", "T", &IC, &OC, &MB, &alpha, src, &IC, diff_dst, &OC, &beta,
101 diff_weights, &IC, nullptr);
104 diff_bias += diff_bias_d.blocking_desc().offset_padding;
105 constexpr int blksize = 8;
106 int OC_blocks = OC / blksize;
107 int rem_OC = OC % blksize;
108 # pragma omp parallel
110 const int ithr = omp_get_thread_num();
111 const int nthr = omp_get_num_threads();
112 int oc_st{0}, oc_e{0};
113 balance211(OC_blocks, nthr, ithr, oc_st, oc_e);
114 oc_st = oc_st * blksize;
115 oc_e = oc_e * blksize;
118 for (int oc = oc_st; oc < oc_e; ++oc) {
119 diff_bias[oc] = diff_dst[oc];
122 for (int mb = 1; mb < MB; ++mb) {
124 for (int oc = oc_st; oc < oc_e; ++oc) {
125 diff_bias[oc] += diff_dst[mb * OC + oc];
129 if (rem_OC != 0 && ithr == nthr-1) {
130 for (int oc = OC_blocks * blksize; oc < OC; oc++)
131 diff_bias[oc] = diff_dst[oc];
132 for (int mb = 1; mb < MB; ++mb) {
133 for (int oc = OC_blocks * blksize; oc < OC; oc++) {
134 diff_bias[oc] += diff_dst[mb * OC + oc];
142 template <cpu_isa_t isa>
143 jit_uni_inner_product_bwd_data_t<isa>::jit_uni_inner_product_bwd_data_t(const pd_t *pd,
144 const input_vector &inputs, const output_vector &outputs)
145 : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd)
147 sgemm_ = new jit_uni_gemm_f32('N', 'N', 0.0, false);
150 template <cpu_isa_t isa>
151 jit_uni_inner_product_bwd_data_t<isa>::~jit_uni_inner_product_bwd_data_t()
156 template <cpu_isa_t isa>
157 void jit_uni_inner_product_bwd_data_t<isa>::execute_backward_data()
159 auto diff_dst = reinterpret_cast<const data_t *>(this->input_memory(0));
160 auto weights = reinterpret_cast<const data_t *>(this->input_memory(1));
161 auto diff_src = reinterpret_cast<data_t*>(this->memory());
163 // TODO: consistency checks
166 int IC = conf_.IC_total_padded();
168 float alpha = 1.0, beta = 0.0;
170 sgemm_->sgemm("N", "N", &IC, &MB, &OC, &alpha, weights, &IC, diff_dst, &OC, &beta,
171 diff_src, &IC, nullptr);
174 template struct jit_uni_inner_product_bwd_data_t<avx512_common>;
175 template struct jit_uni_inner_product_bwd_weights_t<avx512_common>;
176 template struct jit_uni_inner_product_fwd_t<avx512_common>;
177 template struct jit_uni_inner_product_bwd_data_t<avx2>;
178 template struct jit_uni_inner_product_bwd_weights_t<avx2>;
179 template struct jit_uni_inner_product_fwd_t<avx2>;
// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s