1 /*******************************************************************************
2 * Copyright 2018 Intel Corporation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
21 #include "math_utils.hpp"
22 #include "mkldnn_thread.hpp"
24 #include "../simple_q10n.hpp"
25 #include "ref_rnn.hpp"
31 using namespace mkldnn::impl::utils;
32 using namespace mkldnn::impl::math;
33 using namespace rnn_utils;
// Forward f32 LSTM element-wise step.
// For every (i, j) with i < rnn.mb and j < rnn.dic this adds the bias to the
// four gate entries in ws_gates, applies the activation in place
// (logistic_fwd for gates 0, 1 and 3, tanh_fwd for gate 2), and then writes
//   c_states_t_l(i, j) = gate1 * c_states_tm1_l(i, j) + gate0 * gate2
//   states_t_l(i, j)   = gate3 * tanh_fwd(c_states_t_l(i, j))
// NOTE(review): gates 0..3 presumably correspond to input, forget, candidate
// and output gates (inferred from the formula above) -- confirm.
36 rnn_elemwise_sig(ref_rnn_fwd_f32_t::lstm_elemwise) {
// Accessor objects built from rnn and the raw arguments; indexed below as
// (i, gate, j) for gates, (gate, j) for bias and (i, j) for states.
37 ws_gates_aoc_t ws_gates(rnn, ws_gates_);
38 bias_aoc_t bias(rnn, bias_);
39 ws_states_aoc_t states_t_l(rnn, states_t_l_);
40 ws_states_aoc_t c_states_t_l(rnn, c_states_t_l_);
41 ws_states_aoc_t c_states_tm1_l(rnn, c_states_tm1_l_);
// The lambda handles one index i; parallel_nd presumably spreads the
// rnn.mb calls across threads -- confirm against its definition.
43 parallel_nd(rnn.mb, [&](int i) {
44 // Workaround: loss of correctness in case of simd loop unrolling with icc 18
45 #if !defined(__INTEL_COMPILER)
48 for (int j = 0; j < rnn.dic; j++) {
// Activated gate values are stored back into ws_gates.
49 ws_gates(i, 0, j) = logistic_fwd(ws_gates(i, 0, j) + bias(0, j));
50 ws_gates(i, 1, j) = logistic_fwd(ws_gates(i, 1, j) + bias(1, j));
51 ws_gates(i, 2, j) = tanh_fwd(ws_gates(i, 2, j) + bias(2, j));
52 ws_gates(i, 3, j) = logistic_fwd(ws_gates(i, 3, j) + bias(3, j));
// New cell state: gate 1 scales the previous cell state, gate 0 scales gate 2.
54 float tmp = ws_gates(i, 1, j) * c_states_tm1_l(i, j)
55 + ws_gates(i, 0, j) * ws_gates(i, 2, j);
// Output state is gate 3 times tanh of the new cell state.
56 states_t_l(i, j) = ws_gates(i, 3, j) * tanh_fwd(tmp);
57 c_states_t_l(i, j) = tmp;
// Forward LSTM element-wise step for the u8s8 configuration.
// Gate accumulators are read through an s32 accessor, converted to float by
// deq_w, biased and activated into locals G0..G3. The cell state
// c_states_t_l is written as float, while the output state states_t_l is
// written through q_d (scale, shift, then qz_a1b0 to src_data_t).
// ws_gates_s32 is only read in the lines below; activated gates are kept in
// locals rather than stored back.
63 rnn_elemwise_sig(ref_rnn_fwd_u8s8_t::lstm_elemwise) {
64 ws_gates_aoc_s32_t ws_gates_s32(rnn, ws_gates_);
65 bias_aoc_t bias(rnn, bias_);
66 ws_states_aoc_u8_t states_t_l(rnn, states_t_l_);
67 ws_states_aoc_t c_states_t_l(rnn, c_states_t_l_);
68 ws_states_aoc_t c_states_tm1_l(rnn, c_states_tm1_l_);
// Quantization parameters and rounding mode taken from the primitive
// descriptor attributes.
70 float *weights_scales = pd()->attr()->rnn_weights_qparams_.scales_;
71 float data_shift = pd()->attr()->rnn_data_qparams_.shift_;
72 float data_scale = pd()->attr()->rnn_data_qparams_.scale_;
73 round_mode_t rmode = pd()->attr()->round_mode_;
// q_d: maps a float to src_data_t as f * data_scale + data_shift, then
// converts with qz_a1b0 using rmode.
75 auto q_d = [&](float f) {
76 float qf = f * data_scale + data_shift;
77 return qz_a1b0<float, src_data_t>()(qf, rmode);
// deq_w: converts an accumulator value s to float. When mask_ == 0 the single
// scale weights_scales[0] is used; otherwise the scale is looked up at
// gate * rnn.dic + j. The value is multiplied by the reciprocal of the
// combined scale.
80 auto deq_w = [&](acc_data_t s, int gate, int j) {
81 return pd()->attr()->rnn_weights_qparams_.mask_ == 0 ?
82 saturate<float>(s) * (1.f / (weights_scales[0] * data_scale)) :
83 saturate<float>(s) * (1.f / (weights_scales[gate * rnn.dic + j]
// The lambda handles one index i; parallel_nd presumably spreads the
// rnn.mb calls across threads -- confirm against its definition.
87 parallel_nd(rnn.mb, [&](int i) {
89 for (int j = 0; j < rnn.dic; j++) {
// Gates 0, 1 and 3 use logistic_fwd; gate 2 uses tanh_fwd.
90 float G0 = logistic_fwd<float>(
91 deq_w(ws_gates_s32(i, 0, j), 0, j) + bias(0, j));
92 float G1 = logistic_fwd<float>(
93 deq_w(ws_gates_s32(i, 1, j), 1, j) + bias(1, j));
94 float G2 = tanh_fwd<float>(
95 deq_w(ws_gates_s32(i, 2, j), 2, j) + bias(2, j));
96 float G3 = logistic_fwd<float>(
97 deq_w(ws_gates_s32(i, 3, j), 3, j) + bias(3, j));
// New cell state, kept in float.
98 float tmp = G1 * c_states_tm1_l(i, j) + G0 * G2;
// Output state is quantized by q_d before being stored.
99 states_t_l(i, j) = q_d(G3 * tanh_fwd(tmp));
100 c_states_t_l(i, j) = tmp;
// Backward f32 LSTM element-wise step.
// For every (i, j) with i < rnn.mb and j < rnn.dic this combines the incoming
// state diffs, computes the diff of the cell state and of each gate from the
// gate values currently held in ws_gates, stores the cell-state diff for the
// previous step in diff_states_t_l(1, i, j), and finally overwrites ws_gates
// with the gate diffs.
106 rnn_elemwise_sig(ref_rnn_bwd_f32_t::lstm_elemwise) {
107 ws_gates_aoc_t ws_gates(rnn, ws_gates_);
108 bias_aoc_t bias(rnn, bias_);
109 ws_states_aoc_t c_states_t_l(rnn, c_states_t_l_);
110 ws_states_aoc_t c_states_tm1_l(rnn, c_states_tm1_l_);
111 ws_diff_states_aoc_t diff_states_t_l(rnn, diff_states_t_l_);
112 ws_diff_states_aoc_t diff_states_tp1_l(rnn, diff_states_tp1_l_);
113 ws_diff_states_aoc_t diff_states_t_lp1(rnn, diff_states_t_lp1_);
// The lambda handles one index i; parallel_nd presumably spreads the
// rnn.mb calls across threads -- confirm against its definition.
115 parallel_nd(rnn.mb, [&](int i) {
117 for (int j = 0; j < rnn.dic; j++) {
118 float Ct = c_states_t_l(i, j);
119 /// @todo save it in the workspace in fwd pass or recompute it to
121 float tanhCt = tanh_fwd(Ct);
122 // we have 2 incoming diffs on Ht
// One diff is read from diff_states_tp1_l at index 0, the other from
// diff_states_t_lp1 at index rnn.n_states.
123 float dHt = diff_states_tp1_l(0, i, j)
124 + diff_states_t_lp1(rnn.n_states, i, j);
// Cell-state diff: the incoming diff at index 1 plus the contribution that
// flows back from dHt through gate 3 and tanh.
// NOTE(review): one_m_square(x) is presumably 1 - x^2 and x_m_square(x)
// presumably x - x^2 (activation derivatives expressed via outputs) -- confirm.
125 float dCt = diff_states_tp1_l(1, i, j)
126 + one_m_square(tanhCt) * ws_gates(i, 3, j) * dHt;
// Gate diffs; each reads the gate values still stored in ws_gates.
128 float dG1 = c_states_tm1_l(i, j) * dCt
129 * x_m_square(ws_gates(i, 1, j));
130 float dG0 = ws_gates(i, 2, j) * dCt * x_m_square(ws_gates(i, 0, j));
131 float dG3 = tanhCt * dHt * x_m_square(ws_gates(i, 3, j));
133 = ws_gates(i, 0, j) * dCt * one_m_square(ws_gates(i, 2, j));
// Diff passed to the previous cell state: dCt scaled by gate 1. This must
// read ws_gates(i, 1, j) before it is overwritten below.
135 diff_states_t_l(1, i, j) = dCt * ws_gates(i, 1, j);
// Overwrite the gate values with their diffs only after every read above.
137 ws_gates(i, 0, j) = dG0;
138 ws_gates(i, 1, j) = dG1;
139 ws_gates(i, 2, j) = dG2;
140 ws_gates(i, 3, j) = dG3;