1 /*******************************************************************************
2 * Copyright 2016-2018 Intel Corporation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
20 #include "c_types_map.hpp"
21 #include "mkldnn_thread.hpp"
22 #include "type_helpers.hpp"
24 #include "ref_lrn.hpp"
/**
 * Returns omega raised to the power -beta, in single precision.
 *
 * The exponent beta == 0.75f is special-cased so that it needs only two
 * square roots and one division instead of a general powf() call:
 *
 *   omega^(-3/4)
 *     = 1.0f / sqrtf(omega) * sqrtf(1.0f / sqrtf(omega))
 *     = sqrtf(1.0f / sqrtf(omega)) * 1.0f / sqrtf(omega)
 *     = sqrtf(1.0f / sqrtf(omega)) / sqrtf(omega)
 *     = sqrtf(1.0f / sqrtf(omega) / omega)
 *     = sqrtf(1.0f / (sqrtf(omega) * omega))
 *
 * Any other beta goes through the generic 1.0f / powf(omega, beta) form.
 *
 * @param omega base of the power; no range checking is performed here.
 * @param beta  exponent magnitude (the result uses -beta).
 * @return omega^(-beta).
 */
static inline float fast_negative_powf(float omega, float beta) {
    // Exact comparison on purpose: only the literal 0.75f selects the
    // square-root formulation derived above.
    if (beta == 0.75f)
        return sqrtf(1.0f / (sqrtf(omega) * omega));

    return 1.0f / powf(omega, beta);
}
// Reference (non-optimized) forward LRN, instantiated once per memory format
// `fmt`. For every logical position (mb, c, h, w) the kernel below writes
//     dst = src * (k + alpha * sum / summands)^(-beta)
// and stores the value k + alpha * sum / summands into the workspace buffer.
// NOTE(review): the statement that accumulates `sum` is not visible here;
// presumably it adds the squares of the window elements - confirm.
48 template <impl::data_type_t data_type>
49 template <mkldnn_memory_format_t fmt>
50 void ref_lrn_fwd_t<data_type>::execute_forward() const {
51 using namespace alg_kind;
52 using namespace memory_format;
// Raw buffers: input 0 is the source, output 0 the destination and
// output 1 the workspace.
54 auto src = reinterpret_cast<const data_t *>(this->input_memory(0));
55 auto dst = reinterpret_cast<data_t*>(this->memory(0));
56 auto ws = reinterpret_cast<data_t*>(this->memory(1));
58 const memory_desc_wrapper data_d(pd()->src_pd());
59 const memory_desc_wrapper ws_d(pd()->workspace_pd());
// Problem sizes and the minibatch stride taken from the source descriptor.
62 const int C = pd()->C();
63 const int H = pd()->H();
64 const int W = pd()->W();
65 const size_t stride_mb = data_d.blocking_desc().strides[0][0];
66 const bool across_channels = pd()->desc()->alg_kind == lrn_across_channels;
// Channel block size: 16 for nChw16c, 8 for every other instantiation
// (only meaningful for the blocked formats).
67 constexpr int blksize = fmt == nChw16c ? 16 : 8;
// Maps a logical (mb, c, h, w) coordinate to a linear element offset.
// The blocked, nchw and nhwc layouts use closed-form expressions; any other
// format falls back to the memory descriptor's own off().
69 auto data_off = [&](int mb, int c, int h, int w) -> size_t {
// Blocked layout: c / blksize selects the channel block, c % blksize the
// position inside the block.
72 case nChw8c: return mb * stride_mb + c / blksize * H * W * blksize
73 + h * W * blksize + w * blksize + c % blksize;
74 case nchw: return mb * stride_mb + c * H * W + h * W + w;
75 case nhwc: return mb * stride_mb + h * W * C + w * C + c;
76 default: return data_d.off(mb, c, h, w);
// Computes one output element for channel oc at (oh, ow) of image mb and
// stores it through d.
80 auto ker = [=](data_t *d, int mb, int oc, int oh, int ow) {
81 const float alpha = static_cast<float>(pd()->desc()->lrn_alpha);
82 const float beta = static_cast<float>(pd()->desc()->lrn_beta);
83 const float k = static_cast<float>(pd()->desc()->lrn_k);
// The window extends half_size elements to each side of the centre.
85 const int size = pd()->desc()->local_size;
86 const int half_size = (size - 1) / 2;
89 if (across_channels) {
// Channel window [oc - half_size, oc + half_size], clamped to [0, C).
90 const int c_st = nstl::max(oc - half_size + 0, 0);
91 const int c_en = nstl::min(oc + half_size + 1, C);
93 for (int c = c_st; c < c_en; ++c) {
94 const float s = src[data_off(mb, c, oh, ow)];
// Otherwise: spatial window within channel oc, clamped to
// [0, H) x [0, W).
98 int h_st = nstl::max(oh - half_size + 0, 0);
99 int h_en = nstl::min(oh + half_size + 1, H);
100 int w_st = nstl::max(ow - half_size + 0, 0);
101 int w_en = nstl::min(ow + half_size + 1, W);
102 for (int h = h_st; h < h_en; ++h) {
103 for (int w = w_st; w < w_en; ++w) {
104 const float s = src[data_off(mb, oc, h, w)];
// The divisor is the nominal window size (size or size * size), not the
// number of elements actually visited when the window is clamped at a border.
109 const int summands = across_channels ? size : size * size;
110 sum = k + alpha * sum / summands;
111 size_t off = data_off(mb, oc, oh, ow);
// Store the scaled sum in the workspace, then the normalized value in d.
113 ws[off] = static_cast<data_t>(sum);
114 d[0] = static_cast<data_t>(src[off] * fast_negative_powf(sum, beta));
// Dispatch over the output elements; the loop nest order follows the layout.
117 const int MB = pd()->MB();
118 if (fmt == nChw16c || fmt == nChw8c) {
// Blocked formats: parallelize over channel blocks; each task handles the
// channels of one block at one (h, w).
119 parallel_nd(MB, utils::div_up(C, blksize), H, W,
120 [&](int mb, int c_blk, int h, int w) {
121 int c = c_blk * blksize;
// Offset of the first channel of this block at (h, w).
122 const size_t off = mb * stride_mb + c * H * W
123 + (h * W + w) * blksize;
// min(blksize, C - c) limits the last block to the channels that exist
// when C is not a multiple of blksize.
125 for (int cc = 0; cc < nstl::min(blksize, C - c); ++cc)
126 ker(&dst[off + cc], mb, c + cc, h, w);
128 } else if (fmt == nhwc) {
// nhwc: channels vary fastest, so C is the innermost parallel dimension.
129 parallel_nd(MB, H, W, C,
130 [&](int mb, int h, int w, int c) {
131 const size_t off = mb * stride_mb + h * W * C + w * C + c;
132 ker(&dst[off], mb, c, h, w);
// Remaining formats: iterate in (MB, C, H, W) order and let data_off
// resolve the offset.
135 parallel_nd(MB, C, H, W,
136 [&](int mb, int c, int h, int w) {
137 const size_t off = data_off(mb, c, h, w);
138 ker(&dst[off], mb, c, h, w);
// Reference (non-optimized) backward LRN, instantiated once per memory format
// `fmt`. For every logical position (mb, c, h, w) the kernel below writes
//     diff_src = A - B
// where A = omega_mid^(-beta) * diff_dst at the same position and B is a sum
// over the channel window scaled by 2 * alpha * beta / kernel_size.
// NOTE(review): the visible kernel only walks the channel dimension; confirm
// that only the across-channels algorithm reaches this implementation.
143 template <impl::data_type_t data_type>
144 template <mkldnn_memory_format_t fmt>
145 void ref_lrn_bwd_t<data_type>::execute_backward() const {
146 using namespace alg_kind;
147 using namespace memory_format;
// Raw buffers: input 0 is the forward source, input 1 the incoming
// gradient, output 0 the gradient written by this function.
149 auto src = reinterpret_cast<const data_t *>(this->input_memory(0));
150 auto diff_dst = reinterpret_cast<const data_t *>(this->input_memory(1));
151 auto diff_src = reinterpret_cast<data_t*>(this->memory(0));
153 const memory_desc_wrapper data_d(pd()->src_pd());
154 const memory_desc_wrapper diff_data_d(pd()->diff_dst_pd());
// diff_data_d is not used below; the macro marks it as intentionally unused.
155 MAYBE_UNUSED(diff_data_d);
// Problem sizes and the minibatch stride taken from the source descriptor.
157 const int MB = pd()->MB();
158 const int C = pd()->C();
159 const int H = pd()->H();
160 const int W = pd()->W();
161 const size_t stride_mb = data_d.blocking_desc().strides[0][0];
// Channel block size: 16 for nChw16c, 8 for every other instantiation
// (only meaningful for the blocked formats).
162 constexpr int blksize = fmt == nChw16c ? 16 : 8;
// LRN parameters, read once for all kernel invocations.
164 const float alpha = static_cast<float>(pd()->desc()->lrn_alpha);
165 const float beta = static_cast<float>(pd()->desc()->lrn_beta);
166 const float k = static_cast<float>(pd()->desc()->lrn_k);
167 const int kernel_size = pd()->desc()->local_size;
168 const int half_ksize = (kernel_size - 1) / 2;
// Maps a logical (mb, c, h, w) coordinate to a linear element offset.
// The same offsets are used for src, diff_dst and diff_src. Any format
// without a closed-form case falls back to the source descriptor's off().
170 auto data_off = [&](int mb, int c, int h, int w) -> size_t {
// Blocked layout: c/blksize selects the channel block, c%blksize the
// position inside the block.
173 case nChw8c: return mb * stride_mb + c/blksize * H * W * blksize
174 + h * W * blksize + w * blksize + c%blksize;
175 case nchw: return mb * stride_mb + c * H * W + h * W + w;
176 case nhwc: return mb * stride_mb + h * W * C + w * C + c;
177 default: return data_d.off(mb, c, h, w);
// Computes one gradient element for channel oc at (oh, ow) of image mb and
// stores it through d.
181 auto ker = [=](data_t *d, int mb, int oc, int oh, int ow) {
// Channels c whose window may contain oc: [oc - half_ksize, oc + half_ksize],
// clamped to [0, C).
182 const int c_st = nstl::max(oc - half_ksize + 0, 0);
183 const int c_en = nstl::min(oc + half_ksize + 1, C);
185 float A = 0, B = 0, omega_mid = 0;
186 for (int c = c_st; c < c_en; c++) {
// Recompute the normalization term omega for channel c from the squared
// source values in its own window. For odd kernel_size the upper bound
// c + kernel_size - half_ksize equals c + half_ksize + 1.
188 const int i_st = nstl::max(c - half_ksize, 0);
189 const int i_en = nstl::min(c + kernel_size - half_ksize, C);
191 for (int i = i_st; i < i_en; ++i) {
192 const float value = src[data_off(mb, i, oh, ow)];
193 sum += value * value;
195 const float omega = static_cast<float>(k + sum * alpha / kernel_size);
// Remember omega of the centre channel; it is needed for term A.
196 if (c == oc) omega_mid = omega;
// t = src * omega^(-beta) at channel c; accumulate t * diff_dst / omega
// into B.
197 float t = src[data_off(mb, c, oh, ow)]
198 * fast_negative_powf(omega, beta);
199 B += 1.0f / omega * t * diff_dst[data_off(mb, c, oh, ow)];
202 const size_t off = data_off(mb, oc, oh, ow);
203 A = fast_negative_powf(omega_mid, beta) * diff_dst[off];
// Scale the accumulated window term and combine it with A.
205 B *= (2.0f * alpha * beta) / kernel_size;
206 *d = static_cast<data_t>(A - B); // final cast down to data_t
// Dispatch over the output elements; the loop nest order follows the layout.
209 if (fmt == nChw16c || fmt == nChw8c) {
// Blocked formats: parallelize over channel blocks; each task handles the
// channels of one block at one (h, w).
210 parallel_nd(MB, utils::div_up(C, blksize), H, W,
211 [&](int mb, int c_blk, int h, int w) {
212 int c = c_blk * blksize;
// Offset of the first channel of this block at (h, w).
213 const size_t off = mb * stride_mb + c * H * W +
214 (h * W + w) * blksize;
// min(blksize, C - c) limits the last block to the channels that exist
// when C is not a multiple of blksize.
216 for (int cc = 0; cc < nstl::min(blksize, C - c); ++cc)
217 ker(&diff_src[off + cc], mb, c + cc, h, w);
219 } else if (fmt == nhwc) {
// nhwc: channels vary fastest, so C is the innermost parallel dimension.
220 parallel_nd(MB, H, W, C,
221 [&](int mb, int h, int w, int c) {
222 const size_t off = mb * stride_mb + h * W * C + w * C + c;
223 ker(&diff_src[off], mb, c, h, w);
// Remaining formats: iterate in (MB, C, H, W) order and let data_off
// resolve the offset.
226 parallel_nd(MB, C, H, W,
227 [&](int mb, int c, int h, int w) {
228 const size_t off = data_off(mb, c, h, w);
229 ker(&diff_src[off], mb, c, h, w);
// Explicit instantiations of the forward and backward reference kernels for
// f32 data. Each is instantiated for the nChw16c, nChw8c, nchw and nhwc
// formats, plus memory_format::any, which takes the `default:` branch of
// data_off and the generic (MB, C, H, W) dispatch loop.
234 template void ref_lrn_fwd_t<data_type::f32>::execute_forward<memory_format::nChw16c>() const;
235 template void ref_lrn_fwd_t<data_type::f32>::execute_forward<memory_format::nChw8c>() const;
236 template void ref_lrn_fwd_t<data_type::f32>::execute_forward<memory_format::nchw>() const;
237 template void ref_lrn_fwd_t<data_type::f32>::execute_forward<memory_format::nhwc>() const;
238 template void ref_lrn_fwd_t<data_type::f32>::execute_forward<memory_format::any>() const;
239 template void ref_lrn_bwd_t<data_type::f32>::execute_backward<memory_format::nChw16c>() const;
240 template void ref_lrn_bwd_t<data_type::f32>::execute_backward<memory_format::nChw8c>() const;
241 template void ref_lrn_bwd_t<data_type::f32>::execute_backward<memory_format::nchw>() const;
242 template void ref_lrn_bwd_t<data_type::f32>::execute_backward<memory_format::nhwc>() const;
243 template void ref_lrn_bwd_t<data_type::f32>::execute_backward<memory_format::any>() const;
249 // vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s