Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / src / extension / ext_normalize.cpp
1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 //
4
5 #include "ext_list.hpp"
6 #include "ext_base.hpp"
7
8 #include <algorithm>
9 #include <string>
10 #include <vector>
11 #include <map>
12 #include <cmath>
13 #if defined(HAVE_SSE) || defined(HAVE_AVX2)
14 #include <immintrin.h>
15 #endif
16
17 namespace InferenceEngine {
18 namespace Extensions {
19 namespace Cpu {
20
21 class NormalizeImpl: public ExtLayerBase {
22 public:
23     explicit NormalizeImpl(const CNNLayer* layer) {
24         try {
25             if (layer->insData.size() != 1 || layer->outData.size() != 1)
26                 THROW_IE_EXCEPTION << "Incorrect number of input/output edges!";
27
28             if (layer->insData[0].lock()->dims.size() < 2 || layer->insData[0].lock()->dims.size() > 4)
29                 THROW_IE_EXCEPTION << "Normalize supports from 2D to 4D blobs!";
30
31             weights = std::dynamic_pointer_cast<TBlob<float>>(layer->blobs.at("weights"));
32             if (!weights)
33                 THROW_IE_EXCEPTION << layer->name << " weights is empty!";
34             across_spatial = layer->GetParamAsBool("across_spatial", false);
35             channel_shared = layer->GetParamAsBool("channel_shared", false);
36             eps = layer->GetParamAsFloat("eps");
37
38             addConfig(layer, {{ConfLayout::PLN, false, 0}}, {{ConfLayout::PLN, false, 0}}, true);
39         } catch (InferenceEngine::details::InferenceEngineException &ex) {
40             errorMsg = ex.what();
41         }
42     }
43
44 #if defined(HAVE_SSE) || defined(HAVE_AVX2)
45     float hsum_sse(__m128 v) {
46         __m128 shuf = _mm_movehdup_ps(v);
47         __m128 sum = _mm_add_ps(v, shuf);
48         shuf = _mm_movehl_ps(shuf, sum);
49         sum = _mm_add_ss(sum, shuf);
50
51         return _mm_cvtss_f32(sum);
52     }
53
54 #if defined(HAVE_AVX2)
55     float hsum_avx2(__m256 v) {
56         __m128 vlow = _mm256_castps256_ps128(v);
57         __m128 vhigh = _mm256_extractf128_ps(v, 1);
58
59         __m128 sum = _mm_add_ps(vlow, vhigh);
60
61         return hsum_sse(sum);
62     }
63 #endif
64 #endif
65
66     StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs,
67                        ResponseDesc *resp) noexcept override {
68         if (inputs.size() != 1 || outputs.empty()) {
69             if (resp) {
70                 std::string errorMsg = "Incorrect number of input or output edges!";
71                 errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
72             }
73             return GENERAL_ERROR;
74         }
75         const float* src = inputs[0]->buffer();
76         const float* scl = weights->buffer();
77         float* dst = outputs[0]->buffer();
78
79         SizeVector dims = inputs[0]->getTensorDesc().getDims();
80
81         const int N = static_cast<const int>(dims[0]);
82         const int C = static_cast<int>(dims[1]);
83         const int H = static_cast<int>(dims.size() > 2 ? dims[2] : 1);
84         const int W = static_cast<int>(dims.size() > 3 ? dims[3] : 1);
85
86         for (int n = 0; n < N; n++) {
87             const float* psrc = src + n*C*H*W;
88             float* pdst = dst + n*C*H*W;
89
90             if (across_spatial) {
91                 float norm = eps;
92                 int i = 0;
93 #if defined(HAVE_AVX2)
94                 {
95                     __m256 vsum = _mm256_setzero_ps();
96                     for (; i <= C*H*W-8; i += 8) {
97                         __m256 vsrc = _mm256_loadu_ps(psrc + i);
98                         vsum = _mm256_fmadd_ps(vsrc, vsrc, vsum);
99                     }
100                     norm += hsum_avx2(vsum);
101                 }
102 #elif defined(HAVE_SSE)
103                 {
104                     __m128 vsum = _mm_setzero_ps();
105                     for (; i <= C*H*W-4; i += 4) {
106                         __m128 vsrc = _mm_loadu_ps(psrc + i);
107                         vsum = _mm_add_ps(_mm_mul_ps(vsrc, vsrc), vsum);
108                     }
109                     norm += hsum_sse(vsum);
110                 }
111 #endif
112                 for (; i < C*H*W; i++) {
113                     norm += psrc[i]*psrc[i];
114                 }
115                 norm = 1.0f / std::sqrt(norm);
116
117                 for (int c = 0 ; c < C; c++) {
118                     int hw = 0;
119 #if defined(HAVE_AVX2)
120                     __m256 vnorm_avx = _mm256_set1_ps(norm);
121                     __m256 vscl_avx = _mm256_set1_ps(channel_shared ? scl[0] : scl[c]);
122                     vnorm_avx = _mm256_mul_ps(vnorm_avx, vscl_avx);
123
124                     for ( ; hw <= H*W - 8; hw += 8) {
125                         __m256 vsrc = _mm256_loadu_ps(psrc + c*H*W + hw);
126                         _mm256_storeu_ps(pdst + c*H*W+hw, _mm256_mul_ps(vsrc, vnorm_avx));
127                     }
128 #elif defined(HAVE_SSE)
129                     __m128 vnorm_sse = _mm_set1_ps(norm);
130                     __m128 vscl_sse = _mm_set1_ps(channel_shared ? scl[0] : scl[c]);
131                     vnorm_sse = _mm_mul_ps(vnorm_sse, vscl_sse);
132
133                     for ( ; hw <= H*W - 4; hw += 4) {
134                         __m128 vsrc = _mm_loadu_ps(psrc + c*H*W + hw);
135                         _mm_storeu_ps(pdst + c*H*W+hw, _mm_mul_ps(vsrc, vnorm_sse));
136                     }
137 #endif
138                     for ( ; hw < H*W; hw++) {
139                         float s = channel_shared ? scl[0] : scl[c];
140                         pdst[c*H*W+hw] = psrc[c*H*W+hw] * norm * s;
141                     }
142                 }
143             } else {
144                 int wh = 0;
145 #if defined(HAVE_AVX2)
146                 for (; wh <= W*H - 8; wh += 8) {
147                     __m256 vnorm = _mm256_set1_ps(eps);
148                     for (int c = 0; c < C; c++) {
149                         const float* psrc_c = psrc + c*W*H;
150                         __m256 vsrc = _mm256_loadu_ps(psrc_c + wh);
151                         vnorm = _mm256_fmadd_ps(vsrc, vsrc, vnorm);
152                     }
153                     vnorm = _mm256_div_ps(_mm256_set1_ps(1.0f), _mm256_sqrt_ps(vnorm));
154
155                     for (int c = 0; c < C; c++) {
156                         const float* psrc_c = psrc + c*W*H;
157                         float* pdst_c = pdst + c*W*H;
158
159                         __m256 vscl = _mm256_set1_ps(channel_shared ? scl[0] : scl[c]);
160
161                         __m256 vsrc = _mm256_loadu_ps(psrc_c + wh);
162                         __m256 vdst = _mm256_mul_ps(vsrc, vnorm);
163                         vdst = _mm256_mul_ps(vdst, vscl);
164
165                         _mm256_storeu_ps(pdst_c + wh, vdst);
166                     }
167                 }
168 #elif defined(HAVE_SSE)
169                 for (; wh <= W*H - 4; wh += 4) {
170                     __m128 vnorm = _mm_set1_ps(eps);
171                     for (int c = 0; c < C; c++) {
172                         const float* psrc_c = psrc + c*W*H;
173                         __m128 vsrc = _mm_loadu_ps(psrc_c + wh);
174
175                         vnorm = _mm_add_ps(_mm_mul_ps(vsrc, vsrc), vnorm);
176                     }
177
178                     vnorm = _mm_div_ps(_mm_set1_ps(1.0f), _mm_sqrt_ps(vnorm));
179
180                     for (int c = 0; c < C; c++) {
181                         const float* psrc_c = psrc + c*W*H;
182                               float* pdst_c = pdst + c*W*H;
183
184                         __m128 vscl = _mm_set1_ps(channel_shared ? scl[0] : scl[c]);
185
186                         __m128 vsrc = _mm_loadu_ps(psrc_c + wh);
187                         __m128 vdst = _mm_mul_ps(vsrc, vnorm);
188                         vdst = _mm_mul_ps(vdst, vscl);
189
190                         _mm_storeu_ps(pdst_c + wh, vdst);
191                     }
192                 }
193 #endif
194                 for (; wh < W*H; wh++) {
195                     float norm = eps;
196                     for (int c = 0; c < C; c++) {
197                         const float* psrc_c = psrc + c*W*H;
198                         norm += psrc_c[wh]*psrc_c[wh];
199                     }
200
201                     norm = 1.0f / std::sqrt(norm);
202
203                     for (int c = 0; c < C; c++) {
204                         const float* psrc_c = psrc + c*W*H;
205                         float* pdst_c = pdst + c*W*H;
206
207                         pdst_c[wh] = channel_shared ? (psrc_c[wh] * norm * scl[0]) : (psrc_c[wh] * norm * scl[c]);
208                     }
209                 }
210             }
211         }
212         return OK;
213     }
214
215 private:
216     TBlob<float>::Ptr weights;
217
218     bool across_spatial = true;
219     bool channel_shared = true;
220     float eps = 1e-10f;
221 };
222
// Registers NormalizeImpl as the CPU extension factory for the "Normalize" layer type.
REG_FACTORY_FOR(ImplFactory<NormalizeImpl>, Normalize);
224
225 }  // namespace Cpu
226 }  // namespace Extensions
227 }  // namespace InferenceEngine