// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#include "ext_list.hpp"
#include "ext_base.hpp"

#include <cmath>
#include <memory>
#include <string>
#include <vector>

#if defined(HAVE_SSE) || defined(HAVE_AVX2)
#include <immintrin.h>
#endif
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {

class NormalizeImpl: public ExtLayerBase {
public:
    explicit NormalizeImpl(const CNNLayer* layer) {
        try {
            if (layer->insData.size() != 1 || layer->outData.size() != 1)
                THROW_IE_EXCEPTION << "Incorrect number of input/output edges!";

            if (layer->insData[0].lock()->dims.size() < 2 || layer->insData[0].lock()->dims.size() > 4)
                THROW_IE_EXCEPTION << "Normalize supports from 2D to 4D blobs!";

            weights = std::dynamic_pointer_cast<TBlob<float>>(layer->blobs.at("weights"));
            if (weights == nullptr)
                THROW_IE_EXCEPTION << layer->name << " weights is empty!";

            across_spatial = layer->GetParamAsBool("across_spatial", false);
            channel_shared = layer->GetParamAsBool("channel_shared", false);
            eps = layer->GetParamAsFloat("eps");
            addConfig(layer, {{ConfLayout::PLN, false, 0}}, {{ConfLayout::PLN, false, 0}}, true);
        } catch (InferenceEngine::details::InferenceEngineException &ex) {
            errorMsg = ex.what();
        }
    }
#if defined(HAVE_SSE) || defined(HAVE_AVX2)
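    // Horizontal reduction: collapse the four f32 lanes of an SSE
    // register into a single scalar sum.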
    float hsum_sse(__m128 v) {
        __m128 shuf = _mm_movehdup_ps(v);  // duplicate odd-index lanes
        __m128 sum = _mm_add_ps(v, shuf);
        shuf = _mm_movehl_ps(shuf, sum);   // move the high pair down
        sum = _mm_add_ss(sum, shuf);

        return _mm_cvtss_f32(sum);
    }
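    // AVX variant: fold the upper 128-bit half onto the lower half,
    // then reuse the SSE reduction for the remaining four lanes.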
#if defined(HAVE_AVX2)
    float hsum_avx2(__m256 v) {
        __m128 vlow = _mm256_castps256_ps128(v);
        __m128 vhigh = _mm256_extractf128_ps(v, 1);

        __m128 sum = _mm_add_ps(vlow, vhigh);
        return hsum_sse(sum);
    }
#endif
#endif
    StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs,
                       ResponseDesc *resp) noexcept override {
        if (inputs.size() != 1 || outputs.empty()) {
            std::string errorMsg = "Incorrect number of input or output edges!";
            // std::string::copy does not null-terminate, so do it explicitly.
            size_t len = errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
            resp->msg[len] = '\0';
            return GENERAL_ERROR;
        }
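        // Planar f32 buffers (as requested via ConfLayout::PLN above); scl
        // holds one scale per channel, or a single value when channel_shared
        // is set.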
        const float* src = inputs[0]->buffer();
        const float* scl = weights->buffer();
        float* dst = outputs[0]->buffer();

        SizeVector dims = inputs[0]->getTensorDesc().getDims();

        const int N = static_cast<int>(dims[0]);
        const int C = static_cast<int>(dims[1]);
        const int H = static_cast<int>(dims.size() > 2 ? dims[2] : 1);
        const int W = static_cast<int>(dims.size() > 3 ? dims[3] : 1);
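        // out = in * scale / sqrt(eps + sum(in^2)): with across_spatial the
        // sum spans the whole C*H*W slab of a batch element, otherwise only
        // the C values at each spatial position.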
        for (int n = 0; n < N; n++) {
            const float* psrc = src + n*C*H*W;
            float* pdst = dst + n*C*H*W;

            if (across_spatial) {
                // Accumulate the squared L2 norm of the whole slab.
                float norm = eps;
                int i = 0;
#if defined(HAVE_AVX2)
                __m256 vsum = _mm256_setzero_ps();
                for (; i <= C*H*W-8; i += 8) {
                    __m256 vsrc = _mm256_loadu_ps(psrc + i);
                    vsum = _mm256_fmadd_ps(vsrc, vsrc, vsum);  // vsum += vsrc*vsrc
                }
                norm += hsum_avx2(vsum);
#elif defined(HAVE_SSE)
                __m128 vsum = _mm_setzero_ps();
                for (; i <= C*H*W-4; i += 4) {
                    __m128 vsrc = _mm_loadu_ps(psrc + i);
                    vsum = _mm_add_ps(_mm_mul_ps(vsrc, vsrc), vsum);
                }
                norm += hsum_sse(vsum);
#endif
                for (; i < C*H*W; i++) {  // scalar tail
                    norm += psrc[i]*psrc[i];
                }
                norm = 1.0f / std::sqrt(norm);
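                // Rescale every channel plane by norm times its weight.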
                for (int c = 0; c < C; c++) {
                    int hw = 0;
#if defined(HAVE_AVX2)
                    __m256 vnorm_avx = _mm256_set1_ps(norm);
                    __m256 vscl_avx = _mm256_set1_ps(channel_shared ? scl[0] : scl[c]);
                    vnorm_avx = _mm256_mul_ps(vnorm_avx, vscl_avx);  // fold scale into norm

                    for (; hw <= H*W - 8; hw += 8) {
                        __m256 vsrc = _mm256_loadu_ps(psrc + c*H*W + hw);
                        _mm256_storeu_ps(pdst + c*H*W + hw, _mm256_mul_ps(vsrc, vnorm_avx));
                    }
#elif defined(HAVE_SSE)
                    __m128 vnorm_sse = _mm_set1_ps(norm);
                    __m128 vscl_sse = _mm_set1_ps(channel_shared ? scl[0] : scl[c]);
                    vnorm_sse = _mm_mul_ps(vnorm_sse, vscl_sse);

                    for (; hw <= H*W - 4; hw += 4) {
                        __m128 vsrc = _mm_loadu_ps(psrc + c*H*W + hw);
                        _mm_storeu_ps(pdst + c*H*W + hw, _mm_mul_ps(vsrc, vnorm_sse));
                    }
#endif
                    for (; hw < H*W; hw++) {  // scalar tail
                        float s = channel_shared ? scl[0] : scl[c];
                        pdst[c*H*W + hw] = psrc[c*H*W + hw] * norm * s;
                    }
                }
            } else {
                int wh = 0;
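                // Per-position normalization: for each spatial offset wh,
                // sum the squared values across all C channels, then rescale
                // that position in every channel plane.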
#if defined(HAVE_AVX2)
                for (; wh <= W*H - 8; wh += 8) {
                    __m256 vnorm = _mm256_set1_ps(eps);
                    for (int c = 0; c < C; c++) {
                        const float* psrc_c = psrc + c*W*H;
                        __m256 vsrc = _mm256_loadu_ps(psrc_c + wh);
                        vnorm = _mm256_fmadd_ps(vsrc, vsrc, vnorm);
                    }
                    vnorm = _mm256_div_ps(_mm256_set1_ps(1.0f), _mm256_sqrt_ps(vnorm));  // 1/sqrt(eps + sum)

                    for (int c = 0; c < C; c++) {
                        const float* psrc_c = psrc + c*W*H;
                        float* pdst_c = pdst + c*W*H;

                        __m256 vscl = _mm256_set1_ps(channel_shared ? scl[0] : scl[c]);

                        __m256 vsrc = _mm256_loadu_ps(psrc_c + wh);
                        __m256 vdst = _mm256_mul_ps(vsrc, vnorm);
                        vdst = _mm256_mul_ps(vdst, vscl);

                        _mm256_storeu_ps(pdst_c + wh, vdst);
                    }
                }
#elif defined(HAVE_SSE)
                for (; wh <= W*H - 4; wh += 4) {
                    __m128 vnorm = _mm_set1_ps(eps);
                    for (int c = 0; c < C; c++) {
                        const float* psrc_c = psrc + c*W*H;
                        __m128 vsrc = _mm_loadu_ps(psrc_c + wh);
                        vnorm = _mm_add_ps(_mm_mul_ps(vsrc, vsrc), vnorm);
                    }
                    vnorm = _mm_div_ps(_mm_set1_ps(1.0f), _mm_sqrt_ps(vnorm));

                    for (int c = 0; c < C; c++) {
                        const float* psrc_c = psrc + c*W*H;
                        float* pdst_c = pdst + c*W*H;

                        __m128 vscl = _mm_set1_ps(channel_shared ? scl[0] : scl[c]);

                        __m128 vsrc = _mm_loadu_ps(psrc_c + wh);
                        __m128 vdst = _mm_mul_ps(vsrc, vnorm);
                        vdst = _mm_mul_ps(vdst, vscl);

                        _mm_storeu_ps(pdst_c + wh, vdst);
                    }
                }
#endif
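                // Scalar tail: positions not covered by the vector loops
                // (the whole plane when no SIMD path is compiled in).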
                for (; wh < W*H; wh++) {
                    float norm = eps;
                    for (int c = 0; c < C; c++) {
                        const float* psrc_c = psrc + c*W*H;
                        norm += psrc_c[wh]*psrc_c[wh];
                    }
                    norm = 1.0f / std::sqrt(norm);

                    for (int c = 0; c < C; c++) {
                        const float* psrc_c = psrc + c*W*H;
                        float* pdst_c = pdst + c*W*H;

                        pdst_c[wh] = psrc_c[wh] * norm * (channel_shared ? scl[0] : scl[c]);
                    }
                }
            }
        }
        return OK;
    }
private:
    TBlob<float>::Ptr weights;

    bool across_spatial = true;
    bool channel_shared = true;
    float eps = 1e-10f;
};
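// Register the implementation so the extension factory creates
// NormalizeImpl for layers of type "Normalize".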
REG_FACTORY_FOR(ImplFactory<NormalizeImpl>, Normalize);
}  // namespace Cpu
}  // namespace Extensions
}  // namespace InferenceEngine