1 // Copyright (c) 2018 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
16 #include "include/include_all.cl"
19 #define UNIT_CVT_FUNC(val) convert_half(val)
21 #define UNIT_CVT_FUNC(val) (val)
24 __attribute__((reqd_work_group_size(LWS, 1, 1)))
25 KERNEL (mvn_gpu_bfyx_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output)
27 const uint data_set_idx = get_global_id(1); //in processing of which data set this WI participates?
28 const uint workers_per_data_set = LWS; //how many WI participates in processing of one data set
29 const uint in_data_set_idx = get_global_id(0); //this WI's id in group of items processing single data set
30 const uint data_set_size = DATA_SET_SIZE; //how many elements are in one data set
31 const uint data_sets_count = DATA_SETS_COUNT; //how many data sets are in the processing payload
33 const uint data_set_offset = data_set_idx * data_set_size;
34 const uint my_data_offset = data_set_offset + in_data_set_idx;
39 __local float lg_storage[LWS];
41 //each WI reads ITEMS_NUM consecutive items from batch*feature
42 for (uint i=0; i<ITEMS_NUM; ++i)
44 my_sum += (float)input[my_data_offset + i * workers_per_data_set];
47 if (in_data_set_idx < LEFTOVERS)
49 my_sum += (float)input[data_set_offset + workers_per_data_set * ITEMS_NUM + in_data_set_idx];
52 lg_storage[in_data_set_idx] = my_sum;
54 barrier(CLK_LOCAL_MEM_FENCE);
55 if (in_data_set_idx == 0)
57 for (uint i=1; i<LWS; ++i)
58 my_sum += lg_storage[i];
60 lg_storage[0] = my_sum / data_set_size;
62 barrier(CLK_LOCAL_MEM_FENCE);
64 my_sum = lg_storage[0];
66 #if NORMALIZE_VARIANCE == 0
67 for (uint i=0; i<ITEMS_NUM; ++i)
68 output[my_data_offset + i * workers_per_data_set] = ACTIVATION(UNIT_CVT_FUNC(input[my_data_offset + i * workers_per_data_set]) - UNIT_CVT_FUNC(my_sum), NL_M ,NL_N);
69 if (in_data_set_idx < LEFTOVERS)
70 output[data_set_offset + workers_per_data_set * ITEMS_NUM + in_data_set_idx] = ACTIVATION(UNIT_CVT_FUNC(input[data_set_offset + workers_per_data_set * ITEMS_NUM + in_data_set_idx]) - UNIT_CVT_FUNC(my_sum), NL_M ,NL_N);
72 barrier(CLK_LOCAL_MEM_FENCE);
74 float my_variance = 0.f;
75 //each WI reads ITEMS_NUM consecutive items from batch*feature
76 for (uint i=0; i<ITEMS_NUM; ++i)
78 tmp = (float)input[my_data_offset + i * workers_per_data_set];
80 my_variance = fma(tmp, tmp, my_variance);
83 if (in_data_set_idx < LEFTOVERS)
85 tmp = (float)input[data_set_offset + workers_per_data_set * ITEMS_NUM + in_data_set_idx];
87 my_variance = fma(tmp, tmp, my_variance);
90 lg_storage[in_data_set_idx] = my_variance;
92 barrier(CLK_LOCAL_MEM_FENCE);
93 if (in_data_set_idx == 0)
95 for (uint i=1; i<LWS; ++i)
96 my_variance += lg_storage[i];
98 my_variance /= data_set_size;
99 lg_storage[0] = native_powr(my_variance + (float)EPSILON, -0.5f);
101 barrier(CLK_LOCAL_MEM_FENCE);
103 my_variance = lg_storage[0];
105 for (uint i=0; i<ITEMS_NUM; ++i)
106 output[my_data_offset + i * workers_per_data_set] = ACTIVATION((UNIT_CVT_FUNC(input[my_data_offset + i * workers_per_data_set]) - UNIT_CVT_FUNC(my_sum)) * UNIT_CVT_FUNC(my_variance), NL_M ,NL_N);
107 if (in_data_set_idx < LEFTOVERS)
108 output[data_set_offset + workers_per_data_set * ITEMS_NUM + in_data_set_idx] = ACTIVATION((UNIT_CVT_FUNC(input[data_set_offset + workers_per_data_set * ITEMS_NUM + in_data_set_idx]) - UNIT_CVT_FUNC(my_sum)) * UNIT_CVT_FUNC(my_variance), NL_M ,NL_N);