1 // Copyright (c) 2018 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 #include "include/common.cl"
16 #include "include/data_types.cl"
20 #define UNIT_CVT_FUNC(val) convert_half(val)
22 #define UNIT_CVT_FUNC(val) (val)
25 __attribute__((reqd_work_group_size(LWS, 1, 1)))
26 KERNEL (mvn_gpu_bfyx_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output)
28 const uint data_set_idx = get_global_id(1); //in processing of which data set this WI participates?
29 const uint workers_per_data_set = LWS; //how many WI participates in processing of one data set
30 const uint in_data_set_idx = get_global_id(0); //this WI's id in group of items processing single data set
31 const uint data_set_size = DATA_SET_SIZE; //how many elements are in one data set
32 const uint data_sets_count = DATA_SETS_COUNT; //how many data sets are in the processing payload
34 const uint data_set_offset = data_set_idx * data_set_size;
35 const uint my_data_offset = data_set_offset + in_data_set_idx;
40 __local float lg_storage[LWS];
42 //each WI reads ITEMS_NUM consecutive items from batch*feature
43 for (uint i=0; i<ITEMS_NUM; ++i)
45 my_sum += (float)input[my_data_offset + i * workers_per_data_set];
48 if (in_data_set_idx < LEFTOVERS)
50 my_sum += (float)input[data_set_offset + workers_per_data_set * ITEMS_NUM + in_data_set_idx];
53 lg_storage[in_data_set_idx] = my_sum;
55 barrier(CLK_LOCAL_MEM_FENCE);
56 if (in_data_set_idx == 0)
58 for (uint i=1; i<LWS; ++i)
59 my_sum += lg_storage[i];
61 lg_storage[0] = my_sum / data_set_size;
63 barrier(CLK_LOCAL_MEM_FENCE);
65 my_sum = lg_storage[0];
67 #if NORMALIZE_VARIANCE == 0
68 for (uint i=0; i<ITEMS_NUM; ++i)
69 output[my_data_offset + i * workers_per_data_set] = ACTIVATION(UNIT_CVT_FUNC(input[my_data_offset + i * workers_per_data_set]) - UNIT_CVT_FUNC(my_sum), ACTIVATION_PARAMS);
70 if (in_data_set_idx < LEFTOVERS)
71 output[data_set_offset + workers_per_data_set * ITEMS_NUM + in_data_set_idx] = ACTIVATION(UNIT_CVT_FUNC(input[data_set_offset + workers_per_data_set * ITEMS_NUM + in_data_set_idx]) - UNIT_CVT_FUNC(my_sum), ACTIVATION_PARAMS);
73 barrier(CLK_LOCAL_MEM_FENCE);
75 float my_variance = 0.f;
76 //each WI reads ITEMS_NUM consecutive items from batch*feature
77 for (uint i=0; i<ITEMS_NUM; ++i)
79 tmp = (float)input[my_data_offset + i * workers_per_data_set];
81 my_variance = fma(tmp, tmp, my_variance);
84 if (in_data_set_idx < LEFTOVERS)
86 tmp = (float)input[data_set_offset + workers_per_data_set * ITEMS_NUM + in_data_set_idx];
88 my_variance = fma(tmp, tmp, my_variance);
91 lg_storage[in_data_set_idx] = my_variance;
93 barrier(CLK_LOCAL_MEM_FENCE);
94 if (in_data_set_idx == 0)
96 for (uint i=1; i<LWS; ++i)
97 my_variance += lg_storage[i];
99 my_variance /= data_set_size;
100 lg_storage[0] = native_powr(my_variance + (float)EPSILON, -0.5f);
102 barrier(CLK_LOCAL_MEM_FENCE);
104 my_variance = lg_storage[0];
106 for (uint i=0; i<ITEMS_NUM; ++i)
107 output[my_data_offset + i * workers_per_data_set] = ACTIVATION((UNIT_CVT_FUNC(input[my_data_offset + i * workers_per_data_set]) - UNIT_CVT_FUNC(my_sum)) * UNIT_CVT_FUNC(my_variance), ACTIVATION_PARAMS);
108 if (in_data_set_idx < LEFTOVERS)
109 output[data_set_offset + workers_per_data_set * ITEMS_NUM + in_data_set_idx] = ACTIVATION((UNIT_CVT_FUNC(input[data_set_offset + workers_per_data_set * ITEMS_NUM + in_data_set_idx]) - UNIT_CVT_FUNC(my_sum)) * UNIT_CVT_FUNC(my_variance), ACTIVATION_PARAMS);