[IE CLDNN] Fixed fsv16 lrn kernel with fp16 input (#2086)
author Vladimir Paramuzov <vladimir.paramuzov@intel.com>
Mon, 7 Sep 2020 06:04:05 +0000 (09:04 +0300)
committer GitHub <noreply@github.com>
Mon, 7 Sep 2020 06:04:05 +0000 (09:04 +0300)
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lrn_gpu_across_channel_multiple_features_fsv16.cl
inference-engine/thirdparty/clDNN/tests/test_cases/lrn_gpu_test.cpp

index 7baa5ad..901e5c6 100644 (file)
@@ -41,14 +41,15 @@ KERNEL (lrn_gpu_across_channel_multiple_features_fsv16)(
     for (uint i = 0; i < LOCAL_SIZE; ++i, ++input_offset_f) {
         bool non_zero = input_offset_f >= 0 && input_offset_f < INPUT0_FEATURE_NUM;
         uint input_idx = INPUT0_GET_INDEX(batch_id, input_offset_f, y, x);
-        val[i] = (int)non_zero * TO_INPUT0_TYPE(input[input_idx]);
+        val[i] = (int)non_zero * TO_INPUT0_TYPE(ALPHA_VAL_FACTOR_DIV_BY_SIZE) * TO_INPUT0_TYPE(input[input_idx]);
         res = mad(val[i], val[i], res);
     }
     res = mad(res, TO_INPUT0_TYPE(ALPHA_DIV_BY_SIZE), TO_INPUT0_TYPE(K));
     res = native_powr(res, -TO_INPUT0_TYPE(BETA));
 
     uint output_idx = OUTPUT_GET_INDEX(batch_id, feature_id, y, x);
-    INPUT0_TYPE lrn_result = res * val[PADDING];
+    uint input_idx = INPUT0_GET_INDEX(batch_id, feature_id, y, x);
+    INPUT0_TYPE lrn_result = res * input[input_idx];
     #if HAS_FUSED_OPS
         FUSED_OPS;
         output[output_idx] = TO_OUTPUT_TYPE(FUSED_OPS_RESULT);
index 38579ff..7ca2850 100644 (file)
@@ -125,6 +125,55 @@ TEST(lrn_fp32_gpu, basic2) {
     }
 }
 
+TEST(lrn_fp16_gpu, basic1) {
+    //  input : 1x16x1x1
+    //  Output : 1x16x1x1
+    const auto& engine = get_test_engine();
+
+    const size_t b = 1;
+    const size_t f = 16;
+    const size_t y = 1;
+    const size_t x = 1;
+
+    auto input = memory::allocate(engine, { data_types::f16, format::b_fs_yx_fsv16, { b, f, x, y } });
+    std::vector<half_t> inputVals(b * f * y * x);
+    std::generate(inputVals.begin(), inputVals.end(), []() {
+        static float n = 0;
+        return half_t(n++);
+    });
+
+    set_values(input, inputVals);
+
+    topology topology;
+    topology.add(input_layout("input", input.get_layout()));
+    uint32_t size = 5;
+    float k = 0.5f;
+    float alpha = 9.9e-05f;
+    float beta = 1.f;
+    topology.add(lrn("lrn", "input", size, k, alpha, beta, cldnn::lrn_norm_region_across_channel));
+
+    network network(engine, topology);
+
+    network.set_input_data("input", input);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("lrn").get_memory();
+    auto output_ptr = output.pointer<uint16_t>();
+
+    std::vector<float> expected_results = {
+        0.f, 1.99889f, 3.99525f, 5.98696f,
+        7.97159f, 9.94682f, 11.9104f, 13.86f,
+        15.7936f, 17.709f, 19.6041f, 21.4769f,
+        23.3257f, 25.1485f, 27.2091f, 29.3151f
+    };
+
+    ASSERT_EQ(output_ptr.size(), expected_results.size());
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        EXPECT_TRUE(are_equal(expected_results[i], half_to_float(output_ptr[i]))) << i;
+    }
+}
+
 TEST(lrn_fp32_gpu, basic3) {
     //  input : 2x16x4x4
     //  Output : 2x16x4x4