Publishing 2019 R1 content
[platform/upstream/dldt.git] inference-engine/thirdparty/clDNN/tests/test_cases/lstm_gpu_test.cpp
index 13c6c93..ba109f0 100644
@@ -21,6 +21,7 @@
 #include "api/CPP/lstm.hpp"
 #include <api/CPP/split.hpp>
 #include <api/CPP/crop.hpp>
+#include <api/CPP/reshape.hpp>
 #include <api/CPP/concatenation.hpp>
 #include <api/CPP/topology.hpp>
 #include <api/CPP/tensor.hpp>
 #include "test_utils/test_utils.h"
 #include <api/CPP/data.hpp>
 #include "instrumentation.h"
+#include <test_utils/float16.h>
 
 #include <sstream>
 #include <iomanip>
 
+#ifdef WIN32
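+// C4503 ("decorated name length exceeded, name was truncated") is noisy but harmless here;
+// it is most likely triggered by the deeply nested std::vector aliases (VVVVF<T>) used below.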
+#pragma warning(disable: 4503)
+#endif
 
 using namespace cldnn;
 using namespace tests;
@@ -88,7 +93,7 @@ VVVVF<T> lstm_gemm_reference(VVVVF<T>& input, VVVVF<T>& weights, VVVVF<T>& recur
             }
             if (hasHidden) {
                 for (size_t x = 0; x < hidden_size; ++x) {
-                    res += (T)recurrent[0][dir][y][x] * (T)hidden[b][dir][0][x];
+                    res += (T)recurrent[0][dir][y][x] * (T)hidden[b][0][dir][x];
                 }
             }
             if (hasBias) {
@@ -102,7 +107,9 @@ VVVVF<T> lstm_gemm_reference(VVVVF<T>& input, VVVVF<T>& weights, VVVVF<T>& recur
 
 template <typename T>
 VVVVF<T> lstm_elt_reference(VVVVF<T>& tempGEMM, VVVVF<T>& cell,
-                     bool hasCell = true, float clip_threshold = 0, bool input_forget = false, size_t dir = 0) {
+                            bool hasCell = true, float clip_threshold = 0,
+                            bool input_forget = false, size_t dir = 0)
+{
     size_t hidden_size = tempGEMM[0][0][0].size() / 4;
     size_t batch_size = tempGEMM.size();
     VVVVF<T> tempOut(batch_size, VVVF<T>(2, VVF<T>(1, VF<T>(hidden_size))));
@@ -113,16 +120,28 @@ VVVVF<T> lstm_elt_reference(VVVVF<T>& tempGEMM, VVVVF<T>& cell,
         T *ot = &tempGEMM[b][0][0][off.ot];
         T *ft = &tempGEMM[b][0][0][off.ft];
         T *zt = &tempGEMM[b][0][0][off.zt];
+
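+        // For each element h, the loop below computes in fp32 (i/o/f/z are the gate slices read
+        // through the offsets above, s() is the sigmoid and c() applies clip_threshold):
+        //   val     = s(c(i)) * tanh(c(z))
+        //   val    *= (1 - f)               (only when input_forget is set)
+        //   val    += cell * s(c(f))        (only when hasCell is set)
+        //   hidden  = tanh(val) * s(o),  new cell = val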
         for (size_t h = 0; h < hidden_size; ++h) {
-            T val = sigmoid(clip(it[h], clip_threshold)) * std::tanh((float)clip(zt[h], clip_threshold));
+
+            // Convert all inputs to float for the elementwise operations. This is done to imitate
+            // how the LSTM kernel performs its elementwise operations.
+            float fp32_it = (float)it[h];
+            float fp32_ot = (float)ot[h];
+            float fp32_ft = (float)ft[h];
+            float fp32_zt = (float)zt[h];
+            float val = sigmoid(clip(fp32_it, clip_threshold)) * std::tanh(clip(fp32_zt, clip_threshold));
+
             if (input_forget) {
-                val *= (1 - ft[h]);
+                val *= (1 - fp32_ft);
             }
             if (hasCell) {
-                val += cell[b][dir][0][h] * sigmoid(clip(ft[h], clip_threshold));
+                val += (float)cell[b][0][dir][h] * sigmoid(clip(fp32_ft, clip_threshold));
             }
-            tempOut[b][0][0][h] = std::tanh((float)val) * sigmoid(ot[h]);
-            tempOut[b][1][0][h] = val;
+
+            // Convert back to output data type before storing it into the output buffer. Currently, the output
+            // data type may be float or FLOAT16 (half)
+            tempOut[b][0][0][h] = (T)(std::tanh(val) * sigmoid(fp32_ot));
+            tempOut[b][1][0][h] = (T)val;
         }
     }
     return tempOut;
@@ -154,10 +173,14 @@ void print(const std::string& s, VVVVF<T>& input) {
 // tempGEMM  = [    batch,         1,               1, 4 * hidden_size ] temporary output
 // output    = [    batch,  sequence,       direction,     hidden_size ] output
 template <typename T>
-void lstm_reference(VVVVF<T>& input, VVVVF<T>& hidden, VVVVF<T>& cell, VVVVF<T>& weights, VVVVF<T>& recurrent, VVVVF<T>& bias,
-    VVVVF<T>& output, VVVVF<T>& last_hidden, VVVVF<T>& last_cell,
-    bool hasBias = true, bool hasInitialHidden = true, bool hasInitialCell = true,
-    float clip_threshold = 0, bool input_forget = false, bool scramble_input = true) {
+void lstm_reference(VVVVF<T>& input, VVVVF<T>& hidden, VVVVF<T>& cell,
+                    VVVVF<T>& weights, VVVVF<T>& recurrent, VVVVF<T>& bias,
+                    VVVVF<T>& output, VVVVF<T>& last_hidden,
+                    VVVVF<T>& last_cell, bool hasBias = true,
+                    bool hasInitialHidden = true, bool hasInitialCell = true,
+                    float clip_threshold = 0, bool input_forget = false,
+                    bool scramble_input = true)
+{
     size_t sequence_len = input[0].size();
     size_t dir_len = weights[0].size();
     size_t batch = input.size();
@@ -179,8 +202,8 @@ void lstm_reference(VVVVF<T>& input, VVVVF<T>& hidden, VVVVF<T>& cell, VVVVF<T>&
             // tempOutput[batch][0] = hidden and tempOutput[batch][1] = cell
             for (size_t i = 0; i < batch; i++) {
                 output[i][seq][dir] = tempOutput[i][0][0];
-                hidden[i][dir] = tempOutput[i][0];
-                cell[i][dir] = tempOutput[i][1];
+                hidden[i][0][dir] = tempOutput[i][0][0];
+                cell[i][0][dir] = tempOutput[i][1][0];
             }
             tempHasInitialHidden = true;
             tempHasInitialCell = true;
@@ -210,12 +233,23 @@ void generic_lstm_gemm_gpu_test(int sequence_len, int direction, int batch_size,
 
     VVVVF<T> ref_output = lstm_gemm_reference(ref_input, ref_weights, ref_recurrent, ref_bias, ref_hidden, 0, hasBias, hasHidden);
 
-    engine engine;
-    memory input = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ batch_size,   sequence_len,  input_size,      1 } });
-    memory weights = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1,            direction,     input_size,      4 * hidden_size } });
-    memory recurrent = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1,            direction,     hidden_size,     4 * hidden_size } });
-    memory biases = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1,            1,             4 * hidden_size, direction } });
-    memory hidden = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ batch_size,   direction,     hidden_size,     1 } });
+    constexpr auto dt = std::is_same<T, float>::value ? data_types::f32 : data_types::f16;
+    const auto& engine = get_test_engine();
+
+    // Skip this test if the data type under test is fp16 but the engine does not support fp16
+    if (!engine.get_info().supports_fp16)
+    {
+        if (dt == data_types::f16)
+        {
+            return;
+        }
+    }
+
+    memory input = memory::allocate(engine, { dt, format::bfyx,     { batch_size,   sequence_len,  input_size,      1 } });
+    memory weights = memory::allocate(engine, { dt, format::bfyx,   { 1,            direction,     input_size,      4 * hidden_size } });
+    memory recurrent = memory::allocate(engine, { dt, format::bfyx, { 1,            direction,     hidden_size,     4 * hidden_size } });
+    memory biases = memory::allocate(engine, { dt, format::bfyx,    { 1,            1,             4 * hidden_size, direction } });
+    memory hidden = memory::allocate(engine, { dt, format::bfyx,    { batch_size,   direction,     hidden_size,     1 } });
 
     set_values(input, ref_input_vec);
     set_values(weights, ref_weights_vec);
@@ -250,13 +284,13 @@ void generic_lstm_gemm_gpu_test(int sequence_len, int direction, int batch_size,
     int i = 0;
     for (int b = 0; b < batch_size; ++b) {
         for (int x = 0; x < 4 * hidden_size; ++x)
-            EXPECT_EQ(ref_output[b][0][0][x], output_ptr[i++]);
+            EXPECT_FLOAT_EQ(ref_output[b][0][0][x], output_ptr[i++]);
     }
 }
 
 template<typename T>
 void generic_lstm_elt_gpu_test(int sequence_len, int direction, int batch_size, int input_size, int hidden_size, bool hasCell = true,
-    float clip_threshold = 0.f, bool input_forget = false) {
+    T clip_threshold = (T)0.f, bool input_forget = false) {
     // tempGEMM  = [        1, direction,           batch, 4 * hidden_size ] input
     // cell      = [        1, direction,           batch,     hidden_size ] optional
     // output    = [        2, direction,           batch,     hidden_size ] output concat[hidden, cell]
@@ -269,9 +303,25 @@ void generic_lstm_elt_gpu_test(int sequence_len, int direction, int batch_size,
 
     VVVVF<T> ref_output = lstm_elt_reference(ref_tempGEMM, ref_cell, hasCell, clip_threshold, input_forget);
 
-    engine engine;
-    memory tempGEMM = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ batch_size,    direction, 4 * hidden_size, 1 } });
-    memory cell = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ batch_size,    direction,     hidden_size, 1 } });
+    // We observe some mismatch when down-converting from fp32 to fp16 between the
+    // reference implementation and the OpenCL kernel. This is most likely a simple
+    // rounding error, so for fp16 the error tolerance is increased from 1E-4 to 1E-2.
+    constexpr float ferror = std::is_same<T, float>::value ? (float)1E-4 : (float)1E-2;
+    constexpr auto dt = std::is_same<T, float>::value ? data_types::f32 : data_types::f16;
+    const auto& engine = get_test_engine();
+
+    // Skip this test if the data type under test is fp16 but the engine does not support fp16
+    if (!engine.get_info().supports_fp16)
+    {
+        if (dt == data_types::f16)
+        {
+            return;
+        }
+    }
+
+    memory tempGEMM = memory::allocate(engine, { dt, format::bfyx,{ batch_size,    direction, 4 * hidden_size, 1 } });
+    memory cell = memory::allocate(engine, { dt, format::bfyx,{ batch_size,    direction,     hidden_size, 1 } });
     set_values(tempGEMM, ref_tempGEMM_vec);
     set_values(cell, ref_cell_vec);
 
@@ -298,7 +348,7 @@ void generic_lstm_elt_gpu_test(int sequence_len, int direction, int batch_size,
             for (int x = 0; x < hidden_size; ++x)
             {
                 auto idx = b * 2 * hidden_size + j * hidden_size + x;
-                EXPECT_NEAR(ref_output[b][j][0][x], output_ptr[idx], FERROR);
+                ASSERT_NEAR(ref_output[b][j][0][x], output_ptr[idx], ferror);
             }
         }
     }
@@ -388,7 +438,7 @@ void generic_lstm_custom_gpu_test(int sequence_len, int direction, int batch_siz
     lstm_reference(ref_input, ref_hidden, ref_cell, ref_weights, ref_recurrent, ref_bias, ref_output, last_hidden, last_cell,
         hasBias, hasInitialHidden, hasInitialCell);
 
-    engine engine;
+    const auto& engine = get_test_engine();
     memory input = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ batch_size, sequence_len,  input_size,       1 } });
     memory weights = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1,          direction,     input_size,       4 * hidden_size } });
     memory recurrent = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1,          direction,     hidden_size,      4 * hidden_size } });
@@ -434,7 +484,7 @@ void generic_lstm_custom_gpu_test(int sequence_len, int direction, int batch_siz
 template<typename T>
 void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batch_size, int input_size, int hidden_size,
                             bool hasBias = true, bool hasInitialHidden = true, bool hasInitialCell = true,
-                            float clip_threshold = 0, bool input_forget = false) {
+                            T clip_threshold = 0, bool input_forget = false) {
     std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size
             << " Sequence Len = " << sequence_len << " Direction = " << direction << " Batch Size = " << batch_size << std::endl;
     int min_random = -2, max_random = 2;
@@ -452,8 +502,8 @@ void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batc
         ref_weights.push_back(generate_random_4d<T>(1, direction, 4 * hidden_size, i==0 ? input_size : hidden_size, min_random, max_random));
         ref_recurrent.push_back(generate_random_4d<T>(1, direction, 4 * hidden_size, hidden_size, min_random, max_random));
         ref_bias.push_back(generate_random_4d<T>(1, 1, direction, 4 * hidden_size, min_random, max_random));
-        ref_hidden.push_back(generate_random_4d<T>(batch_size, direction, 1, hidden_size, min_random, max_random));
-        ref_cell.push_back(generate_random_4d<T>(batch_size, direction, 1, hidden_size, min_random, max_random));
+        ref_hidden.push_back(generate_random_4d<T>(batch_size, 1, direction, hidden_size, min_random, max_random));
+        ref_cell.push_back(generate_random_4d<T>(batch_size, 1, direction, hidden_size, min_random, max_random));
         ref_output.push_back(VVVVF<T>(batch_size, VVVF<T>(sequence_len, VVF<T>(direction, VF<T>(hidden_size)))));
     }
 
@@ -471,8 +521,8 @@ void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batc
         ref_cell_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_cell[i]));
     }
 
-    VVVVF<T> last_hidden(batch_size, VVVF<T>(direction, VVF<T>(1, VF<T>(hidden_size))));
-    VVVVF<T> last_cell(batch_size, VVVF<T>(direction, VVF<T>(1, VF<T>(hidden_size))));
+    VVVVF<T> last_hidden(batch_size, VVVF<T>(1, VVF<T>(direction, VF<T>(hidden_size))));
+    VVVVF<T> last_cell(batch_size, VVVF<T>(1, VVF<T>(direction, VF<T>(hidden_size))));
 
     lstm_reference(ref_input, ref_hidden[0], ref_cell[0], ref_weights[0], ref_recurrent[0], ref_bias[0], ref_output[0],
                    last_hidden, last_cell, hasBias, hasInitialHidden, hasInitialCell,
@@ -485,9 +535,24 @@ void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batc
                         clip_threshold, input_forget, false);
     }
 
-    engine engine;
+    // We observe some mismatch when down-converting from fp32 to fp16 between the
+    // reference implementation and the OpenCL kernel. This is most likely a simple
+    // rounding error, so for fp16 the error tolerance is increased from 1E-4 to 1E-2.
+    constexpr float ferror = std::is_same<T, float>::value ? (float)1E-4 : (float)1E-2;
+    constexpr auto dt = std::is_same<T, float>::value ? data_types::f32 : data_types::f16;
+    const auto& engine = get_test_engine();
 
-    memory input = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {batch_size, sequence_len, input_size, 1} });
+    // Skip this test if the data type under test is fp16 but the engine does not support fp16
+    if (!engine.get_info().supports_fp16)
+    {
+        if (dt == data_types::f16)
+        {
+            return;
+        }
+    }
+
+    memory input = memory::allocate(engine, { dt, format::bfyx, {batch_size, sequence_len, input_size, 1} });
     set_values(input, ref_input_vec);
 
     std::vector<memory> weights;
@@ -496,20 +561,20 @@ void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batc
     std::vector<memory> hidden;
     std::vector<memory> cell;
     for(int i = 0; i < layers; ++i) {
-        weights.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, direction, i==0 ? input_size : hidden_size, 4 * hidden_size } }));
+        weights.push_back(memory::allocate(engine, { dt, format::bfyx, { 1, direction, i==0 ? input_size : hidden_size, 4 * hidden_size } }));
         set_values(weights[i], ref_weights_vec[i]);
-        recurrent.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, direction, hidden_size, 4 * hidden_size } }));
+        recurrent.push_back(memory::allocate(engine, { dt, format::bfyx, { 1, direction, hidden_size, 4 * hidden_size } }));
         set_values(recurrent[i], ref_recurrent_vec[i]);
         if (hasBias) {
-            biases.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, 1, 4 * hidden_size, direction } }));
+            biases.push_back(memory::allocate(engine, { dt, format::bfyx, { 1, 1, 4 * hidden_size, direction } }));
             set_values(biases[i], ref_bias_vec[i]);
         }
         if (hasInitialHidden) {
-            hidden.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size,  direction, hidden_size, 1 } }));
+            hidden.push_back(memory::allocate(engine, { dt, format::bfyx, { batch_size, 1, hidden_size, direction } }));
             set_values(hidden[i], ref_hidden_vec[i]);
         }
         if (hasInitialCell) {
-            cell.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size, direction, hidden_size, 1 } }));
+            cell.push_back(memory::allocate(engine, { dt, format::bfyx, { batch_size, 1, hidden_size, direction} }));
             set_values(cell[i], ref_cell_vec[i]);
         }
     }
@@ -543,12 +608,14 @@ void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batc
         if (i == 0) {
             topology.add(lstm(lstm_id, lstm_inputs, weights_id, recurrent_id,
                             hasBias ? biases_id : "", hasInitialHidden ? hidden_id : "", hasInitialCell ? cell_id : "", "",
-                            clip_threshold, input_forget, {}, {}, default_offset_type));
+                            clip_threshold, input_forget, {}, {},
+                            cldnn_lstm_output::cldnn_lstm_output_sequence, default_offset_type));
         }
         else {
             topology.add(lstm(lstm_id, { prev_lstm_id }, weights_id, recurrent_id,
                             hasBias ? biases_id : "", hasInitialHidden ? hidden_id : "", hasInitialCell ? cell_id : "", "",
-                            clip_threshold, input_forget, {}, {}, default_offset_type));
+                            clip_threshold, input_forget, {}, {},
+                            cldnn_lstm_output::cldnn_lstm_output_sequence, default_offset_type));
         }
         prev_lstm_id = lstm_id;
     }
@@ -567,17 +634,17 @@ void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batc
         ASSERT_EQ(output_size, size_t(hidden_size * sequence_len * batch_size * direction));
 
         auto output = outputs.begin()->second.get_memory();
-        
+
         // Get the output tensor
         cldnn::layout output_layout = output.get_layout();
-        cldnn::tensor output_tensor = output_layout.size; 
-        
+        cldnn::tensor output_tensor = output_layout.size;
+
         // Compare the output tensor configuration against the reference value
         // Output tensor is configured in bfyx format
         ASSERT_EQ(batch_size, output_tensor.batch[0]);
         ASSERT_EQ(sequence_len, output_tensor.feature[0]);
         ASSERT_EQ(direction, output_tensor.spatial[1]);
-        ASSERT_EQ(hidden_size, output_tensor.spatial[0]); 
+        ASSERT_EQ(hidden_size, output_tensor.spatial[0]);
 
         auto output_ptr = output.pointer<T>();
         int32_t i = 0;
@@ -585,7 +652,998 @@ void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batc
             for (int32_t s = 0; s < sequence_len; ++s) {
                 for (int32_t d = 0; d < direction; ++d) {
                     for (int32_t x = 0; x <  hidden_size; ++x) {
-                        ASSERT_NEAR(ref_output[layers-1][b][s][d][x], output_ptr[i++], FERROR);
+                        ASSERT_NEAR(ref_output[layers - 1][b][s][d][x], output_ptr[i++], ferror);
+                    }
+                }
+            }
+        }
+    }
+}
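+
+// Illustrative only: a hypothetical gtest registration sketching how the helper above can be
+// instantiated; the test name and the sizes passed below are example values, not fixed ones.
+TEST(lstm_gpu, generic_lstm_f16_sketch) {
+    // layers, sequence_len, direction, batch_size, input_size, hidden_size
+    generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 3, 3, 2);
+}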
+
+// -------------------------------------------------------
+template<typename T>
+void lstm_gpu_output_test(const cldnn_lstm_output& output_selection, int directions) {
+    int layers = 1;
+    int sequence_len = 4;
+    int batch_size = 3;
+    int input_size = 3;
+    int hidden_size = 4;
+
+    std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size
+            << " Sequence Len = " << sequence_len << " Directions = " << directions << " Batch Size = " << batch_size
+            << " Output selection: " << output_selection << std::endl;
+    int min_random = -2, max_random = 2;
+
+    VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
+    VVVVF<T> ref_weights = generate_random_4d<T>(1, directions, 4 * hidden_size, input_size, min_random, max_random);
+    VVVVF<T> ref_recurrent = generate_random_4d<T>(1, directions, 4 * hidden_size, hidden_size, min_random, max_random);
+    VVVVF<T> ref_bias = generate_random_4d<T>(1, 1, directions, 4 * hidden_size, min_random, max_random);
+    VVVVF<T> ref_hidden = generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random);
+    VVVVF<T> ref_cell = generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random);
+    VVVVF<T> ref_output = VVVVF<T>(batch_size, VVVF<T>(sequence_len, VVF<T>(directions, VF<T>(hidden_size))));
+
+    VF<T> ref_input_vec = flatten_4d<T>(cldnn::format::bfyx, ref_input);
+    VF<T> ref_weights_vec = flatten_4d<T>(cldnn::format::bfyx, ref_weights);
+    VF<T> ref_recurrent_vec = flatten_4d<T>(cldnn::format::bfyx, ref_recurrent);
+    VF<T> ref_bias_vec = flatten_4d<T>(cldnn::format::bfyx, ref_bias);
+    VF<T> ref_hidden_vec = flatten_4d<T>(cldnn::format::bfyx, ref_hidden);
+    VF<T> ref_cell_vec = flatten_4d<T>(cldnn::format::bfyx, ref_cell);
+
+    VVVVF<T> last_hidden(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))));
+    VVVVF<T> last_cell(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))));
+
+    lstm_reference(ref_input, ref_hidden, ref_cell, ref_weights, ref_recurrent, ref_bias, ref_output,
+                   last_hidden, last_cell, true, true, true,
+                   (T)0, false, true);
+
+    const auto& engine = get_test_engine();
+
+    memory input = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {batch_size, sequence_len, input_size, 1} });
+    memory weights = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, directions, input_size , 4 * hidden_size } });
+    memory recurrent = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, directions, hidden_size, 4 * hidden_size } });
+    memory biases = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, 1, 4 * hidden_size, directions } });
+    memory hidden = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size, 1, hidden_size, directions } });
+    memory cell = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size, 1, hidden_size, directions } });
+
+    set_values(input, ref_input_vec);
+    set_values(weights, ref_weights_vec);
+    set_values(recurrent, ref_recurrent_vec);
+    set_values(biases, ref_bias_vec);
+    set_values(hidden, ref_hidden_vec);
+    set_values(cell, ref_cell_vec);
+
+    bool emit_last_cell = output_selection == cldnn_lstm_output_hidden_cell ||
+                          output_selection == cldnn_lstm_output_sequence_cell;
+    bool emit_last_hidden = output_selection == cldnn_lstm_output_hidden ||
+                            output_selection == cldnn_lstm_output_hidden_cell;
+
+    topology topology;
+    std::vector<std::pair<primitive_id, tensor>> input_ids_offsets;
+    std::vector<primitive_id> lstm_inputs;
+    std::vector<primitive_id> output_ids_offsets;
+
+    topology.add(input_layout("input", input.get_layout()));
+    for (int i = 0; i < sequence_len; ++i)
+    {
+        input_ids_offsets.push_back({get_string_id(i), {0, i, 0, 0}});
+        lstm_inputs.push_back("inputSplit:"+get_string_id(i));
+    }
+    topology.add(split("inputSplit", "input", input_ids_offsets));
+    topology.add(data("weights", weights));
+    topology.add(data("recurrent", recurrent));
+    topology.add(data("biases", biases));
+    topology.add(input_layout("hidden", hidden.get_layout()));
+    topology.add(input_layout("cell", cell.get_layout()));
+    topology.add(lstm("lstm", lstm_inputs, "weights", "recurrent",
+                      "biases", "hidden", "cell", "", 0, false, {}, {},
+                      output_selection, default_offset_type));
+    if (emit_last_cell)
+    {
+        int32_t concatenation_len = emit_last_hidden ? 2 : sequence_len + 1;
+        tensor hidden_tensor {batch_size, concatenation_len - 1, hidden_size, directions};
+        tensor cell_tensor {batch_size, 1, hidden_size, directions};
+        topology.add(crop(emit_last_hidden ? "crop:last_hidden" : "crop:sequence", "lstm", hidden_tensor, tensor {0, 0, 0, 0}));
+        topology.add(crop("crop:last_cell", "lstm", cell_tensor, tensor {0, concatenation_len - 1, 0, 0}));
+    }
+
+    network network(engine, topology);
+    network.set_input_data("input", input);
+    network.set_input_data("hidden", hidden);
+    network.set_input_data("cell", cell);
+
+    auto outputs = network.execute();
+    uint32_t ref_num_output_primitives = 1;  // The output will contain at least one primitive
+
+    if (emit_last_cell) {
+        // Add another primitive to account for the cell state if the output selection includes it
+        ref_num_output_primitives += 1;
+    }
+
+    // Check that the number of returned primitives matches the expected number of output primitives
+    ASSERT_EQ(ref_num_output_primitives, outputs.size());
+
+    for (auto itr = outputs.begin(); itr != outputs.end(); itr++)
+    {
+        auto output_tensor = itr->second.get_memory().get_layout().size;
+        primitive_id primitive_name = itr->first;
+
+        cldnn::memory output_memory = itr->second.get_memory();
+        int32_t output_size = (int32_t)(itr->second.get_memory().size() / sizeof(T));
+        cldnn::tensor ref_output_tensor;
+        VVVVF<T> ref_primitive_output;
+
+        int32_t ref_batch_size = batch_size;
+        int32_t ref_hidden_size = hidden_size;
+        int32_t ref_directions = directions;
+
+        int32_t ref_seq_len = 1;
+        // Set the reference output against which the primitive's output will be compared
+        if (primitive_name.find("crop:last_cell") != std::string::npos)
+        {
+            ref_primitive_output = last_cell;
+        }
+        else if (emit_last_hidden || primitive_name.find("crop:last_hidden") != std::string::npos)
+        {
+            ref_primitive_output = last_hidden;
+        }
+        else
+        {
+            ref_seq_len = sequence_len;
+            ref_primitive_output = ref_output;
+        }
+
+        ref_output_tensor = { ref_batch_size, ref_seq_len, ref_hidden_size, ref_directions };
+        int32_t ref_output_size = ref_batch_size * ref_seq_len * ref_hidden_size * ref_directions;
+
+        // The number of elements in the reference should match the number of elements in the primitive's output
+        ASSERT_EQ(ref_output_size, output_size);
+
+        // Compare the output tensor configuration against the reference value
+        // Output tensor is configured in bfyx format
+        ASSERT_EQ(ref_batch_size, output_tensor.batch[0]);
+        ASSERT_EQ(ref_seq_len, output_tensor.feature[0]);       // Sequence length should match
+        ASSERT_EQ(ref_directions, output_tensor.spatial[1]);    // Directions should match
+        ASSERT_EQ(ref_hidden_size, output_tensor.spatial[0]);   // Hidden size should match
+
+        auto output_ptr = output_memory.pointer<T>();
+
+        int32_t i = 0;
+        for (int32_t b = 0; b < ref_batch_size; ++b) {
+            for (int32_t s = 0; s < ref_seq_len; ++s) {
+                for (int32_t d = 0; d < ref_directions; ++d) {
+                    for (int32_t x = 0; x < ref_hidden_size; ++x) {
+                        ASSERT_NEAR(ref_primitive_output[b][s][d][x], output_ptr[i++], FERROR);
+                    }
+                }
+            }
+        }
+    }
+}
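+
+// Summary of what the helper above checks per output selection (derived from the
+// emit_last_hidden/emit_last_cell logic; the primitive ids are the ones used above):
+//   cldnn_lstm_output_sequence      -> "lstm" holds the full hidden sequence
+//   cldnn_lstm_output_hidden        -> "lstm" holds only the last hidden state
+//   cldnn_lstm_output_hidden_cell   -> "crop:last_hidden" + "crop:last_cell"
+//   cldnn_lstm_output_sequence_cell -> "crop:sequence" + "crop:last_cell"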
+
+
+// -------------------------------------------------------
+template<typename T>
+void lstm_gpu_format_test(const cldnn::format& format, int directions) {
+    int layers = 1;
+    int sequence_len = 6;
+    int batch_size = 3;
+    int input_size = 4;
+    int hidden_size = 5;
+
+    cldnn_lstm_output output_selection = cldnn_lstm_output::cldnn_lstm_output_sequence;
+
+    std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size
+            << " Sequence Len = " << sequence_len << " Directions = " << directions << " Batch Size = " << batch_size
+            << " Output selection: " << output_selection << std::endl;
+    int min_random = -2, max_random = 2;
+
+    VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
+    VVVVF<T> ref_weights = generate_random_4d<T>(1, directions, 4 * hidden_size, input_size, min_random, max_random);
+    VVVVF<T> ref_recurrent = generate_random_4d<T>(1, directions, 4 * hidden_size, hidden_size, min_random, max_random);
+    VVVVF<T> ref_bias = generate_random_4d<T>(1, 1, directions, 4 * hidden_size, min_random, max_random);
+    VVVVF<T> ref_hidden = generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random);
+    VVVVF<T> ref_cell = generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random);
+    VVVVF<T> ref_output = VVVVF<T>(batch_size, VVVF<T>(sequence_len, VVF<T>(directions, VF<T>(hidden_size))));
+
+    VF<T> ref_input_vec = flatten_4d<T>(format, ref_input);
+    VF<T> ref_weights_vec = flatten_4d<T>(cldnn::format::bfyx, ref_weights);
+    VF<T> ref_recurrent_vec = flatten_4d<T>(cldnn::format::bfyx, ref_recurrent);
+    VF<T> ref_bias_vec = flatten_4d<T>(cldnn::format::bfyx, ref_bias);
+    VF<T> ref_hidden_vec = flatten_4d<T>(format, ref_hidden);
+    VF<T> ref_cell_vec = flatten_4d<T>(format, ref_cell);
+
+    VVVVF<T> last_hidden(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))));
+    VVVVF<T> last_cell(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))));
+
+    lstm_reference(ref_input, ref_hidden, ref_cell, ref_weights, ref_recurrent, ref_bias, ref_output,
+                   last_hidden, last_cell, true, true, true,
+                   (T)0, false, true);
+
+    const auto& engine = get_test_engine();
+
+    memory input = memory::allocate(engine, { type_to_data_type<T>::value, format, {batch_size, sequence_len, input_size, 1} });
+    memory weights = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, directions, input_size , 4 * hidden_size } });
+    memory recurrent = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, directions, hidden_size, 4 * hidden_size } });
+    memory biases = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, 1, 4 * hidden_size, directions } });
+    memory hidden = memory::allocate(engine, { type_to_data_type<T>::value, format, { batch_size, 1, hidden_size, directions } });
+    memory cell = memory::allocate(engine, { type_to_data_type<T>::value, format, { batch_size, 1, hidden_size, directions } });
+
+    set_values(input, ref_input_vec);
+    set_values(weights, ref_weights_vec);
+    set_values(recurrent, ref_recurrent_vec);
+    set_values(biases, ref_bias_vec);
+    set_values(hidden, ref_hidden_vec);
+    set_values(cell, ref_cell_vec);
+
+    bool emit_last_cell = output_selection == cldnn_lstm_output_hidden_cell ||
+                          output_selection == cldnn_lstm_output_sequence_cell;
+    bool emit_last_hidden = output_selection == cldnn_lstm_output_hidden ||
+                            output_selection == cldnn_lstm_output_hidden_cell;
+
+    topology topology;
+    std::vector<std::pair<primitive_id, tensor>> input_ids_offsets;
+    std::vector<primitive_id> lstm_inputs;
+    std::vector<primitive_id> output_ids_offsets;
+
+    topology.add(input_layout("input", input.get_layout()));
+    for (int i = 0; i < sequence_len; ++i)
+    {
+        input_ids_offsets.push_back({get_string_id(i), {0, i, 0, 0}});
+        lstm_inputs.push_back("inputSplit:"+get_string_id(i));
+    }
+    topology.add(split("inputSplit", "input", input_ids_offsets));
+    topology.add(data("weights", weights));
+    topology.add(data("recurrent", recurrent));
+    topology.add(data("biases", biases));
+    topology.add(input_layout("hidden", hidden.get_layout()));
+    topology.add(input_layout("cell", cell.get_layout()));
+    topology.add(lstm("lstm"+get_string_id(0), lstm_inputs, "weights", "recurrent",
+                      "biases", "hidden", "cell", "", 0, false, {}, {},
+                      output_selection, default_offset_type));
+
+    if (emit_last_cell)
+    {
+        int32_t concatenation_len = emit_last_hidden ? 2 : sequence_len + 1;
+        tensor hidden_tensor {batch_size, concatenation_len - 1, hidden_size, directions};
+        tensor cell_tensor {batch_size, 1, hidden_size, directions};
+        topology.add(crop(emit_last_hidden ? "crop:last_hidden" : "crop:sequence", "lstm", hidden_tensor, tensor {0, 0, 0, 0}));
+        topology.add(crop("crop:last_cell", "lstm", cell_tensor, tensor {0, concatenation_len - 1, 0, 0}));
+    }
+
+    network network(engine, topology);
+    std::map<primitive_id, network_output> outputs;
+
+    network.set_input_data("input", input);
+    network.set_input_data("hidden", hidden);
+    network.set_input_data("cell", cell);
+    outputs = network.execute();
+
+    uint32_t ref_num_output_primitives = 1;  // The output will contain at least one primitive
+
+    if (emit_last_cell) {
+        // Add another primitive to account for the cell state if the output selection includes it
+        ref_num_output_primitives += 1;
+    }
+
+    // Check that the number of returned primitives matches the expected number of output primitives
+    ASSERT_EQ(ref_num_output_primitives, outputs.size());
+
+    for (auto itr = outputs.begin(); itr != outputs.end(); itr++)
+    {
+        auto output_tensor = itr->second.get_memory().get_layout().size;
+        primitive_id primitive_name = itr->first;
+
+        cldnn::memory output_memory = itr->second.get_memory();
+        int32_t output_size = (int32_t)(itr->second.get_memory().size() / sizeof(T));
+        cldnn::tensor ref_output_tensor;
+        VVVVF<T> ref_primitive_output;
+
+        int32_t ref_batch_size = batch_size;
+        int32_t ref_hidden_size = hidden_size;
+        int32_t ref_directions = directions;
+
+        int32_t ref_seq_len = 1;
+        // Set the reference output against which the primitive's output will be compared
+        if (primitive_name.find("crop:last_cell") != std::string::npos)
+        {
+            ref_primitive_output = last_cell;
+        }
+        else if (emit_last_hidden || primitive_name.find("crop:last_hidden") != std::string::npos)
+        {
+            ref_primitive_output = last_hidden;
+        }
+        else
+        {
+            ref_seq_len = sequence_len;
+            ref_primitive_output = ref_output;
+        }
+
+        ref_output_tensor = { ref_batch_size, ref_seq_len, ref_hidden_size, ref_directions };
+        int32_t ref_output_size = ref_batch_size * ref_seq_len * ref_hidden_size * ref_directions;
+
+        // The number of elements in the reference should match the number of elements in the primitive's output
+        ASSERT_EQ(ref_output_size, output_size);
+
+        // Compare the output tensor configuration against the reference value
+        // Output tensor is configured in bfyx format
+        ASSERT_EQ(ref_batch_size, output_tensor.batch[0]);
+        ASSERT_EQ(ref_seq_len, output_tensor.feature[0]);       // Sequence length should match
+        ASSERT_EQ(ref_directions, output_tensor.spatial[1]);    // Directions should match
+        ASSERT_EQ(ref_hidden_size, output_tensor.spatial[0]);   // Hidden size should match
+
+        auto output_ptr = output_memory.pointer<T>();
+
+        int32_t i = 0;
+        if (format == cldnn::format::bfyx) {
+            for (int32_t b = 0; b < ref_batch_size; ++b) {
+                for (int32_t s = 0; s < ref_seq_len; ++s) {
+                    for (int32_t d = 0; d < ref_directions; ++d) {
+                        for (int32_t x = 0; x < ref_hidden_size; ++x) {
+                            ASSERT_NEAR(ref_primitive_output[b][s][d][x], output_ptr[i++], FERROR);
+                        }
+                    }
+                }
+            }
+        }
+        else if (format == cldnn::format::fyxb)
+        {
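+            // In fyxb layout the batch index varies fastest, hence the reordered read-back loops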
+            for (int32_t s = 0; s < ref_seq_len; ++s) {
+                for (int32_t d = 0; d < ref_directions; ++d) {
+                    for (int32_t x = 0; x < ref_hidden_size; ++x) {
+                        for (int32_t b = 0; b < ref_batch_size; ++b) {
+                            ASSERT_NEAR(ref_primitive_output[b][s][d][x], output_ptr[i++], FERROR);
+                        }
+                    }
+                }
+            }
+        }
+
+    }
+}
+
+// -------------------------------------------------------
+template<typename T>
+void lstm_gpu_users_test() {
+    int sequence_len = 2;
+    int batch_size = 1;
+    int input_size = 1;
+    int hidden_size = 1;
+    int directions = 1;
+    int min_random = -2, max_random = 2;
+
+    // The following test checks the user dependencies of an LSTM node when the node is replaced
+    // by its subcomponents by the graph compiler.
+    // The output of the LSTM node is set to last_hidden only. The last_hidden is then concatenated
+    // with the initial_hidden tensor:
+    // (input, weights, recurrent, bias, initial_hidden, initial_cell) -> LSTM -> last_hidden
+    // concatenation(last_hidden, initial_hidden)
+    // If the replacement is done correctly, the initial_hidden tensor should match the output of
+    // the concatenation at an offset along the sequence.
+
+    VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
+    VVVVF<T> ref_weights = generate_random_4d<T>(1, directions, 4 * hidden_size, input_size, min_random, max_random);
+    VVVVF<T> ref_recurrent = generate_random_4d<T>(1, directions, 4 * hidden_size, hidden_size, min_random, max_random);
+    VVVVF<T> ref_bias = generate_random_4d<T>(1, 1, directions, 4 * hidden_size, min_random, max_random);
+    VVVVF<T> ref_hidden = generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random);
+    VVVVF<T> ref_cell = generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random);
+    VVVVF<T> ref_output = VVVVF<T>(batch_size, VVVF<T>(sequence_len, VVF<T>(directions, VF<T>(hidden_size))));
+
+    VF<T> ref_input_vec = flatten_4d<T>(format::bfyx, ref_input);
+    VF<T> ref_weights_vec = flatten_4d<T>(format::bfyx, ref_weights);
+    VF<T> ref_recurrent_vec = flatten_4d<T>(format::bfyx, ref_recurrent);
+    VF<T> ref_bias_vec = flatten_4d<T>(format::bfyx, ref_bias);
+    VF<T> ref_hidden_vec = flatten_4d<T>(format::bfyx, ref_hidden);
+    VF<T> ref_cell_vec = flatten_4d<T>(format::bfyx, ref_cell);
+
+    VVVVF<T> last_hidden(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))));
+    VVVVF<T> last_cell(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))));
+
+    const auto& engine = get_test_engine();
+
+    memory input = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {batch_size, sequence_len, input_size, 1} });
+    memory weights = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, directions, input_size , 4 * hidden_size } });
+    memory recurrent = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, directions, hidden_size, 4 * hidden_size } });
+    memory biases = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, 1, 4 * hidden_size, directions } });
+    memory hidden = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size, 1, hidden_size, directions } });
+    memory cell = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size, 1, hidden_size, directions } });
+
+    set_values(input, ref_input_vec);
+    set_values(weights, ref_weights_vec);
+    set_values(recurrent, ref_recurrent_vec);
+    set_values(biases, ref_bias_vec);
+    set_values(hidden, ref_hidden_vec);
+    set_values(cell, ref_cell_vec);
+
+    topology topology;
+    std::vector<std::pair<primitive_id, tensor>> input_ids_offsets;
+    std::vector<primitive_id> lstm_inputs;
+
+    topology.add(input_layout("input", input.get_layout()));
+    for (int i = 0; i < sequence_len; ++i)
+    {
+        input_ids_offsets.push_back({get_string_id(i), {0, i, 0, 0}});
+        lstm_inputs.push_back("inputSplit:"+get_string_id(i));
+    }
+    topology.add(split("inputSplit", "input", input_ids_offsets));
+    topology.add(data("weights", weights));
+    topology.add(data("recurrent", recurrent));
+    topology.add(data("biases", biases));
+    topology.add(input_layout("hidden", hidden.get_layout()));
+    topology.add(input_layout("cell", cell.get_layout()));
+    topology.add(lstm("lstm", lstm_inputs, "weights", "recurrent",
+                      "biases", "hidden", "cell", "", 0, false, {}, {},
+                      cldnn_lstm_output::cldnn_lstm_output_hidden, default_offset_type));
+    std::vector<primitive_id> output_ids_offsets {"lstm", "hidden"};
+    topology.add(concatenation("concatenation", output_ids_offsets, concatenation::along_f));
+
+    network network(engine, topology);
+    std::map<primitive_id, network_output> outputs;
+
+    network.set_input_data("input", input);
+    network.set_input_data("hidden", hidden);
+    network.set_input_data("cell", cell);
+    outputs = network.execute();
+
+    // Check that the number of returned primitives matches the expected number of output primitives
+    ASSERT_EQ(size_t(1), outputs.size());
+    cldnn::memory output_memory = outputs.begin()->second.get_memory();
+    auto output_ptr = output_memory.pointer<T>();
+
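+    // The concatenation result is bfyx with shape { batch, 2, hidden_size (x), directions (y) }:
+    // feature 0 is the LSTM's last hidden state, feature 1 is the initial hidden input. The index
+    // below is therefore the bfyx offset of element (b, s + 1, d, x); sequence_len (== 2 here)
+    // doubles as the concatenated feature count.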
+    int32_t i = 0;
+    for (int32_t b = 0; b < batch_size; ++b) {
+        for (int32_t s = 0; s < 1; ++s) {
+            for (int32_t d = 0; d < directions; ++d) {
+                for (int32_t x = 0; x < hidden_size; ++x) {
+                    int32_t idx = x + hidden_size * (d + directions * ((s+1) + sequence_len * b));
+                    ASSERT_NEAR(ref_hidden[b][s][d][x], output_ptr[idx], FERROR);
+                }
+            }
+        }
+    }
+}
+
+// -------------------------------------------------------
+template<typename T>
+void lstm_gpu_concatenated_input_test(int layers, int sequence_len, int direction,
+                                      int batch_size, int input_size, int hidden_size,
+                                      bool has_bias = true, bool has_initial_hidden = true,
+                                      bool has_initial_cell = true, float clip_threshold = 0,
+                                      bool input_forget = false)
+{
+    std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size
+            << " Sequence Len = " << sequence_len << " Direction = " << direction << " Batch Size = " << batch_size << std::endl;
+    int min_random = -2, max_random = 2;
+
+    VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
+
+    std::vector<VVVVF<T>> ref_weights;
+    std::vector<VVVVF<T>> ref_recurrent;
+    std::vector<VVVVF<T>> ref_bias;
+    std::vector<VVVVF<T>> ref_hidden;
+    std::vector<VVVVF<T>> ref_cell;
+    std::vector<VVVVF<T>> ref_output;
+
+    for (int i = 0; i < layers; ++i) {
+        ref_weights.push_back(generate_random_4d<T>(1, direction, 4 * hidden_size, i == 0 ? input_size : hidden_size, min_random, max_random));
+        ref_recurrent.push_back(generate_random_4d<T>(1, direction, 4 * hidden_size, hidden_size, min_random, max_random));
+        ref_bias.push_back(generate_random_4d<T>(1, 1, direction, 4 * hidden_size, min_random, max_random));
+        ref_hidden.push_back(generate_random_4d<T>(batch_size, 1, direction, hidden_size, min_random, max_random));
+        ref_cell.push_back(generate_random_4d<T>(batch_size, 1, direction, hidden_size, min_random, max_random));
+        ref_output.push_back(VVVVF<T>(batch_size, VVVF<T>(sequence_len, VVF<T>(direction, VF<T>(hidden_size)))));
+    }
+
+    VF<T> ref_input_vec = flatten_4d<T>(cldnn::format::bfyx, ref_input);
+
+    std::vector<VF<T>> ref_weights_vec;
+    std::vector<VF<T>> ref_recurrent_vec;
+    std::vector<VF<T>> ref_bias_vec;
+    std::vector<VF<T>> ref_hidden_vec;
+    std::vector<VF<T>> ref_cell_vec;
+    for (int i = 0; i < layers; ++i) {
+        ref_weights_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_weights[i]));
+        ref_recurrent_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_recurrent[i]));
+        ref_bias_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_bias[i]));
+        ref_hidden_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_hidden[i]));
+        ref_cell_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_cell[i]));
+    }
+
+    VVVVF<T> last_hidden(batch_size, VVVF<T>(1, VVF<T>(direction, VF<T>(hidden_size))));
+    VVVVF<T> last_cell(batch_size, VVVF<T>(1, VVF<T>(direction, VF<T>(hidden_size))));
+
+    lstm_reference(ref_input, ref_hidden[0], ref_cell[0], ref_weights[0], ref_recurrent[0], ref_bias[0], ref_output[0],
+                   last_hidden, last_cell, has_bias, has_initial_hidden, has_initial_cell,
+                   clip_threshold, input_forget, true);
+
+    for (int i = 1; i < layers; ++i) {
+        lstm_reference(ref_output[i - 1], ref_hidden[i], ref_cell[i], ref_weights[i], ref_recurrent[i],
+                       ref_bias[i], ref_output[i],
+                       last_hidden, last_cell, has_bias, has_initial_hidden, has_initial_cell,
+                       clip_threshold, input_forget, false);
+    }
+
+    const auto& engine = get_test_engine();
+
+    memory input = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {batch_size, sequence_len, input_size, 1} });
+    set_values(input, ref_input_vec);
+
+    std::vector<memory> weights;
+    std::vector<memory> recurrent;
+    std::vector<memory> biases;
+    std::vector<memory> hidden;
+    std::vector<memory> cell;
+    for (int i = 0; i < layers; ++i) {
+        weights.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, direction, i == 0 ? input_size : hidden_size, 4 * hidden_size } }));
+        set_values(weights[i], ref_weights_vec[i]);
+        recurrent.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, direction, hidden_size, 4 * hidden_size } }));
+        set_values(recurrent[i], ref_recurrent_vec[i]);
+        if (has_bias) {
+            biases.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, 1, 4 * hidden_size, direction } }));
+            set_values(biases[i], ref_bias_vec[i]);
+        }
+        if (has_initial_hidden) {
+            hidden.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size, 1, hidden_size, direction } }));
+            set_values(hidden[i], ref_hidden_vec[i]);
+        }
+        if (has_initial_cell) {
+            cell.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size, 1, hidden_size, direction} }));
+            set_values(cell[i], ref_cell_vec[i]);
+        }
+    }
+
+    topology topology;
+    std::vector<std::pair<primitive_id, tensor>> input_ids_offsets;
+    std::vector<primitive_id> lstm_inputs;
+    std::vector<primitive_id> output_ids_offsets;
+
+    topology.add(input_layout("input", input.get_layout()));
+    cldnn::primitive_id prev_node_id;
+
+    for (int i = 0; i < layers; ++i) {
+        std::string sid = get_string_id(i);
+        std::string lstm_id = "lstm" + sid;
+        std::string weights_id = "weights" + sid;
+        std::string recurrent_id = "recurrent" + sid;
+        std::string biases_id = "biases" + sid;
+        std::string hidden_id = "hidden" + sid;
+        std::string cell_id = "cell" + sid;
+        std::string output_crop_id = "crop:sequence:" + sid;
+
+        topology.add(data(weights_id, weights[i]));
+        topology.add(data(recurrent_id, recurrent[i]));
+        if (has_bias) topology.add(data(biases_id, biases[i]));
+        if (has_initial_hidden) topology.add(input_layout(hidden_id, hidden[i].get_layout()));
+        if (has_initial_cell) topology.add(input_layout(cell_id, cell[i].get_layout()));
+        if (i == 0) {
+            topology.add(lstm(lstm_id, { "input" }, weights_id, recurrent_id,
+                              has_bias ? biases_id : "", has_initial_hidden ? hidden_id : "", has_initial_cell ? cell_id : "", "",
+                              clip_threshold, input_forget, {}, {},
+                              cldnn_lstm_output::cldnn_lstm_output_sequence_cell, default_offset_type));
+        }
+        else {
+            topology.add(lstm(lstm_id, { prev_node_id }, weights_id, recurrent_id,
+                              has_bias ? biases_id : "", has_initial_hidden ? hidden_id : "", has_initial_cell ? cell_id : "", "",
+                              clip_threshold, input_forget, {}, {},
+                              cldnn_lstm_output::cldnn_lstm_output_sequence_cell, default_offset_type));
+        }
+
+        // Crop out the whole output sequence element
+        topology.add(crop(output_crop_id, lstm_id, {batch_size, sequence_len, hidden_size, direction}, {0, 0, 0, 0}));
+
+        // Save the node id to provide it as input to the next lstm layer
+        prev_node_id = output_crop_id;
+    }
+
+    network network(engine, topology);
+    network.set_input_data("input", input);
+    for (int i = 0; i < layers; ++i) {
+        std::string sid = get_string_id(i);
+        if (has_initial_hidden) network.set_input_data("hidden" + sid, hidden[i]);
+        if (has_initial_cell) network.set_input_data("cell" + sid, cell[i]);
+    }
+    auto outputs = network.execute();
+    {
+        ASSERT_EQ(outputs.size(), size_t(1));
+        size_t output_size = outputs.begin()->second.get_memory().size() / sizeof(T);
+        ASSERT_EQ(output_size, size_t(hidden_size * sequence_len * batch_size * direction));
+
+        auto output = outputs.begin()->second.get_memory();
+
+        // Get the output tensor
+        cldnn::layout output_layout = output.get_layout();
+        cldnn::tensor output_tensor = output_layout.size;
+
+        // Compare the output tensor configuration against the reference value
+        // Output tensor is configured in bfyx format
+        ASSERT_EQ(batch_size, output_tensor.batch[0]);
+        ASSERT_EQ(sequence_len, output_tensor.feature[0]);
+        ASSERT_EQ(direction, output_tensor.spatial[1]);
+        ASSERT_EQ(hidden_size, output_tensor.spatial[0]);
+
+        auto output_ptr = output.pointer<T>();
+        int32_t i = 0;
+        for (int32_t b = 0; b < batch_size; ++b) {
+            for (int32_t s = 0; s < sequence_len; ++s) {
+                for (int32_t d = 0; d < direction; ++d) {
+                    for (int32_t x = 0; x < hidden_size; ++x) {
+                        ASSERT_NEAR(ref_output[layers - 1][b][s][d][x], output_ptr[i++], FERROR);
+                    }
+                }
+            }
+        }
+    }
+}
+
+// This test checks a chained and stacked LSTM topology. The configuration allows building an
+// LSTM topology with multiple layers, and several such stacks can be chained together.
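+// Schematically (a sketch of the reference computation below): within one chain, the sequence
+// output of layer L feeds layer L + 1; across chains, the last hidden and cell state of every
+// layer in chain C become the initial hidden/cell of the same layer in chain C + 1, while each
+// chain reads the same primary input sequence.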
+template<typename T>
+void lstm_gpu_chain_test(int batch_size, int input_size, int hidden_size,
+                         int directions, size_t layers, size_t chains, int sequence_len,
+                         const cldnn_lstm_output& output_selection)
+{
+    int min_random = -2, max_random = 2;
+    bool has_bias = false;
+    bool has_initial_hidden = false;
+    bool has_initial_cell = false;
+    float clip_threshold = 0;
+    bool input_forget = false;
+
+    std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size
+        << " Sequence Len = " << sequence_len << " Directions = " << directions << " Batch Size = " << batch_size
+        << " Output selection: " << output_selection << std::endl;
+
+    VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
+    std::vector<std::vector< VVVVF<T>>> ref_weights;
+    std::vector<std::vector< VVVVF<T>>> ref_recurrent;
+    std::vector<std::vector< VVVVF<T>>> ref_bias;
+    std::vector<std::vector< VVVVF<T>>> ref_hidden;
+    std::vector<std::vector< VVVVF<T>>> ref_cell;
+    std::vector<std::vector< VVVVF<T>>> ref_output;
+
+    // Create the 4 dimensional weight, bias, hidden, cell state and output vectors
+    for (size_t chain = 0; chain < chains; chain++) {
+
+        std::vector<VVVVF<T>> per_chain_ref_weights;
+        std::vector<VVVVF<T>> per_chain_ref_recurrent;
+        std::vector<VVVVF<T>> per_chain_ref_bias;
+        std::vector<VVVVF<T>> per_chain_ref_hidden;
+        std::vector<VVVVF<T>> per_chain_ref_cell;
+        std::vector<VVVVF<T>> per_chain_ref_output;
+
+        for (size_t layer = 0; layer < layers; layer++) {
+            per_chain_ref_weights.push_back(generate_random_4d<T>(1, directions, 4 * hidden_size, (layer == 0) ? input_size : hidden_size, min_random, max_random));
+            per_chain_ref_recurrent.push_back(generate_random_4d<T>(1, directions, 4 * hidden_size, hidden_size, min_random, max_random));
+            per_chain_ref_bias.push_back(generate_random_4d<T>(1, 1, directions, 4 * hidden_size, min_random, max_random));
+            per_chain_ref_hidden.push_back(generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random));
+            per_chain_ref_cell.push_back(generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random));
+            per_chain_ref_output.push_back(VVVVF<T>(batch_size, VVVF<T>(sequence_len, VVF<T>(directions, VF<T>(hidden_size)))));
+        }
+
+        ref_weights.push_back(per_chain_ref_weights);
+        ref_recurrent.push_back(per_chain_ref_recurrent);
+        ref_bias.push_back(per_chain_ref_bias);
+        ref_hidden.push_back(per_chain_ref_hidden);
+        ref_cell.push_back(per_chain_ref_cell);
+        ref_output.push_back(per_chain_ref_output);
+    }
+
+    VF<T> ref_input_vec;
+    std::vector<std::vector< VF<T>>> ref_weights_vec;
+    std::vector<std::vector< VF<T>>> ref_recurrent_vec;
+    std::vector<std::vector< VF<T>>> ref_bias_vec;
+    std::vector<std::vector< VF<T>>> ref_hidden_vec;
+    std::vector<std::vector< VF<T>>> ref_cell_vec;
+    std::vector<std::vector< VF<T>>> ref_output_vec;
+
+    ref_input_vec = flatten_4d<T>(cldnn::format::bfyx, ref_input);
+
+    // flatten all the 4 dimensional vectors across chains and layers
+    for (size_t chain = 0; chain < chains; chain++) {
+
+        std::vector<VF<T>> per_chain_ref_weights;
+        std::vector<VF<T>> per_chain_ref_recurrent;
+        std::vector<VF<T>> per_chain_ref_bias;
+        std::vector<VF<T>> per_chain_ref_hidden;
+        std::vector<VF<T>> per_chain_ref_cell;
+        std::vector<VF<T>> per_chain_ref_output;
+
+        for (size_t layer = 0; layer < layers; layer++) {
+            per_chain_ref_weights.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_weights[chain][layer]));
+            per_chain_ref_recurrent.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_recurrent[chain][layer]));
+            per_chain_ref_bias.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_bias[chain][layer]));
+            per_chain_ref_hidden.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_hidden[chain][layer]));
+            per_chain_ref_cell.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_cell[chain][layer]));
+            per_chain_ref_output.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_output[chain][layer]));
+        }
+
+        ref_weights_vec.push_back(per_chain_ref_weights);
+        ref_recurrent_vec.push_back(per_chain_ref_recurrent);
+        ref_bias_vec.push_back(per_chain_ref_bias);
+        ref_hidden_vec.push_back(per_chain_ref_hidden);
+        ref_cell_vec.push_back(per_chain_ref_cell);
+        ref_output_vec.push_back(per_chain_ref_output);
+    }
+
+    std::vector<std::vector<VVVVF<T>>> last_hidden(chains, std::vector<VVVVF<T> >(layers, VVVVF<T>(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))))));
+    std::vector<std::vector<VVVVF<T>>> last_cell(chains, std::vector<VVVVF<T> >(layers, VVVVF<T>(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))))));
+
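+    // Reference computation: layer 0 of every chain link consumes ref_input, and each deeper
+    // layer consumes the previous layer's output. The last hidden and cell states of a chain
+    // link are copied forward to seed the same layer of the next chain link.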
+    for (size_t chain = 0; chain < chains; chain++) {
+        lstm_reference(ref_input, ref_hidden[chain][0], ref_cell[chain][0], ref_weights[chain][0],
+                       ref_recurrent[chain][0], ref_bias[chain][0], ref_output[chain][0],
+                       last_hidden[chain][0], last_cell[chain][0], has_bias,
+                       chain == 0 ? has_initial_hidden : true,
+                       chain == 0 ? has_initial_cell : true,
+                       clip_threshold, input_forget, true);
+
+        if (chain < chains - 1)
+        {
+            ref_hidden[chain + 1][0] = last_hidden[chain][0];
+            ref_cell[chain + 1][0] = last_cell[chain][0];
+        }
+    }
+
+    for (size_t layer = 1; layer < layers; ++layer) {
+        for (size_t chain = 0; chain < chains; chain++) {
+            lstm_reference(ref_output[chain][layer - 1], ref_hidden[chain][layer], ref_cell[chain][layer],
+                           ref_weights[chain][layer], ref_recurrent[chain][layer], ref_bias[chain][layer],
+                           ref_output[chain][layer], last_hidden[chain][layer], last_cell[chain][layer], has_bias,
+                           chain == 0 ? has_initial_hidden : true,
+                           chain == 0 ? has_initial_cell : true,
+                           clip_threshold, input_forget,
+                           false);
+
+            if (chain < chains - 1)
+            {
+                ref_hidden[chain + 1][layer] = last_hidden[chain][layer];
+                ref_cell[chain + 1][layer] = last_cell[chain][layer];
+            }
+        }
+    }
+
+    const auto& engine = get_test_engine();
+    tensor input_tensor = { batch_size, sequence_len, input_size, 1 };
+    layout layout = { type_to_data_type<T>::value, cldnn::format::bfyx, input_tensor };
+
+    memory input = memory::allocate(engine, layout);
+    set_values(input, ref_input_vec);
+
+    // 2-dimensional vectors of GPU memory objects, indexed by chain link and layer
+    std::vector<std::vector<memory>> weights;
+    std::vector<std::vector<memory>> recurrent;
+    std::vector<std::vector<memory>> biases;
+    std::vector<std::vector<memory>> hidden;
+    std::vector<std::vector<memory>> cell;
+
+    for (size_t chain = 0; chain < chains; chain++) {
+        std::vector<memory> per_chain_weights;
+        std::vector<memory> per_chain_recurrent;
+        std::vector<memory> per_chain_biases;
+        std::vector<memory> per_chain_hidden;
+        std::vector<memory> per_chain_cell;
+
+        for (size_t layer = 0; layer < layers; layer++) {
+            per_chain_weights.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {1, directions, layer == 0 ? input_size : hidden_size, 4 * hidden_size} }));
+            set_values(per_chain_weights[layer], ref_weights_vec[chain][layer]);
+
+            per_chain_recurrent.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {1, directions, hidden_size, 4 * hidden_size} }));
+            set_values(per_chain_recurrent[layer], ref_recurrent_vec[chain][layer]);
+
+            if (has_bias)
+            {
+                per_chain_biases.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {1, 1, 4 * hidden_size, directions} }));
+                set_values(per_chain_biases[layer], ref_bias_vec[chain][layer]);
+            }
+
+            if (has_initial_hidden)
+            {
+                per_chain_hidden.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {1, 1, hidden_size, directions} }));
+                set_values(per_chain_hidden[layer], ref_hidden_vec[chain][layer]);
+            }
+
+            if (has_initial_cell)
+            {
+                per_chain_cell.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {1, 1, hidden_size, directions} }));
+                set_values(per_chain_cell[layer], ref_cell_vec[chain][layer]);
+            }
+        }
+
+        weights.push_back(per_chain_weights);
+        recurrent.push_back(per_chain_recurrent);
+        biases.push_back(per_chain_biases);
+        hidden.push_back(per_chain_hidden);
+        cell.push_back(per_chain_cell);
+    }
+
+    // Start creating the topology
+    cldnn::topology topology;
+    std::vector<std::pair<primitive_id, cldnn::tensor>> input_ids_offsets;
+    std::vector<primitive_id> lstm_inputs;
+    std::vector<primitive_id> output_ids_offsets;
+
+    topology.add(input_layout("input", input.get_layout()));
+
+    for (int feature = 0; feature < sequence_len; feature++) {
+        input_ids_offsets.push_back({ get_string_id(feature), {0, feature, 0, 0} });
+        lstm_inputs.push_back("inputSplit:" + get_string_id(feature));
+    }
+    topology.add(split("inputSplit", "input", input_ids_offsets));
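+    // The split yields one primitive per timestep ("inputSplit:<t>", where <t> comes from
+    // get_string_id); these ids become the first layer's LSTM inputs for every chain link.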
+
+    bool emit_last_hidden = output_selection == cldnn_lstm_output_hidden
+        || output_selection == cldnn_lstm_output_hidden_cell;
+
+    std::vector<cldnn::primitive_id> output_sequence_ids;
+    std::vector<cldnn::primitive_id> last_hidden_ids;
+    std::vector<cldnn::primitive_id> last_cell_ids;
+
+    for (size_t chain = 0; chain < chains; chain++) {
+
+        // Save the previous chain link's primitive ids before adding this link's primitives;
+        // its last hidden and cell ids seed this link's initial hidden and cell state
+        std::vector<cldnn::primitive_id> prev_output_sequence_ids(output_sequence_ids);
+        std::vector<cldnn::primitive_id> prev_last_hidden_ids(last_hidden_ids);
+        std::vector<cldnn::primitive_id> prev_last_cell_ids(last_cell_ids);
+
+        // Clear the per-layer id containers before repopulating them for this chain link
+        output_sequence_ids.clear();
+        last_cell_ids.clear();
+        last_hidden_ids.clear();
+
+        for (size_t layer = 0; layer < layers; layer++) {
+            std::string chain_id = get_string_id(chain);
+            std::string layer_id = get_string_id(layer);
+            std::string lstm_id = "lstm:" + chain_id + ":" + layer_id;
+            std::string weights_id = "weights:" + chain_id + ":" + layer_id;
+            std::string recurrent_id = "recurrent:" + chain_id + ":" + layer_id;
+            std::string biases_id = "biases:" + chain_id + ":" + layer_id;
+            std::string hidden_id = "hidden:" + chain_id + ":" + layer_id;
+            std::string cell_id = "cell:" + chain_id + ":" + layer_id;
+            std::string crop_seq_id = "crop:sequence:" + chain_id + ":" + layer_id;
+            std::string crop_last_cell_id = "crop:last_cell:" + chain_id + ":" + layer_id;
+            std::string crop_last_hidden_id = "crop:last_hidden:" + chain_id + ":" + layer_id;
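+            // Illustrative id values, assuming get_string_id() zero-pads to three digits (the
+            // "hidden:000:" lookups after network creation rely on this): for chain 1, layer 2
+            // these would be e.g. "lstm:001:002" and "crop:last_cell:001:002".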
+
+            primitive_id initial_hidden_id;
+            primitive_id initial_cell_id;
+            cldnn_lstm_output output_selection_per_layer;
+
+            topology.add(data(weights_id, weights[chain][layer]));
+            topology.add(data(recurrent_id, recurrent[chain][layer]));
+            if (has_bias) topology.add(data(biases_id, biases[chain][layer]));
+
+            if (chain == 0 && layer == 0)
+            {
+                if (has_initial_hidden) topology.add(input_layout(hidden_id, hidden[chain][layer].get_layout()));
+                if (has_initial_cell) topology.add(input_layout(cell_id, cell[chain][layer].get_layout()));
+            }
+
+            // Get the initial hidden and initial cell for each layer for each chain link
+            if (chain == 0)
+            {
+                initial_hidden_id = has_initial_hidden ? hidden_id : "";
+                initial_cell_id = has_initial_cell ? cell_id : "";
+            }
+            else
+            {
+                initial_hidden_id = prev_last_hidden_ids[layer];
+                initial_cell_id = prev_last_cell_ids[layer];
+            }
+
+            // Every layer except the last must emit the full output sequence and the last cell
+            // state; the last hidden state is cropped out of the sequence below
+            if (layer < layers - 1)
+            {
+                output_selection_per_layer = cldnn_lstm_output::cldnn_lstm_output_sequence_cell;
+            }
+            else
+            {
+                // For the last layer, use the output selection provided by the user
+                output_selection_per_layer = output_selection;
+            }
+
+            if (layer == 0)
+            {
+                topology.add(lstm(lstm_id, lstm_inputs, weights_id, recurrent_id,
+                    has_bias ? biases_id : "",
+                    initial_hidden_id, initial_cell_id,
+                    "", clip_threshold, input_forget, {}, {},
+                    output_selection_per_layer, default_offset_type));
+            }
+            else
+            {
+                topology.add(lstm(lstm_id, { output_sequence_ids[layer - 1] }, weights_id, recurrent_id,
+                    has_bias ? biases_id : "",
+                    initial_hidden_id, initial_cell_id,
+                    "", clip_threshold, input_forget, {}, {},
+                    output_selection_per_layer, default_offset_type));
+            }
+
+            tensor sequence_tensor{ batch_size, sequence_len, hidden_size, directions };
+            tensor cell_tensor{ batch_size, 1, hidden_size, directions };
+            tensor last_hidden_tensor{ batch_size, 1, hidden_size, directions };
+
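+            // The crop offsets below rely on the lstm primitive concatenating its outputs along
+            // the feature (sequence) axis: entries [0, sequence_len) hold the hidden-state
+            // sequence and, when the cell state is emitted, entry sequence_len holds the last
+            // cell state.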
+            // Crop the output sequence, last hidden state and last cell state from every layer's
+            // LSTM output. The output sequence feeds the next layer's LSTM within the same chain
+            // link, while the last hidden and cell states seed the same layer's LSTM in the next
+            // chain link.
+            topology.add(crop(crop_seq_id, lstm_id, sequence_tensor, tensor{ 0, 0, 0, 0 }));  // Add crop to get the sequence
+            topology.add(crop(crop_last_hidden_id, lstm_id, last_hidden_tensor, tensor{ 0, sequence_len - 1, 0, 0 }));  // Add crop to get the last hidden element
+            topology.add(crop(crop_last_cell_id, lstm_id, cell_tensor, tensor{ 0, sequence_len, 0, 0 }));  // Add crop to get the last cell element
+
+            // Keep a copy of the sequence, last hidden and last cell primitive ids for each layer
+            output_sequence_ids.push_back(crop_seq_id);
+            last_hidden_ids.push_back(crop_last_hidden_id);
+            last_cell_ids.push_back(crop_last_cell_id);
+        }
+    }
+
+    // Create the network from the topology designed above
+    cldnn::network network(engine, topology);
+    network.set_input_data("input", input);
+    for (size_t layer = 0; layer < layers; layer++) {
+        std::string sid = get_string_id(layer);
+        if (has_initial_hidden) network.set_input_data("hidden:000:" + sid, hidden[0][layer]); // 0 is the chain link index
+        if (has_initial_cell) network.set_input_data("cell:000:" + sid, cell[0][layer]); // 0 is the chain link index
+    }
+
+    auto outputs = network.execute();
+    for (auto itr = outputs.begin(); itr != outputs.end(); itr++)
+    {
+        auto output_tensor = itr->second.get_memory().get_layout().size;
+        primitive_id primitive_name = itr->first;
+
+        // Split the primitive id to get the chain id
+        // E.g. primitive id: crop:last_cell:XXX:YYY
+        // XXX is the chain id
+        // YYY is the layer id
+        std::string chain_str = primitive_name.substr(primitive_name.find(":", primitive_name.find(":") + 1) + 1, 5);
+        std::string layer_str = primitive_name.substr(primitive_name.find(":", primitive_name.find(":", primitive_name.find(":") + 1) + 1) + 1, 5);
+        size_t chain_id = stoi(chain_str);
+        size_t layer_id = stoi(layer_str);
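+        // E.g. (hypothetical ids) "crop:last_cell:001:002": the two substrings start after the
+        // second and third ':' respectively; stoi() ignores anything from the next ':' onwards,
+        // so chain_id == 1 and layer_id == 2.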
+
+        cldnn::memory output_memory = itr->second.get_memory();
+        int32_t output_size = (int32_t)(itr->second.get_memory().size() / sizeof(T));
+        cldnn::tensor ref_output_tensor;
+        VVVVF<T> ref_primitive_output;
+
+        int32_t ref_batch_size = batch_size;
+        int32_t ref_hidden_size = hidden_size;
+        int32_t ref_directions = directions;
+
+        int32_t ref_seq_len = 1;
+
+        // Set the reference output against which the primitive's output will be compared
+        if (primitive_name.find("crop:last_cell") != std::string::npos)
+        {
+            ref_primitive_output = last_cell[chain_id][layer_id];
+        }
+        else if (emit_last_hidden || primitive_name.find("crop:last_hidden") != std::string::npos)
+        {
+            ref_primitive_output = last_hidden[chain_id][layer_id];
+        }
+        else
+        {
+            ref_seq_len = sequence_len;
+            ref_primitive_output = ref_output[chain_id][layers - 1];
+        }
+
+        ref_output_tensor = { ref_batch_size, ref_seq_len, ref_hidden_size, ref_directions };
+        int32_t ref_output_size = ref_batch_size * ref_seq_len * ref_hidden_size * ref_directions;
+
+        // The number of elements in reference should match the number of elements in the primitive's output
+        ASSERT_EQ(ref_output_size, output_size);
+
+        // Compare the output tensor configuration against the reference value
+        // Output tensor is configured in bfyx format
+        ASSERT_EQ(ref_batch_size, output_tensor.batch[0]);
+        ASSERT_EQ(ref_seq_len, output_tensor.feature[0]);              // Sequence length should match
+        ASSERT_EQ(ref_directions, output_tensor.spatial[1]);   // directions should match
+        ASSERT_EQ(ref_hidden_size, output_tensor.spatial[0]);  // hidden size should match
+
+        auto output_ptr = output_memory.pointer<T>();
+
+        int32_t i = 0;
+        for (int32_t b = 0; b < ref_batch_size; ++b) {
+            for (int32_t s = 0; s < ref_seq_len; ++s) {
+                for (int32_t d = 0; d < ref_directions; ++d) {
+                    for (int32_t x = 0; x < ref_hidden_size; ++x) {
+                        ASSERT_NEAR(ref_primitive_output[b][s][d][x], output_ptr[i++], FERROR);
                     }
                 }
             }
@@ -593,6 +1651,7 @@ void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batc
     }
 }
 
+
 TEST(lstm_gemm_gpu, generic_lstm_gemm_test_f32) {
     generic_lstm_gemm_gpu_test<float>(1, 1, 3, 6, 2, true, true);
 }
@@ -609,6 +1668,24 @@ TEST(lstm_gemm_gpu, generic_lstm_gemm_no_hidden_bias_f32) {
     generic_lstm_gemm_gpu_test<float>(1, 1, 3, 6, 2, false, false);
 }
 
+// LSTM GEMM tests exercising the LSTM GEMV kernel implementation
+TEST(lstm_gemm_gpu, gemv_bfyx_1x64_lstm_gemm_test_f32) {
+    generic_lstm_gemm_gpu_test<float>(5, 1, 1, 1024, 1024, true, true);
+}
+
+TEST(lstm_gemm_gpu, gemv_bfyx_1x64_lstm_gemm_no_bias_f32) {
+    generic_lstm_gemm_gpu_test<float>(1, 1, 1, 256, 2, false, true);
+}
+
+TEST(lstm_gemm_gpu, gemv_bfyx_1x64_lstm_gemm_no_hidden_f32) {
+    generic_lstm_gemm_gpu_test<float>(1, 1, 1, 64, 2, true, false);
+}
+
+TEST(lstm_gemm_gpu, gemv_bfyx_1x64_lstm_gemm_no_hidden_bias_f32) {
+    generic_lstm_gemm_gpu_test<float>(1, 1, 1, 64, 2, false, false);
+}
+
+// LSTM ELT Tests
 TEST(lstm_elt_gpu, generic_lstm_elt_test_clip_f32) {
     generic_lstm_elt_gpu_test<float>(1, 1, 4, 6, 3, true, 0.3f);
 }
@@ -751,9 +1828,234 @@ TEST(lstm_gpu, generic_lstm_stacked_seq_bi_f32) {
     generic_lstm_gpu_test<float>(4, 7, 2, 3, 3, 2, true, true, true);
 }
 
+// optional outputs support
+TEST(lstm_gpu, output_test_sequence_f32) {
+    lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_sequence, 1);
+}
+
+TEST(lstm_gpu, output_test_hidden_f32) {
+    lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_hidden, 1);
+}
+
+TEST(lstm_gpu, output_test_hidden_cell_f32) {
+    lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_hidden_cell, 1);
+}
+
+TEST(lstm_gpu, output_test_sequence_cell_f32) {
+    lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_sequence_cell, 1);
+}
+
+TEST(lstm_gpu, output_test_sequence_bi_f32) {
+    lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_sequence, 2);
+}
+
+TEST(lstm_gpu, output_test_hidden_bi_f32) {
+    lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_hidden, 2);
+}
+
+TEST(lstm_gpu, output_test_hidden_cell_bi_f32) {
+    lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_hidden_cell, 2);
+}
+
+TEST(lstm_gpu, output_test_sequence_cell_bi_f32) {
+    lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_sequence_cell, 2);
+}
+
+// format tests
+TEST(lstm_gpu, lstm_gpu_format_bfyx_f32) {
+    lstm_gpu_format_test<float>(cldnn::format::bfyx, 1);
+}
+
+TEST(lstm_gpu, lstm_gpu_format_bfyx_bi_f32) {
+    lstm_gpu_format_test<float>(cldnn::format::bfyx, 2);
+}
+
+TEST(lstm_gpu, lstm_gpu_format_fyxb_f32) {
+    lstm_gpu_format_test<float>(cldnn::format::fyxb, 1);
+}
+
+TEST(lstm_gpu, lstm_gpu_format_fyxb_bi_f32) {
+    lstm_gpu_format_test<float>(cldnn::format::fyxb, 2);
+}
+
+// test for LSTM users' dependencies
+TEST(lstm_gpu, lstm_users_f32) {
+    lstm_gpu_users_test<float>();
+}
+
+// Test for LSTM with concatenated input
+TEST(lstm_gpu, generic_lstm_concatenated_input) {
+    lstm_gpu_concatenated_input_test<float>(1, 2, 2, 1, 1, 1, true, true, true);
+}
+
+TEST(lstm_gpu, generic_lstm_concatenated_input_multi_layer) {
+    lstm_gpu_concatenated_input_test<float>(5, 5, 2, 1, 1, 4, true, true, true);
+}
+
+// test for LSTM with chain and stack (multilayer)
+TEST(lstm_gpu, generic_lstm_chained_unidirectional_f32) {
+    // batch size = 1
+    // input size = 2
+    // hidden size = 4
+    // directions = 1
+    // layers = 1
+    // chains = 2
+    // sequence length = 1
+    // output selection = output sequence and cell
+    lstm_gpu_chain_test<float>(1, 2, 4, 1, 1, 2, 1, cldnn_lstm_output::cldnn_lstm_output_sequence_cell);
+}
+
+TEST(lstm_gpu, generic_lstm_chained_bidirectional_f32) {
+    // batch size = 1
+    // input size = 2
+    // hidden size = 4
+    // directions = 2
+    // layers = 1
+    // chains = 1
+    // sequence length = 1
+    // output selection = output sequence and cell
+    lstm_gpu_chain_test<float>(1, 2, 4, 2, 1, 1, 1, cldnn_lstm_output::cldnn_lstm_output_sequence_cell);
+}
+
+TEST(lstm_gpu, generic_lstm_chained_no_stack_bidirectional_f32) {
+    // batch size = 2
+    // input size = 2
+    // hidden size = 4
+    // directions = 2
+    // layers = 1
+    // chains = 2
+    // sequence length = 5
+    // output selection = output sequence and cell
+    lstm_gpu_chain_test<float>(2, 2, 4, 2, 1, 2, 5, cldnn_lstm_output::cldnn_lstm_output_sequence_cell);
+}
+
+TEST(lstm_gpu, generic_lstm_chained_stacked_bidirectional_f32) {
+    // batch size = 2
+    // input size = 2
+    // hidden size = 4
+    // directions = 2
+    // layers = 4
+    // chains = 2
+    // sequence length = 5
+    // output selection = output sequence and cell
+    lstm_gpu_chain_test<float>(2, 2, 4, 2, 4, 2, 5, cldnn_lstm_output::cldnn_lstm_output_sequence_cell);
+}
+
+// FP16 Half precision tests
+TEST(lstm_gemm_gpu, generic_lstm_gemm_test_f16) {
+    generic_lstm_gemm_gpu_test<FLOAT16>(1, 1, 3, 6, 2, true, true);
+}
+
+TEST(lstm_gemm_gpu, generic_lstm_gemm_no_bias_f16) {
+    generic_lstm_gemm_gpu_test<FLOAT16>(1, 1, 3, 6, 2, false, true);
+}
+
+TEST(lstm_gemm_gpu, generic_lstm_gemm_no_hidden_f16) {
+    generic_lstm_gemm_gpu_test<FLOAT16>(1, 1, 3, 6, 2, true, false);
+}
+
+TEST(lstm_gemm_gpu, generic_lstm_gemm_no_hidden_bias_f16) {
+    generic_lstm_gemm_gpu_test<FLOAT16>(1, 1, 3, 6, 2, false, false);
+}
+
+TEST(lstm_elt_gpu, generic_lstm_elt_test_clip_f16) {
+    generic_lstm_elt_gpu_test<FLOAT16>(1, 1, 4, 6, 3, true, 0.3f);
+}
+
+TEST(lstm_elt_gpu, generic_lstm_elt_test_input_forget_f16) {
+    generic_lstm_elt_gpu_test<FLOAT16>(1, 1, 4, 6, 3, true, 0.f, 1);
+}
+
+TEST(lstm_elt_gpu, generic_lstm_elt_test_clip_input_forget_f16) {
+    generic_lstm_elt_gpu_test<FLOAT16>(1, 1, 4, 6, 3, true, 0.5f, 1);
+}
+
+TEST(lstm_elt_gpu, generic_lstm_elt_test_f16) {
+    generic_lstm_elt_gpu_test<FLOAT16>(1, 1, 4, 6, 3, true);
+}
+
+TEST(lstm_elt_gpu, generic_lstm_elt_no_cell_f16) {
+    generic_lstm_elt_gpu_test<FLOAT16>(1, 1, 4, 6, 3, false);
+}
+
+TEST(lstm_gpu, generic_lstm_f16) {
+    generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 3, 3, 2, true, true, true);
+}
+
+TEST(lstm_gpu, generic_lstm_no_bias_f16) {
+    generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 3, 3, 2, false, true, true);
+}
+
+TEST(lstm_gpu, generic_lstm_no_hidden_f16) {
+    generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 5, 4, 3, true, false, true);
+}
+
+TEST(lstm_gpu, generic_lstm_no_bias_hidden_f16) {
+    generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 5, 4, 3, false, false, true);
+}
+
+TEST(lstm_gpu, generic_lstm_no_cell_f16) {
+    generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 5, 4, 3, true, true, false);
+}
+
+TEST(lstm_gpu, generic_lstm_no_bias_cell_f16) {
+    generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 5, 4, 3, false, true, false);
+}
+
+TEST(lstm_gpu, generic_lstm_no_hidden_cell_f16) {
+    generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 5, 4, 3, true, false, false);
+}
+
+TEST(lstm_gpu, generic_lstm_no_bias_hidden_cell_f16) {
+    generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 5, 4, 3, false, false, false);
+}
+
+TEST(lstm_gpu, generic_lstm_clip_f16) {
+    generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 3, 3, 2, true, true, true, 0.3f, 0);
+}
+
+TEST(lstm_gpu, generic_lstm_input_forget_f16) {
+    generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 3, 3, 2, true, true, true, 0.f, 1);
+}
+
+TEST(lstm_gpu, generic_lstm_clip_input_forget_f16) {
+    generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 3, 3, 2, true, true, true, 0.3f, 1);
+}
+
+TEST(lstm_gpu, generic_lstm_offset_order_ifoz_f16) {
+    default_offset_type = cldnn_lstm_offset_order_ifoz;
+    generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 3, 3, 2, true, true, true);
+    default_offset_type = cldnn_lstm_offset_order_iofz;
+}
+
+TEST(lstm_gpu, generic_lstm_canonical_f16) {
+    generic_lstm_gpu_test<FLOAT16>(1, 1, 1, 1, 1, 1, true, true, true);
+}
+
+// bidirectional support
+TEST(lstm_gpu, generic_lstm_bi_bias_f16) {
+    generic_lstm_gpu_test<FLOAT16>(1, 7, 2, 2, 3, 4, true, false, false);
+}
+
+TEST(lstm_gpu, generic_lstm_bi_bias_hidden_f16) {
+    generic_lstm_gpu_test<FLOAT16>(1, 7, 2, 2, 3, 4, true, true, false);
+}
+
+TEST(lstm_gpu, generic_lstm_bi_bias_hidden_cell_f16) {
+    generic_lstm_gpu_test<FLOAT16>(1, 7, 2, 2, 3, 4, true, true, true);
+}
+
+// multi-layer support
+TEST(lstm_gpu, generic_lstm_stacked_seq_f16) {
+    generic_lstm_gpu_test<FLOAT16>(4, 7, 1, 3, 3, 2, true, true, true);
+}
+
+TEST(lstm_gpu, generic_lstm_stacked_bi_f16) {
+    generic_lstm_gpu_test<FLOAT16>(4, 7, 2, 3, 3, 2, true, true, true);
+}
+
 // TODO: Add tests for the following:
-// optional concatenate output
-// optional last hidden
-// optional last cell
+// integration testing using multi-layer and chained LSTMs
+// LSTM with a single input
 // optional activation list