2 // Copyright (c) 2016 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 ///////////////////////////////////////////////////////////////////////////////////////////////////
18 #include <gtest/gtest.h>
19 #include "api/CPP/memory.hpp"
20 #include <api/CPP/input_layout.hpp>
21 #include "api/CPP/lstm.hpp"
22 #include <api/CPP/split.hpp>
23 #include <api/CPP/crop.hpp>
24 #include <api/CPP/reshape.hpp>
25 #include <api/CPP/concatenation.hpp>
26 #include <api/CPP/topology.hpp>
27 #include <api/CPP/tensor.hpp>
28 #include <api/CPP/network.hpp>
29 #include <api/CPP/engine.hpp>
30 #include "test_utils/test_utils.h"
31 #include <api/CPP/data.hpp>
32 #include "instrumentation.h"
33 #include <test_utils/float16.h>
39 #pragma warning(disable: 4503)
42 using namespace cldnn;
43 using namespace tests;
// Logistic sigmoid activation, 1 / (1 + e^(-x)); reference gate activation used by
// the LSTM reference implementations below. (Computed in float even for FLOAT16 tests.)
48 float sigmoid(float x) {
49 return 1.f / (1.f + (float)std::exp((float)(-x)));
// Fragment of the offset_order helper: translates a cldnn_lstm_offset_order enum
// value into the element offsets of the four gates inside the concatenated
// 4 * hidden_size gate vector. NOTE(review): the struct's opening declaration and
// the scaling of the offsets by `scale` are on lines not visible in this chunk.
54 size_t it, ot, ft, zt;
55 offset_order(size_t scale, const cldnn_lstm_offset_order& t = cldnn_lstm_offset_order_iofz) {
// Permutation table: gate order for each supported enum value (i, o, f, z positions).
56 static const std::map<cldnn_lstm_offset_order, std::vector<size_t>> offset_map{
57 { cldnn_lstm_offset_order_iofz,{ 0, 1, 2, 3 } },
58 { cldnn_lstm_offset_order_ifoz,{ 0, 2, 1, 3 } }
60 std::vector<size_t> v = offset_map.at(t);
// File-wide default gate ordering used by every test helper below.
67 cldnn_lstm_offset_order default_offset_type = cldnn_lstm_offset_order_iofz;
// Symmetric clipping of `val` to the range [-threshold, threshold].
// NOTE(review): the template header, a presumable threshold<=0 early-out, and the
// final `return val;` fall on lines not visible in this chunk — confirm in full file.
70 T clip(T val, T threshold) {
72 if (val > threshold) return threshold;
73 if (val < -threshold) return -threshold;
// Reference GEMM stage of one LSTM time step:
//   tempGEMM[b] = W[dir] * input[b][seq][input_dir] + R[dir] * hidden[b][0][dir] + bias[dir]
// producing the concatenated pre-activation gate vector of length 4 * hidden_size
// per batch element. NOTE(review): the `if (hasBias)` / `if (hasHidden)` guards
// around the recurrent and bias terms fall on lines not visible in this chunk.
80 VVVVF<T> lstm_gemm_reference(VVVVF<T>& input, VVVVF<T>& weights, VVVVF<T>& recurrent, VVVVF<T>& bias, VVVVF<T>& hidden,
81 size_t seq, bool hasBias = true, bool hasHidden = true, size_t dir = 0, size_t input_dir = 0) {
82 size_t input_size = input[0][0][0].size();
83 size_t hidden_size = hidden[0][0][0].size();
84 size_t batch_size = input.size();
86 // Temporary output from GEMM operations [f, i, o, z]
87 VVVVF<T> tempGEMM(batch_size, VVVF<T>(1, VVF<T>(1, VF<T>(4 * hidden_size))));
88 for (size_t b = 0; b < batch_size; ++b) {
89 for (size_t y = 0; y < 4 * hidden_size; ++y) {
// Input contribution: row y of the weight matrix dotted with the input vector.
91 for (size_t x = 0; x < input_size; ++x) {
92 res += (T)weights[0][dir][y][x] * (T)input[b][seq][input_dir][x];
// Recurrent contribution: row y of the recurrent matrix dotted with the hidden state.
95 for (size_t x = 0; x < hidden_size; ++x) {
96 res += (T)recurrent[0][dir][y][x] * (T)hidden[b][0][dir][x];
100 res += (T)bias[0][0][dir][y];
102 tempGEMM[b][0][0][y] = res;
// Reference element-wise stage of one LSTM time step. Takes the pre-activation
// gate vector from lstm_gemm_reference, applies the gate activations and produces
//   tempOut[b][0] = new hidden state,  tempOut[b][1] = new cell state.
// clip_threshold clips gate pre-activations; input_forget couples input and forget
// gates. NOTE(review): the guards for input_forget (around line 135) and hasCell
// (around line 138) fall on lines not visible in this chunk.
108 template <typename T>
109 VVVVF<T> lstm_elt_reference(VVVVF<T>& tempGEMM, VVVVF<T>& cell,
110 bool hasCell = true, float clip_threshold = 0,
111 bool input_forget = false, size_t dir = 0)
113 size_t hidden_size = tempGEMM[0][0][0].size() / 4;
114 size_t batch_size = tempGEMM.size();
115 VVVVF<T> tempOut(batch_size, VVVF<T>(2, VVF<T>(1, VF<T>(hidden_size))));
// Resolve per-gate offsets inside the concatenated gate vector for the configured order.
116 offset_order off(hidden_size, default_offset_type);
118 for (size_t b = 0; b < batch_size; ++b) {
119 T *it = &tempGEMM[b][0][0][off.it];
120 T *ot = &tempGEMM[b][0][0][off.ot];
121 T *ft = &tempGEMM[b][0][0][off.ft];
122 T *zt = &tempGEMM[b][0][0][off.zt];
124 for (size_t h = 0; h < hidden_size; ++h) {
126 // Convert all inputs to float for all the elementwise operations. This is done to imitate
127 // how the lstm kernel performs the elementwise operations.
128 float fp32_it = (float)it[h];
129 float fp32_ot = (float)ot[h];
130 float fp32_ft = (float)ft[h];
131 float fp32_zt = (float)zt[h];
// Candidate cell contribution: sigmoid(input gate) * tanh(z gate).
132 float val = sigmoid(clip(fp32_it, clip_threshold)) * std::tanh(clip(fp32_zt, clip_threshold));
// input_forget path: scale candidate by (1 - forget pre-activation).
135 val *= (1 - fp32_ft);
// Previous-cell path: add cell state gated by sigmoid(forget gate).
138 val += (float)cell[b][0][dir][h] * sigmoid(clip(fp32_ft, clip_threshold));
141 // Convert back to output data type before storing it into the output buffer. Currently, the output
142 // data type may be float or FLOAT16 (half)
143 tempOut[b][0][0][h] = (T)(std::tanh(val) * sigmoid(fp32_ot));
144 tempOut[b][1][0][h] = (T)val;
// Debug helper: dumps a labeled 4-D tensor (b, f, y, x) to stdout.
// NOTE(review): passing a non-float T (e.g. FLOAT16) through the "%f" varargs
// conversion is undefined behavior — confirm callers only instantiate with float.
151 void print(const std::string& s, VVVVF<T>& input) {
152 printf("%s -------------\n", s.c_str());
153 printf("Size = [%d, %d, %d, %d]\n", (int)input.size(), (int)input[0].size(), (int)input[0][0].size(), (int)input[0][0][0].size());
154 for (size_t b = 0; b < input.size(); ++b) {
155 for (size_t f = 0; f < input[0].size(); ++f) {
156 for (size_t y = 0; y < input[0][0].size(); ++y) {
157 for (size_t x = 0; x < input[0][0][0].size(); ++x) {
158 printf("%f ", input[b][f][y][x]);
164 printf("---------------------------------------\n");
167 // input = [ batch, sequence, direction, input_size ]
168 // weights = [ 1, direction, 4 * hidden_size, input_size ]
169 // recurrent = [ 1, direction, 4 * hidden_size, hidden_size ]
170 // biases = [ 1, 1, direction, 4 * hidden_size ] optional
171 // cell = [ batch, direction, 1, hidden_size ] optional
172 // hidden = [ batch, direction, 1, hidden_size ] optional
173 // tempGEMM = [ batch, 1, 1, 4 * hidden_size ] temporary output
174 // output = [ batch, sequence, direction, hidden_size ] output
// Full reference LSTM: runs lstm_gemm_reference + lstm_elt_reference over every
// direction and time step, writing the per-step hidden states to `output` and the
// final states to `last_hidden` / `last_cell` (hidden and cell are updated in place).
// Dimension comments for all parameters are documented in the block comment above.
175 template <typename T>
176 void lstm_reference(VVVVF<T>& input, VVVVF<T>& hidden, VVVVF<T>& cell,
177 VVVVF<T>& weights, VVVVF<T>& recurrent, VVVVF<T>& bias,
178 VVVVF<T>& output, VVVVF<T>& last_hidden,
179 VVVVF<T>& last_cell, bool hasBias = true,
180 bool hasInitialHidden = true, bool hasInitialCell = true,
181 float clip_threshold = 0, bool input_forget = false,
182 bool scramble_input = true)
184 size_t sequence_len = input[0].size();
185 size_t dir_len = weights[0].size();
186 size_t batch = input.size();
187 size_t input_directions = input[0][0].size();
188 for (size_t dir = 0; dir < dir_len; ++dir) {
// Initial-state flags only apply to the first time step of each direction.
189 bool tempHasInitialHidden = hasInitialHidden;
190 bool tempHasInitialCell = hasInitialCell;
191 for (size_t seq = 0; seq < sequence_len; ++seq) {
193 size_t input_direction = dir;
// Bidirectional handling: remaps which input sequence element / direction slice
// feeds this step. NOTE(review): part of this remapping logic (seq_id declaration
// and surrounding condition) falls on lines not visible in this chunk — verify
// the reversed-sequence intent against the full file.
194 if (scramble_input) {
196 seq_id = input_directions == 1 ? sequence_len - seq - 1 : seq;
197 input_direction = input_directions - 1;
200 VVVVF<T> tempGEMM = lstm_gemm_reference(input, weights, recurrent, bias, hidden, seq_id, hasBias, tempHasInitialHidden, dir, input_direction);
201 VVVVF<T> tempOutput = lstm_elt_reference(tempGEMM, cell, tempHasInitialCell, clip_threshold, input_forget, dir);
202 // tempOutput[batch][0] = hidden and tempOutput[batch][1] = cell
203 for (size_t i = 0; i < batch; i++) {
204 output[i][seq][dir] = tempOutput[i][0][0];
205 hidden[i][0][dir] = tempOutput[i][0][0];
206 cell[i][0][dir] = tempOutput[i][1][0];
// From the second step on, the recurrent/cell inputs always exist.
208 tempHasInitialHidden = true;
209 tempHasInitialCell = true;
212 last_hidden = hidden;
// End-to-end test of the lstm_gemm primitive: generates random inputs, computes the
// expected pre-activation gate vector with lstm_gemm_reference, runs a one-node
// topology on the GPU engine, and compares element-wise with EXPECT_FLOAT_EQ.
// NOTE(review): the `topology topology;` declaration, the hasBias/hasHidden guards
// around topology/input setup, the fp16-skip early-return body, and the index `i`
// of the final loop fall on lines not visible in this chunk.
219 void generic_lstm_gemm_gpu_test(int sequence_len, int direction, int batch_size, int input_size, int hidden_size,
220 bool hasBias = true, bool hasHidden = true) {
221 int min_random = -2, max_random = 2;
// Reference data, then flattened copies in bfyx layout for uploading to device memory.
223 VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
224 VVVVF<T> ref_weights = generate_random_4d<T>(1, direction, 4 * hidden_size, input_size, min_random, max_random);
225 VVVVF<T> ref_recurrent = generate_random_4d<T>(1, direction, 4 * hidden_size, hidden_size, min_random, max_random);
226 VVVVF<T> ref_bias = generate_random_4d<T>(1, 1, direction, 4 * hidden_size, min_random, max_random);
227 VVVVF<T> ref_hidden = generate_random_4d<T>(batch_size, direction, 1, hidden_size, min_random, max_random);
228 VF<T> ref_input_vec = flatten_4d<T>(cldnn::format::bfyx, ref_input);
229 VF<T> ref_weights_vec = flatten_4d<T>(cldnn::format::bfyx, ref_weights);
230 VF<T> ref_recurrent_vec = flatten_4d<T>(cldnn::format::bfyx, ref_recurrent);
231 VF<T> ref_bias_vec = flatten_4d<T>(cldnn::format::bfyx, ref_bias);
232 VF<T> ref_hidden_vec = flatten_4d<T>(cldnn::format::bfyx, ref_hidden);
// Expected output for time step 0 only (seq = 0).
234 VVVVF<T> ref_output = lstm_gemm_reference(ref_input, ref_weights, ref_recurrent, ref_bias, ref_hidden, 0, hasBias, hasHidden);
236 constexpr auto dt = std::is_same<T, float>::value ? data_types::f32 : data_types::f16;
237 const auto& engine = get_test_engine();
239 // If the input is of fp16 type then, the memory will be allocated as such
// Skip the test on devices without fp16 support (skip body not visible here).
240 if (!engine.get_info().supports_fp16)
242 if (dt == data_types::f16)
248 memory input = memory::allocate(engine, { dt, format::bfyx, { batch_size, sequence_len, input_size, 1 } });
249 memory weights = memory::allocate(engine, { dt, format::bfyx, { 1, direction, input_size, 4 * hidden_size } });
250 memory recurrent = memory::allocate(engine, { dt, format::bfyx, { 1, direction, hidden_size, 4 * hidden_size } });
251 memory biases = memory::allocate(engine, { dt, format::bfyx, { 1, 1, 4 * hidden_size, direction } });
252 memory hidden = memory::allocate(engine, { dt, format::bfyx, { batch_size, direction, hidden_size, 1 } });
254 set_values(input, ref_input_vec);
255 set_values(weights, ref_weights_vec);
256 set_values(recurrent, ref_recurrent_vec);
257 set_values(biases, ref_bias_vec);
258 set_values(hidden, ref_hidden_vec);
// Single lstm_gemm node; empty primitive ids disable the optional bias/hidden inputs.
261 topology.add(input_layout("input", input.get_layout()));
262 topology.add(data("weights", weights));
263 topology.add(data("recurrent", recurrent));
265 topology.add(data("biases", biases));
268 topology.add(input_layout("hidden", hidden.get_layout()));
271 topology.add(lstm_gemm("lstm_gemm", "input", "weights", "recurrent", hasBias ? "biases" : "", hasHidden ? "hidden" : ""));
273 network network(engine, topology);
274 network.set_input_data("input", input);
276 network.set_input_data("hidden", hidden);
279 auto outputs = network.execute();
280 EXPECT_EQ(outputs.size(), size_t(1));
// Flat comparison against the reference gate vector for batch b.
282 auto output = outputs.begin()->second.get_memory();
283 auto output_ptr = output.pointer<T>();
285 for (int b = 0; b < batch_size; ++b) {
286 for (int x = 0; x < 4 * hidden_size; ++x)
287 EXPECT_FLOAT_EQ(ref_output[b][0][0][x], output_ptr[i++]);
// End-to-end test of the lstm_elt primitive: random gate vector + cell state,
// reference result from lstm_elt_reference, single-node GPU topology, element-wise
// ASSERT_NEAR comparison of the concatenated [hidden, cell] output.
// NOTE(review): the `topology topology;` declaration, the hasCell guards around the
// cell input, and the fp16-skip body fall on lines not visible in this chunk.
292 void generic_lstm_elt_gpu_test(int sequence_len, int direction, int batch_size, int input_size, int hidden_size, bool hasCell = true,
293 T clip_threshold = (T)0.f, bool input_forget = false) {
294 // tempGEMM = [ 1, direction, batch, 4 * hidden_size ] input
295 // cell = [ 1, direction, batch, hidden_size ] optional
296 // output = [ 2, direction, batch, hidden_size ] output concat[hidden, cell]
297 int min_random = -2, max_random = 2;
299 VVVVF<T> ref_tempGEMM = generate_random_4d<T>(batch_size, direction, 1, 4 * hidden_size, min_random, max_random);
300 VVVVF<T> ref_cell = generate_random_4d<T>(batch_size, direction, 1, hidden_size, min_random, max_random);
301 VF<T> ref_tempGEMM_vec = flatten_4d<T>(cldnn::format::bfyx, ref_tempGEMM);
302 VF<T> ref_cell_vec = flatten_4d<T>(cldnn::format::bfyx, ref_cell);
304 VVVVF<T> ref_output = lstm_elt_reference(ref_tempGEMM, ref_cell, hasCell, clip_threshold, input_forget);
306 // We observe some mismatch in down-converting from fp32 to fp16
307 // between the reference implementation and opencl kernel. This can be
308 // a simple rounding error. Thus, for fp16 we are increasing our tolerance
309 // to error from 1E-4 to 1E-2
310 constexpr float ferror = std::is_same<T, float>::value ? (float)1E-4 : (float)1E-2;
311 constexpr auto dt = std::is_same<T, float>::value ? data_types::f32 : data_types::f16;
312 const auto& engine = get_test_engine();
314 // If the input is of fp16 type then, the memory will be allocated as such
315 if (!engine.get_info().supports_fp16)
317 if (dt == data_types::f16)
323 memory tempGEMM = memory::allocate(engine, { dt, format::bfyx,{ batch_size, direction, 4 * hidden_size, 1 } });
324 memory cell = memory::allocate(engine, { dt, format::bfyx,{ batch_size, direction, hidden_size, 1 } });
325 set_values(tempGEMM, ref_tempGEMM_vec);
326 set_values(cell, ref_cell_vec);
// Empty cell id disables the optional cell input on the primitive.
329 topology.add(input_layout("tempGEMM", tempGEMM.get_layout()));
331 topology.add(input_layout("cell", cell.get_layout()));
333 topology.add(lstm_elt("lstm_elt", "tempGEMM", hasCell ? "cell" : "", clip_threshold, input_forget));
335 network network(engine, topology);
336 network.set_input_data("tempGEMM", tempGEMM);
338 network.set_input_data("cell", cell);
341 auto outputs = network.execute();
342 EXPECT_EQ(outputs.size(), size_t(1));
// Output layout: per batch, j=0 is the hidden half, j=1 the cell half.
344 auto output = outputs.begin()->second.get_memory();
345 auto output_ptr = output.pointer<T>();
346 for (int b = 0; b < batch_size; ++b) {
347 for (int j = 0; j < 2; ++j) {
348 for (int x = 0; x < hidden_size; ++x)
350 auto idx = b * 2 * hidden_size + j * hidden_size + x;
351 ASSERT_NEAR(ref_output[b][j][0][x], output_ptr[idx] , ferror);
// Zero-pads `i` to a fixed-width 5-digit string, used to build unique primitive ids
// (e.g. "lstm_gemm00003"). NOTE(review): the `return ss.str();` line is not visible
// in this chunk.
357 std::string get_string_id(size_t i) {
358 std::stringstream ss;
359 ss << std::setw(5) << std::setfill('0') << i;
363 // --------------- Manually constructed LSTM ----------------------------------------
364 // This function manually generates an lstm node sequence by combining lstm_gemm and lstm_elt nodes.
365 // It requires that the output of the lstm_elt node is cropped to obtain the corresponding hidden and cell outputs.
// Builds an unrolled single-layer LSTM topology by hand: split the input along the
// sequence axis, then for each step chain lstm_gemm -> lstm_elt and crop the result
// into hidden/cell slices, finally concatenating all per-step hidden outputs.
// NOTE(review): the hasBias/hasInitialCell guard lines and several closing braces
// fall on lines not visible in this chunk.
366 void generate_lstm_topology(topology& t, memory& input, memory& hidden, memory& cell,
367 memory& weights, memory& recurrent, memory& biases, int sequence_len,
368 bool hasBias = true, bool hasInitialHidden = true, bool hasInitialCell = true) {
369 auto hidden_size = hidden.get_layout().size;
370 t.add(input_layout("input", input.get_layout()));
// One split output per sequence step, offset along the feature (f) axis.
371 std::vector<std::pair<primitive_id, tensor>> input_ids_offsets;
372 std::vector<primitive_id> output_ids_offsets;
373 for (int i = 0; i < sequence_len; ++i)
374 input_ids_offsets.push_back({ get_string_id(i),{ 0, i, 0, 0 } });
375 t.add(split("inputSplit", "input", input_ids_offsets));
376 t.add(data("weights", weights));
377 t.add(data("recurrent", recurrent));
// Empty ids mean "optional input absent" for the primitives below.
379 std::string biasStr = "";
380 std::string hiddenStr = "";
381 std::string cellStr = "";
384 t.add(data("biases", biases));
387 if (hasInitialHidden)
389 t.add(input_layout("hidden", hidden.get_layout()));
390 hiddenStr = "hidden";
394 t.add(input_layout("cell", cell.get_layout()));
// Unroll: each step consumes the previous step's cropped hidden/cell state.
397 for (int i = 0; i < sequence_len; ++i) {
398 std::string lstm_gemm_id = "lstm_gemm" + get_string_id(i);
399 std::string lstm_elt_id = "lstm_elt" + get_string_id(i);
400 std::string crop_id = "crop" + get_string_id(i);
402 t.add(lstm_gemm(lstm_gemm_id, "inputSplit:" + get_string_id(i), "weights", "recurrent", biasStr, hiddenStr));
403 t.add(lstm_elt(lstm_elt_id, lstm_gemm_id, cellStr));
// lstm_elt output is [hidden; cell] stacked along f: crop f=0 for hidden, f=1 for cell.
405 hiddenStr = crop_id + ":hidden";
406 t.add(crop(hiddenStr, lstm_elt_id, hidden_size, tensor{ 0,0,0,0 }));
407 if (i < sequence_len - 1) {
408 cellStr = crop_id + ":cell";
409 t.add(crop(cellStr, lstm_elt_id, hidden_size, tensor{ 0,1,0,0 }));
411 output_ids_offsets.push_back(hiddenStr);
413 t.add(concatenation("concatenation", output_ids_offsets, concatenation::along_f));
// Tests the manually-unrolled LSTM topology from generate_lstm_topology against
// lstm_reference: random inputs, single-layer run on the GPU engine, then an
// element-wise ASSERT_NEAR sweep over [batch, seq, hidden, direction].
// NOTE(review): the `topology topology;` declaration, FERROR constant, and the
// flat index `i` of the final loop fall on lines not visible in this chunk.
420 std::cout << "Input Size = " << input_size << " Hidden Size = " << hidden_size << " Sequence Len = " << sequence_len << " Batch Size = " << batch_size << std::endl;
421 int min_random = -2, max_random = 2;
422 VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
423 VVVVF<T> ref_weights = generate_random_4d<T>(1, direction, 4 * hidden_size, input_size, min_random, max_random);
424 VVVVF<T> ref_recurrent = generate_random_4d<T>(1, direction, 4 * hidden_size, hidden_size, min_random, max_random);
425 VVVVF<T> ref_bias = generate_random_4d<T>(1, 1, direction, 4 * hidden_size, min_random, max_random);
426 VVVVF<T> ref_hidden = generate_random_4d<T>(batch_size, direction, 1, hidden_size, min_random, max_random);
427 VVVVF<T> ref_cell = generate_random_4d<T>(batch_size, direction, 1, hidden_size, min_random, max_random);
428 VVVVF<T> ref_output(batch_size, VVVF<T>(sequence_len, VVF<T>(direction, VF<T>(hidden_size))));
429 VVVVF<T> last_hidden(batch_size, VVVF<T>(direction, VVF<T>(1, VF<T>(hidden_size))));
430 VVVVF<T> last_cell(batch_size, VVVF<T>(direction, VVF<T>(1, VF<T>(hidden_size))));
432 VF<T> ref_input_vec = flatten_4d<T>(cldnn::format::bfyx, ref_input);
433 VF<T> ref_weights_vec = flatten_4d<T>(cldnn::format::bfyx, ref_weights);
434 VF<T> ref_recurrent_vec = flatten_4d<T>(cldnn::format::bfyx, ref_recurrent);
435 VF<T> ref_bias_vec = flatten_4d<T>(cldnn::format::bfyx, ref_bias);
436 VF<T> ref_hidden_vec = flatten_4d<T>(cldnn::format::bfyx, ref_hidden);
437 VF<T> ref_cell_vec = flatten_4d<T>(cldnn::format::bfyx, ref_cell);
// Golden result computed on the host.
438 lstm_reference(ref_input, ref_hidden, ref_cell, ref_weights, ref_recurrent, ref_bias, ref_output, last_hidden, last_cell,
439 hasBias, hasInitialHidden, hasInitialCell);
441 const auto& engine = get_test_engine();
442 memory input = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ batch_size, sequence_len, input_size, 1 } });
443 memory weights = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, direction, input_size, 4 * hidden_size } });
444 memory recurrent = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, direction, hidden_size, 4 * hidden_size } });
445 memory biases = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1, 1, 4 * hidden_size, direction } });
446 memory hidden = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ batch_size, direction, hidden_size, 1 } });
447 memory cell = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ batch_size, direction, hidden_size, 1 } });
448 set_values(input, ref_input_vec);
449 set_values(weights, ref_weights_vec);
450 set_values(recurrent, ref_recurrent_vec);
451 set_values(biases, ref_bias_vec);
452 set_values(hidden, ref_hidden_vec);
453 set_values(cell, ref_cell_vec);
456 generate_lstm_topology(topology, input, hidden, cell, weights, recurrent, biases, sequence_len,
457 hasBias, hasInitialHidden, hasInitialCell);
459 network network(engine, topology);
460 network.set_input_data("input", input);
461 if (hasInitialHidden) network.set_input_data("hidden", hidden);
462 if (hasInitialCell) network.set_input_data("cell", cell);
463 auto outputs = network.execute();
// The concatenation node is the topology's single output.
465 ASSERT_EQ(outputs.size(), size_t(1));
466 size_t output_size = outputs.begin()->second.get_memory().size() / sizeof(T);
467 ASSERT_EQ(output_size, size_t(hidden_size * sequence_len * batch_size * direction));
469 auto output = outputs.begin()->second.get_memory();
470 auto output_ptr = output.pointer<T>();
472 for (int b = 0; b < batch_size; ++b) {
473 for (int s = 0; s < sequence_len; ++s) {
474 for (int x = 0; x < hidden_size; ++x) {
475 for (int d = 0; d < direction; ++d) {
476 ASSERT_NEAR(ref_output[b][s][d][x], output_ptr[i++], FERROR);
483 // -------------------------------------------------------
// Tests the fused `lstm` primitive for an arbitrary stack of layers: builds the
// reference output layer by layer with lstm_reference, then a GPU topology where
// layer 0 consumes the split input sequence and each subsequent layer consumes the
// previous lstm node's output; compares the last layer's sequence output.
// NOTE(review): the `topology topology;` declaration, the if/else branch selecting
// between the two lstm(...) adds, several closing braces, and the flat index `i`
// of the final loop fall on lines not visible in this chunk.
485 void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batch_size, int input_size, int hidden_size,
486 bool hasBias = true, bool hasInitialHidden = true, bool hasInitialCell = true,
487 T clip_threshold = 0, bool input_forget = false) {
488 std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size
489 << " Sequence Len = " << sequence_len << " Direction = " << direction << " Batch Size = " << batch_size << std::endl;
490 int min_random = -2, max_random = 2;
492 VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
// Per-layer reference parameters and outputs.
494 std::vector<VVVVF<T>> ref_weights;
495 std::vector<VVVVF<T>> ref_recurrent;
496 std::vector<VVVVF<T>> ref_bias;
497 std::vector<VVVVF<T>> ref_hidden;
498 std::vector<VVVVF<T>> ref_cell;
499 std::vector<VVVVF<T>> ref_output;
501 for (int i = 0; i < layers; ++i) {
// Layer 0 consumes input_size features; deeper layers consume hidden_size.
502 ref_weights.push_back(generate_random_4d<T>(1, direction, 4 * hidden_size, i==0 ? input_size : hidden_size, min_random, max_random));
503 ref_recurrent.push_back(generate_random_4d<T>(1, direction, 4 * hidden_size, hidden_size, min_random, max_random));
504 ref_bias.push_back(generate_random_4d<T>(1, 1, direction, 4 * hidden_size, min_random, max_random));
505 ref_hidden.push_back(generate_random_4d<T>(batch_size, 1, direction, hidden_size, min_random, max_random));
506 ref_cell.push_back(generate_random_4d<T>(batch_size, 1, direction, hidden_size, min_random, max_random));
507 ref_output.push_back(VVVVF<T>(batch_size, VVVF<T>(sequence_len, VVF<T>(direction, VF<T>(hidden_size)))));
510 VF<T> ref_input_vec = flatten_4d<T>(cldnn::format::bfyx, ref_input);
511 std::vector<VF<T>> ref_weights_vec;
512 std::vector<VF<T>> ref_recurrent_vec;
513 std::vector<VF<T>> ref_bias_vec;
514 std::vector<VF<T>> ref_hidden_vec;
515 std::vector<VF<T>> ref_cell_vec;
516 for (int i = 0; i < layers; ++i) {
517 ref_weights_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_weights[i]));
518 ref_recurrent_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_recurrent[i]));
519 ref_bias_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_bias[i]));
520 ref_hidden_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_hidden[i]));
521 ref_cell_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_cell[i]));
524 VVVVF<T> last_hidden(batch_size, VVVF<T>(1, VVF<T>(direction, VF<T>(hidden_size))));
525 VVVVF<T> last_cell(batch_size, VVVF<T>(1, VVF<T>(direction, VF<T>(hidden_size))));
// Layer 0 scrambles the input sequence (scramble_input=true); subsequent layers
// feed the previous layer's output through unscrambled (false).
527 lstm_reference(ref_input, ref_hidden[0], ref_cell[0], ref_weights[0], ref_recurrent[0], ref_bias[0], ref_output[0],
528 last_hidden, last_cell, hasBias, hasInitialHidden, hasInitialCell,
529 clip_threshold, input_forget, true);
531 for (int i = 1; i < layers; ++i) {
532 lstm_reference(ref_output[i - 1], ref_hidden[i], ref_cell[i], ref_weights[i], ref_recurrent[i],
533 ref_bias[i], ref_output[i],
534 last_hidden, last_cell, hasBias, hasInitialHidden, hasInitialCell,
535 clip_threshold, input_forget, false);
538 // We observe some mismatch in down-converting from fp32 to fp16
539 // between the reference implementation and opencl kernel. This can be
540 // a simple rounding error. Thus, for fp16 we are increasing our tolerance
541 // to error from 1E-4 to 1E-2
542 constexpr float ferror = std::is_same<T, float>::value ? (float)1E-4 : (float)1E-2;
543 constexpr auto dt = std::is_same<T, float>::value ? data_types::f32 : data_types::f16;
544 const auto& engine = get_test_engine();
546 // If the input is of fp16 type then, the memory will be allocated as such
547 if (!engine.get_info().supports_fp16)
549 if (dt == data_types::f16)
555 memory input = memory::allocate(engine, { dt, format::bfyx, {batch_size, sequence_len, input_size, 1} });
556 set_values(input, ref_input_vec);
// Per-layer device buffers mirroring the reference data above.
558 std::vector<memory> weights;
559 std::vector<memory> recurrent;
560 std::vector<memory> biases;
561 std::vector<memory> hidden;
562 std::vector<memory> cell;
563 for(int i = 0; i < layers; ++i) {
564 weights.push_back(memory::allocate(engine, { dt, format::bfyx, { 1, direction, i==0 ? input_size : hidden_size, 4 * hidden_size } }));
565 set_values(weights[i], ref_weights_vec[i]);
566 recurrent.push_back(memory::allocate(engine, { dt, format::bfyx, { 1, direction, hidden_size, 4 * hidden_size } }));
567 set_values(recurrent[i], ref_recurrent_vec[i]);
569 biases.push_back(memory::allocate(engine, { dt, format::bfyx, { 1, 1, 4 * hidden_size, direction } }));
570 set_values(biases[i], ref_bias_vec[i]);
572 if (hasInitialHidden) {
573 hidden.push_back(memory::allocate(engine, { dt, format::bfyx, { batch_size, 1, hidden_size, direction } }));
574 set_values(hidden[i], ref_hidden_vec[i]);
576 if (hasInitialCell) {
577 cell.push_back(memory::allocate(engine, { dt, format::bfyx, { batch_size, 1, hidden_size, direction} }));
578 set_values(cell[i], ref_cell_vec[i]);
583 std::vector<std::pair<primitive_id, tensor>> input_ids_offsets;
584 std::vector<primitive_id> lstm_inputs;
585 std::vector<primitive_id> output_ids_offsets;
587 topology.add(input_layout("input", input.get_layout()));
// Split the input sequence; the split outputs feed the first lstm layer.
588 for (int i = 0; i < sequence_len; ++i) {
589 input_ids_offsets.push_back({get_string_id(i), {0, i, 0, 0}});
590 lstm_inputs.push_back("inputSplit:"+get_string_id(i));
592 topology.add(split("inputSplit", "input", input_ids_offsets));
593 cldnn::primitive_id prev_lstm_id;
594 for(int i = 0; i < layers; ++i) {
595 std::string sid = get_string_id(i);
596 std::string lstm_id = "lstm" + sid;
597 std::string weights_id = "weights" + sid;
598 std::string recurrent_id = "recurrent" + sid;
599 std::string biases_id = "biases" + sid;
600 std::string hidden_id = "hidden" + sid;
601 std::string cell_id = "cell" + sid;
603 topology.add(data(weights_id, weights[i]));
604 topology.add(data(recurrent_id, recurrent[i]));
605 if (hasBias) topology.add(data(biases_id, biases[i]));
606 if (hasInitialHidden) topology.add(input_layout(hidden_id, hidden[i].get_layout()));
607 if (hasInitialCell) topology.add(input_layout(cell_id, cell[i].get_layout()));
// First layer: consumes the split input sequence.
609 topology.add(lstm(lstm_id, lstm_inputs, weights_id, recurrent_id,
610 hasBias ? biases_id : "", hasInitialHidden ? hidden_id : "", hasInitialCell ? cell_id : "", "",
611 clip_threshold, input_forget, {}, {},
612 cldnn_lstm_output::cldnn_lstm_output_sequence, default_offset_type));
// Deeper layers: consume the previous lstm node's output sequence.
615 topology.add(lstm(lstm_id, { prev_lstm_id }, weights_id, recurrent_id,
616 hasBias ? biases_id : "", hasInitialHidden ? hidden_id : "", hasInitialCell ? cell_id : "", "",
617 clip_threshold, input_forget, {}, {},
618 cldnn_lstm_output::cldnn_lstm_output_sequence, default_offset_type));
620 prev_lstm_id = lstm_id;
623 network network(engine, topology);
624 network.set_input_data("input", input);
625 for (int i = 0; i < layers; ++i) {
626 std::string sid = get_string_id(i);
627 if (hasInitialHidden) network.set_input_data("hidden" + sid, hidden[i]);
628 if (hasInitialCell) network.set_input_data("cell" + sid, cell[i]);
630 auto outputs = network.execute();
632 ASSERT_EQ(outputs.size(), size_t(1));
633 size_t output_size = outputs.begin()->second.get_memory().size() / sizeof(T);
634 ASSERT_EQ(output_size, size_t(hidden_size * sequence_len * batch_size * direction));
636 auto output = outputs.begin()->second.get_memory();
638 // Get the output tensor
639 cldnn::layout output_layout = output.get_layout();
640 cldnn::tensor output_tensor = output_layout.size;
642 // Compare the output tensor configuration against the reference value
643 // Output tensor is configured in bfyx format
644 ASSERT_EQ(batch_size, output_tensor.batch[0]);
645 ASSERT_EQ(sequence_len, output_tensor.feature[0]);
646 ASSERT_EQ(direction, output_tensor.spatial[1]);
647 ASSERT_EQ(hidden_size, output_tensor.spatial[0]);
649 auto output_ptr = output.pointer<T>();
// Only the last layer's output is compared; earlier layers are validated implicitly.
651 for (int32_t b = 0; b < batch_size; ++b) {
652 for (int32_t s = 0; s < sequence_len; ++s) {
653 for (int32_t d = 0; d < direction; ++d) {
654 for (int32_t x = 0; x < hidden_size; ++x) {
655 ASSERT_NEAR(ref_output[layers - 1][b][s][d][x], output_ptr[i++], ferror);
663 // -------------------------------------------------------
// Tests the `lstm` primitive's output-selection modes (sequence / last hidden /
// last cell combinations): runs a single-layer LSTM, crops the fused output into
// the selected pieces, and validates each returned primitive against the matching
// reference tensor. NOTE(review): this function continues past the end of the
// visible chunk (the final per-element comparison is not shown), and several local
// declarations (layers, input_size, hidden_size, batch_size, seq_id, topology)
// fall on lines not visible here.
665 void lstm_gpu_output_test(const cldnn_lstm_output& output_selection, int directions) {
667 int sequence_len = 4;
672 std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size
673 << " Sequence Len = " << sequence_len << " Directions = " << directions << " Batch Size = " << batch_size
674 << " Output selection: " << output_selection << std::endl;
675 int min_random = -2, max_random = 2;
677 VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
678 VVVVF<T> ref_weights = generate_random_4d<T>(1, directions, 4 * hidden_size, input_size, min_random, max_random);
679 VVVVF<T> ref_recurrent = generate_random_4d<T>(1, directions, 4 * hidden_size, hidden_size, min_random, max_random);
680 VVVVF<T> ref_bias = generate_random_4d<T>(1, 1, directions, 4 * hidden_size, min_random, max_random);
681 VVVVF<T> ref_hidden = generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random);
682 VVVVF<T> ref_cell = generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random);
683 VVVVF<T> ref_output = VVVVF<T>(batch_size, VVVF<T>(sequence_len, VVF<T>(directions, VF<T>(hidden_size))));
685 VF<T> ref_input_vec = flatten_4d<T>(cldnn::format::bfyx, ref_input);
686 VF<T> ref_weights_vec = flatten_4d<T>(cldnn::format::bfyx, ref_weights);
687 VF<T> ref_recurrent_vec = flatten_4d<T>(cldnn::format::bfyx, ref_recurrent);
688 VF<T> ref_bias_vec = flatten_4d<T>(cldnn::format::bfyx, ref_bias);
689 VF<T> ref_hidden_vec = flatten_4d<T>(cldnn::format::bfyx, ref_hidden);
690 VF<T> ref_cell_vec = flatten_4d<T>(cldnn::format::bfyx, ref_cell);
692 VVVVF<T> last_hidden(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))));
693 VVVVF<T> last_cell(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))));
// Golden result with bias, initial hidden, and initial cell all enabled.
695 lstm_reference(ref_input, ref_hidden, ref_cell, ref_weights, ref_recurrent, ref_bias, ref_output,
696 last_hidden, last_cell, true, true, true,
699 const auto& engine = get_test_engine();
701 memory input = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {batch_size, sequence_len, input_size, 1} });
702 memory weights = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, directions, input_size , 4 * hidden_size } });
703 memory recurrent = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, directions, hidden_size, 4 * hidden_size } });
704 memory biases = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, 1, 4 * hidden_size, directions } });
705 memory hidden = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size, 1, hidden_size, directions } });
706 memory cell = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size, 1, hidden_size, directions } });
708 set_values(input, ref_input_vec);
709 set_values(weights, ref_weights_vec);
710 set_values(recurrent, ref_recurrent_vec);
711 set_values(biases, ref_bias_vec);
712 set_values(hidden, ref_hidden_vec);
713 set_values(cell, ref_cell_vec);
// Decode which pieces the selected output mode is expected to emit.
715 bool emit_last_cell = output_selection == cldnn_lstm_output_hidden_cell ||
716 output_selection == cldnn_lstm_output_sequence_cell;
717 bool emit_last_hidden = output_selection == cldnn_lstm_output_hidden ||
718 output_selection == cldnn_lstm_output_hidden_cell;
721 std::vector<std::pair<primitive_id, tensor>> input_ids_offsets;
722 std::vector<primitive_id> lstm_inputs;
723 std::vector<primitive_id> output_ids_offsets;
725 topology.add(input_layout("input", input.get_layout()));
726 for (int i = 0; i < sequence_len; ++i)
728 input_ids_offsets.push_back({get_string_id(i), {0, i, 0, 0}});
729 lstm_inputs.push_back("inputSplit:"+get_string_id(i));
731 topology.add(split("inputSplit", "input", input_ids_offsets));
732 topology.add(data("weights", weights));
733 topology.add(data("recurrent", recurrent));
734 topology.add(data("biases", biases));
735 topology.add(input_layout("hidden", hidden.get_layout()));
736 topology.add(input_layout("cell", cell.get_layout()));
737 topology.add(lstm("lstm", lstm_inputs, "weights", "recurrent",
738 "biases", "hidden", "cell", "", 0, false, {}, {},
739 output_selection, default_offset_type));
// Crop the fused lstm output: the hidden part spans concatenation_len - 1 steps
// along f, the cell part (when emitted) is the final f slice.
742 int32_t concatenation_len = emit_last_hidden ? 2 : sequence_len + 1;
743 tensor hidden_tensor {batch_size, concatenation_len - 1, hidden_size, directions};
744 tensor cell_tensor {batch_size, 1, hidden_size, directions};
745 topology.add(crop(emit_last_hidden ? "crop:last_hidden" : "crop:sequence", "lstm", hidden_tensor, tensor {0, 0, 0, 0}));
746 topology.add(crop("crop:last_cell", "lstm", cell_tensor, tensor {0, concatenation_len - 1, 0, 0}));
749 network network(engine, topology);
750 network.set_input_data("input", input);
751 network.set_input_data("hidden", hidden);
752 network.set_input_data("cell", cell);
754 auto outputs = network.execute();
755 uint32_t ref_num_output_primitives = 1; // Output will return at least 1 primitive
757 if (emit_last_cell) {
758 // add another primitve to account for cell state if the output selection includes cell state
759 ref_num_output_primitives += 1;
762 // check if the number of returned primitives match the expected number of output primitives
763 ASSERT_EQ(ref_num_output_primitives, outputs.size());
// Validate each returned primitive against the matching reference tensor,
// dispatched by the crop primitive's name.
765 for (auto itr = outputs.begin(); itr != outputs.end(); itr++)
767 auto output_tensor = itr->second.get_memory().get_layout().size;
768 primitive_id primitive_name = itr->first;
770 cldnn::memory output_memory = itr->second.get_memory();
771 int32_t output_size = (int32_t)(itr->second.get_memory().size() / sizeof(T));
772 cldnn::tensor ref_output_tensor;
773 VVVVF<T> ref_primitive_output;
775 int32_t ref_batch_size = batch_size;
776 int32_t ref_hidden_size = hidden_size;
777 int32_t ref_directions = directions;
779 int32_t ref_seq_len = 1;
780 // Set the reference output against which the primitive's output will be compared
781 if (primitive_name.find("crop:last_cell") != std::string::npos)
783 ref_primitive_output = last_cell;
785 else if (emit_last_hidden || primitive_name.find("crop:last_hidden") != std::string::npos)
787 ref_primitive_output = last_hidden;
791 ref_seq_len = sequence_len;
792 ref_primitive_output = ref_output;
795 ref_output_tensor = { ref_batch_size, ref_seq_len, ref_hidden_size, ref_directions };
796 int32_t ref_output_size = ref_batch_size * ref_seq_len * ref_hidden_size * ref_directions;
798 // The number of elements in reference should match the number of elements in the primitive's output
799 ASSERT_EQ(ref_output_size , output_size);
801 // Compare the output tensor configuration against the reference value
802 // Output tensor is configured in bfyx format
803 ASSERT_EQ(ref_batch_size, output_tensor.batch[0]);
804 ASSERT_EQ(ref_seq_len, output_tensor.feature[0]); // Sequence length should match
805 ASSERT_EQ(ref_directions, output_tensor.spatial[1]); // directions should match
806 ASSERT_EQ(ref_hidden_size, output_tensor.spatial[0]); // input size should match
808 auto output_ptr = output_memory.pointer<T>();
811 for (int32_t b = 0; b < ref_batch_size; ++b) {
812 for (int32_t s = 0; s < ref_seq_len; ++s) {
813 for (int32_t d = 0; d < ref_directions; ++d) {
814 for (int32_t x = 0; x < ref_hidden_size; ++x) {
815 ASSERT_NEAR(ref_primitive_output[b][s][d][x], output_ptr[i++], FERROR);
824 // -------------------------------------------------------
// Runs a single-layer LSTM through the clDNN GPU topology in the requested
// memory `format` (bfyx or fyxb) and compares every emitted element against
// the host-side lstm_reference() implementation.
// NOTE(review): the template header and several size declarations (layers,
// batch_size, input_size, hidden_size) appear on lines elided from this view;
// only sequence_len is visible below — confirm against the full file.
826 void lstm_gpu_format_test(const cldnn::format& format, int directions) {
828 int sequence_len = 6;
// Emit the full hidden-state sequence (no separate last-cell primitive).
833 cldnn_lstm_output output_selection = cldnn_lstm_output::cldnn_lstm_output_sequence;
835 std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size
836 << " Sequence Len = " << sequence_len << " Directions = " << directions << " Batch Size = " << batch_size
837 << " Output selection: " << output_selection << std::endl;
838 int min_random = -2, max_random = 2;
// Random reference tensors; weights/recurrent hold the 4 gates stacked along
// one axis (hence the 4 * hidden_size dimension).
840 VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
841 VVVVF<T> ref_weights = generate_random_4d<T>(1, directions, 4 * hidden_size, input_size, min_random, max_random);
842 VVVVF<T> ref_recurrent = generate_random_4d<T>(1, directions, 4 * hidden_size, hidden_size, min_random, max_random);
843 VVVVF<T> ref_bias = generate_random_4d<T>(1, 1, directions, 4 * hidden_size, min_random, max_random);
844 VVVVF<T> ref_hidden = generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random);
845 VVVVF<T> ref_cell = generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random);
846 VVVVF<T> ref_output = VVVVF<T>(batch_size, VVVF<T>(sequence_len, VVF<T>(directions, VF<T>(hidden_size))));
// Input/hidden/cell are flattened in the format under test; the constant
// weight-style tensors always use bfyx.
848 VF<T> ref_input_vec = flatten_4d<T>(format, ref_input);
849 VF<T> ref_weights_vec = flatten_4d<T>(cldnn::format::bfyx, ref_weights);
850 VF<T> ref_recurrent_vec = flatten_4d<T>(cldnn::format::bfyx, ref_recurrent);
851 VF<T> ref_bias_vec = flatten_4d<T>(cldnn::format::bfyx, ref_bias);
852 VF<T> ref_hidden_vec = flatten_4d<T>(format, ref_hidden);
853 VF<T> ref_cell_vec = flatten_4d<T>(format, ref_cell);
855 VVVVF<T> last_hidden(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))));
856 VVVVF<T> last_cell(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))));
// Compute the expected sequence plus final hidden/cell state on the host
// (bias/initial-hidden/initial-cell all enabled).
858 lstm_reference(ref_input, ref_hidden, ref_cell, ref_weights, ref_recurrent, ref_bias, ref_output,
859 last_hidden, last_cell, true, true, true,
862 const auto& engine = get_test_engine();
// Device-side buffers mirroring the reference tensors above.
864 memory input = memory::allocate(engine, { type_to_data_type<T>::value,format, {batch_size, sequence_len, input_size, 1} });
865 memory weights = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, directions, input_size , 4 * hidden_size } });
866 memory recurrent = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, directions, hidden_size, 4 * hidden_size } });
867 memory biases = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, 1, 4 * hidden_size, directions } });
868 memory hidden = memory::allocate(engine, { type_to_data_type<T>::value, format, { batch_size, 1, hidden_size, directions } });
869 memory cell = memory::allocate(engine, { type_to_data_type<T>::value, format, { batch_size, 1, hidden_size, directions } });
871 set_values(input, ref_input_vec);
872 set_values(weights, ref_weights_vec);
873 set_values(recurrent, ref_recurrent_vec);
874 set_values(biases, ref_bias_vec);
875 set_values(hidden, ref_hidden_vec);
876 set_values(cell, ref_cell_vec);
// Which extra outputs the LSTM node is expected to expose.
878 bool emit_last_cell = output_selection == cldnn_lstm_output_hidden_cell ||
879 output_selection == cldnn_lstm_output_sequence_cell;
880 bool emit_last_hidden = output_selection == cldnn_lstm_output_hidden ||
881 output_selection == cldnn_lstm_output_hidden_cell;
884 std::vector<std::pair<primitive_id, tensor>> input_ids_offsets;
885 std::vector<primitive_id> lstm_inputs;
886 std::vector<primitive_id> output_ids_offsets;
// Split the input along the sequence (feature) axis; each per-step slice
// becomes one LSTM input.
888 topology.add(input_layout("input", input.get_layout()));
889 for (int i = 0; i < sequence_len; ++i)
891 input_ids_offsets.push_back({get_string_id(i), {0, i, 0, 0}});
892 lstm_inputs.push_back("inputSplit:"+get_string_id(i));
894 topology.add(split("inputSplit", "input", input_ids_offsets));
895 topology.add(data("weights", weights));
896 topology.add(data("recurrent", recurrent));
897 topology.add(data("biases", biases));
898 topology.add(input_layout("hidden", hidden.get_layout()));
899 topology.add(input_layout("cell", cell.get_layout()));
900 topology.add(lstm("lstm"+get_string_id(0), lstm_inputs, "weights", "recurrent",
901 "biases", "hidden", "cell", "", 0, false, {}, {},
902 output_selection, default_offset_type));
// The LSTM output concatenates the sequence with the trailing cell state;
// crop out the hidden part and the last cell state separately.
906 int32_t concatenation_len = emit_last_hidden ? 2 : sequence_len + 1;
907 tensor hidden_tensor {batch_size, concatenation_len - 1, hidden_size, directions};
908 tensor cell_tensor {batch_size, 1, hidden_size, directions};
// NOTE(review): the LSTM primitive above was added under id
// "lstm"+get_string_id(0) while these crops reference plain "lstm" --
// confirm both ids resolve to the same primitive.
909 topology.add(crop(emit_last_hidden ? "crop:last_hidden" : "crop:sequence", "lstm", hidden_tensor, tensor {0, 0, 0, 0}));
910 topology.add(crop("crop:last_cell", "lstm", cell_tensor, tensor {0, concatenation_len - 1, 0, 0}));
913 network network(engine, topology);
914 std::map<primitive_id, network_output> outputs;
916 network.set_input_data("input", input);
917 network.set_input_data("hidden", hidden);
918 network.set_input_data("cell", cell);
919 outputs = network.execute();
921 uint32_t ref_num_output_primitives = 1; // Output will return atleast 1 primitive
923 if (emit_last_cell) {
924 // add another primitive to account for cell state if the output selection includes cell state
925 ref_num_output_primitives += 1;
928 // check if the number of returned primitives match the expected number of output primitives
929 ASSERT_EQ(ref_num_output_primitives, outputs.size());
// Validate each returned primitive against the matching host reference.
931 for (auto itr = outputs.begin(); itr != outputs.end(); itr++)
933 auto output_tensor = itr->second.get_memory().get_layout().size;
934 primitive_id primitive_name = itr->first;
936 cldnn::memory output_memory = itr->second.get_memory();
937 int32_t output_size = (int32_t)(itr->second.get_memory().size() / sizeof(T));
938 cldnn::tensor ref_output_tensor;
939 VVVVF<T> ref_primitive_output;
941 int32_t ref_batch_size = batch_size;
942 int32_t ref_hidden_size = hidden_size;
943 int32_t ref_directions = directions;
// Sequence length is 1 for last-hidden / last-cell outputs; only the full
// sequence output spans sequence_len steps.
945 int32_t ref_seq_len = 1;
946 // Set the reference output against which the primitive's output will be compared
947 if (primitive_name.find("crop:last_cell") != std::string::npos)
949 ref_primitive_output = last_cell;
951 else if (emit_last_hidden || primitive_name.find("crop:last_hidden") != std::string::npos)
953 ref_primitive_output = last_hidden;
957 ref_seq_len = sequence_len;
958 ref_primitive_output = ref_output;
961 ref_output_tensor = { ref_batch_size, ref_seq_len, ref_hidden_size, ref_directions };
962 int32_t ref_output_size = ref_batch_size * ref_seq_len * ref_hidden_size * ref_directions;
964 // The number of elements in reference should match the number of elements in the primitive's output
965 ASSERT_EQ(ref_output_size , output_size);
967 // Compare the output tensor configuration against the reference value
968 // Output tensor is configured in bfyx format
969 ASSERT_EQ(ref_batch_size, output_tensor.batch[0]);
970 ASSERT_EQ(ref_seq_len, output_tensor.feature[0]); // Sequence length should match
971 ASSERT_EQ(ref_directions, output_tensor.spatial[1]); // directions should match
972 ASSERT_EQ(ref_hidden_size, output_tensor.spatial[0]); // input size should match
974 auto output_ptr = output_memory.pointer<T>();
// Element-wise comparison; the loop nesting order matches the linear layout
// of the format under test. The running index `i` is declared on a line
// elided from this view.
977 if (format == cldnn::format::bfyx) {
978 for (int32_t b = 0; b < ref_batch_size; ++b) {
979 for (int32_t s = 0; s < ref_seq_len; ++s) {
980 for (int32_t d = 0; d < ref_directions; ++d) {
981 for (int32_t x = 0; x < ref_hidden_size; ++x) {
982 ASSERT_NEAR(ref_primitive_output[b][s][d][x], output_ptr[i++], FERROR);
// fyxb stores batch innermost, so the batch loop moves inside.
988 else if(format == cldnn::format::fyxb)
990 for (int32_t s = 0; s < ref_seq_len; ++s) {
991 for (int32_t d = 0; d < ref_directions; ++d) {
992 for (int32_t x = 0; x < ref_hidden_size; ++x) {
993 for (int32_t b = 0; b < ref_batch_size; ++b) {
994 ASSERT_NEAR(ref_primitive_output[b][s][d][x], output_ptr[i++], FERROR);
1004 // -------------------------------------------------------
// Verifies that the graph compiler rewires user (consumer) dependencies
// correctly when it replaces an LSTM node with its subcomponents: the LSTM's
// last_hidden output is concatenated with the original initial hidden input,
// and the initial-hidden half of the result is checked.
// NOTE(review): batch_size, input_size and directions are declared on lines
// elided from this view — confirm against the full file.
1005 template<typename T>
1006 void lstm_gpu_users_test() {
1007 int sequence_len = 2;
1010 int hidden_size = 1;
1012 int min_random = -2, max_random = 2;
1014 // The following test is designed to test the user dependencies of an LSTM node when replaced by subcomponents
1015 // by the graph compiler.
1016 // The output of an LSTM node is set to last_hidden only. Then we concatenate the last_hidden with the initial_hidden tensor:
1017 // (input, weights, recurrent, bias, initial_hidden, initial_cell) -> LSTM -> last_hidden
1018 // concatenation(last_hidden, initial_hidden)
1019 // If the replacing is done correctly then the initial_hidden tensor should match the output of the concatenation
1020 // by an offset along the sequence.
// Random reference tensors (4 * hidden_size = the four stacked LSTM gates).
1022 VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
1023 VVVVF<T> ref_weights = generate_random_4d<T>(1, directions, 4 * hidden_size, input_size, min_random, max_random);
1024 VVVVF<T> ref_recurrent = generate_random_4d<T>(1, directions, 4 * hidden_size, hidden_size, min_random, max_random);
1025 VVVVF<T> ref_bias = generate_random_4d<T>(1, 1, directions, 4 * hidden_size, min_random, max_random);
1026 VVVVF<T> ref_hidden = generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random);
1027 VVVVF<T> ref_cell = generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random);
1028 VVVVF<T> ref_output = VVVVF<T>(batch_size, VVVF<T>(sequence_len, VVF<T>(directions, VF<T>(hidden_size))));
1030 VF<T> ref_input_vec = flatten_4d<T>(format::bfyx, ref_input);
1031 VF<T> ref_weights_vec = flatten_4d<T>(format::bfyx, ref_weights);
1032 VF<T> ref_recurrent_vec = flatten_4d<T>(format::bfyx, ref_recurrent);
1033 VF<T> ref_bias_vec = flatten_4d<T>(format::bfyx, ref_bias);
1034 VF<T> ref_hidden_vec = flatten_4d<T>(format::bfyx, ref_hidden);
1035 VF<T> ref_cell_vec = flatten_4d<T>(format::bfyx, ref_cell);
1037 VVVVF<T> last_hidden(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))));
1038 VVVVF<T> last_cell(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))));
1040 const auto& engine = get_test_engine();
// Device-side buffers mirroring the reference tensors.
1042 memory input = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {batch_size, sequence_len, input_size, 1} });
1043 memory weights = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, directions, input_size , 4 * hidden_size } });
1044 memory recurrent = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, directions, hidden_size, 4 * hidden_size } });
1045 memory biases = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, 1, 4 * hidden_size, directions } });
1046 memory hidden = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size, 1, hidden_size, directions } });
1047 memory cell = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size, 1, hidden_size, directions } });
1049 set_values(input, ref_input_vec);
1050 set_values(weights, ref_weights_vec);
1051 set_values(recurrent, ref_recurrent_vec);
1052 set_values(biases, ref_bias_vec);
1053 set_values(hidden, ref_hidden_vec);
1054 set_values(cell, ref_cell_vec);
1057 std::vector<std::pair<primitive_id, tensor>> input_ids_offsets;
1058 std::vector<primitive_id> lstm_inputs;
// Split the input along the sequence axis into per-timestep LSTM inputs.
1060 topology.add(input_layout("input", input.get_layout()));
1061 for (int i = 0; i < sequence_len; ++i)
1063 input_ids_offsets.push_back({get_string_id(i), {0, i, 0, 0}});
1064 lstm_inputs.push_back("inputSplit:"+get_string_id(i));
1066 topology.add(split("inputSplit", "input", input_ids_offsets));
1067 topology.add(data("weights", weights));
1068 topology.add(data("recurrent", recurrent));
1069 topology.add(data("biases", biases));
1070 topology.add(input_layout("hidden", hidden.get_layout()));
1071 topology.add(input_layout("cell", cell.get_layout()));
// LSTM emits only the last hidden state (cldnn_lstm_output_hidden).
1072 topology.add(lstm("lstm", lstm_inputs, "weights", "recurrent",
1073 "biases", "hidden", "cell", "", 0, false, {}, {},
1074 cldnn_lstm_output::cldnn_lstm_output_hidden, default_offset_type));
// Concatenate (last_hidden, initial_hidden) along the feature axis; the
// "hidden" input thus has two users: the LSTM and the concatenation.
1075 std::vector<primitive_id> output_ids_offsets {"lstm", "hidden"};
1076 topology.add(concatenation("concatenation", output_ids_offsets, concatenation::along_f));
1078 network network(engine, topology);
1079 std::map<primitive_id, network_output> outputs;
1081 network.set_input_data("input", input);
1082 network.set_input_data("hidden", hidden);
1083 network.set_input_data("cell", cell);
1084 outputs = network.execute();
1086 // check if the number of returned primitives match the expected number of output primitives
1087 ASSERT_EQ(size_t(1), outputs.size());
1088 cldnn::memory output_memory = outputs.begin()->second.get_memory();
1089 auto output_ptr = output_memory.pointer<T>();
// The second feature slot (offset s+1) of the concatenation must equal the
// original initial hidden tensor; idx linearizes bfyx with that offset.
1092 for (int32_t b = 0; b < batch_size; ++b) {
1093 for (int32_t s = 0; s < 1; ++s) {
1094 for (int32_t d = 0; d < directions; ++d) {
1095 for (int32_t x = 0; x < hidden_size; ++x) {
1096 int32_t idx = x + hidden_size * (d + directions * ((s+1) + sequence_len * b));
1097 ASSERT_NEAR(ref_hidden[b][s][d][x], output_ptr[idx], FERROR);
1104 // -------------------------------------------------------
// Stacked-LSTM test that feeds each layer the *whole* (un-split) sequence
// tensor: layer 0 consumes "input" directly, and each subsequent layer
// consumes a crop of the previous layer's full output sequence. The final
// layer's sequence output is compared against the host reference.
1105 template<typename T>
1106 void lstm_gpu_concatenated_input_test(int layers, int sequence_len, int direction,
1107 int batch_size, int input_size, int hidden_size,
1108 bool has_bias = true, bool has_initial_hidden = true,
1109 bool has_initial_cell = true, float clip_threshold = 0,
1110 bool input_forget = false)
1112 std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size
1113 << " Sequence Len = " << sequence_len << " Direction = " << direction << " Batch Size = " << batch_size << std::endl;
1114 int min_random = -2, max_random = 2;
1116 VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
// Per-layer reference tensors; layer 0's weights take input_size columns,
// deeper layers take hidden_size (their input is the previous layer's output).
1118 std::vector<VVVVF<T>> ref_weights;
1119 std::vector<VVVVF<T>> ref_recurrent;
1120 std::vector<VVVVF<T>> ref_bias;
1121 std::vector<VVVVF<T>> ref_hidden;
1122 std::vector<VVVVF<T>> ref_cell;
1123 std::vector<VVVVF<T>> ref_output;
1125 for (int i = 0; i < layers; ++i) {
1126 ref_weights.push_back(generate_random_4d<T>(1, direction, 4 * hidden_size, i == 0 ? input_size : hidden_size, min_random, max_random));
1127 ref_recurrent.push_back(generate_random_4d<T>(1, direction, 4 * hidden_size, hidden_size, min_random, max_random));
1128 ref_bias.push_back(generate_random_4d<T>(1, 1, direction, 4 * hidden_size, min_random, max_random));
1129 ref_hidden.push_back(generate_random_4d<T>(batch_size, 1, direction, hidden_size, min_random, max_random));
1130 ref_cell.push_back(generate_random_4d<T>(batch_size, 1, direction, hidden_size, min_random, max_random));
1131 ref_output.push_back(VVVVF<T>(batch_size, VVVF<T>(sequence_len, VVF<T>(direction, VF<T>(hidden_size)))));
// Flatten everything to bfyx host vectors for upload.
1134 VF<T> ref_input_vec = flatten_4d<T>(cldnn::format::bfyx, ref_input);
1136 std::vector<VF<T>> ref_weights_vec;
1137 std::vector<VF<T>> ref_recurrent_vec;
1138 std::vector<VF<T>> ref_bias_vec;
1139 std::vector<VF<T>> ref_hidden_vec;
1140 std::vector<VF<T>> ref_cell_vec;
1141 for (int i = 0; i < layers; ++i) {
1142 ref_weights_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_weights[i]));
1143 ref_recurrent_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_recurrent[i]));
1144 ref_bias_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_bias[i]));
1145 ref_hidden_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_hidden[i]));
1146 ref_cell_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_cell[i]));
1149 VVVVF<T> last_hidden(batch_size, VVVF<T>(1, VVF<T>(direction, VF<T>(hidden_size))));
1150 VVVVF<T> last_cell(batch_size, VVVF<T>(1, VVF<T>(direction, VF<T>(hidden_size))));
// Host reference: layer 0 consumes ref_input, each deeper layer consumes the
// previous layer's output sequence.
1152 lstm_reference(ref_input, ref_hidden[0], ref_cell[0], ref_weights[0], ref_recurrent[0], ref_bias[0], ref_output[0],
1153 last_hidden, last_cell, has_bias, has_initial_hidden, has_initial_cell,
1154 clip_threshold, input_forget, true);
1156 for (int i = 1; i < layers; ++i) {
1157 lstm_reference(ref_output[i - 1], ref_hidden[i], ref_cell[i], ref_weights[i], ref_recurrent[i],
1158 ref_bias[i], ref_output[i],
1159 last_hidden, last_cell, has_bias, has_initial_hidden, has_initial_cell,
1160 clip_threshold, input_forget, false);
1163 const auto& engine = get_test_engine();
1165 memory input = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {batch_size, sequence_len, input_size, 1} });
1166 set_values(input, ref_input_vec);
// Per-layer device buffers; hidden/cell are only allocated when the
// corresponding initial state is enabled.
1168 std::vector<memory> weights;
1169 std::vector<memory> recurrent;
1170 std::vector<memory> biases;
1171 std::vector<memory> hidden;
1172 std::vector<memory> cell;
1173 for (int i = 0; i < layers; ++i) {
1174 weights.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, direction, i == 0 ? input_size : hidden_size, 4 * hidden_size } }));
1175 set_values(weights[i], ref_weights_vec[i]);
1176 recurrent.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, direction, hidden_size, 4 * hidden_size } }));
1177 set_values(recurrent[i], ref_recurrent_vec[i]);
1179 biases.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, 1, 4 * hidden_size, direction } }));
1180 set_values(biases[i], ref_bias_vec[i]);
1182 if (has_initial_hidden) {
1183 hidden.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size, 1, hidden_size, direction } }));
1184 set_values(hidden[i], ref_hidden_vec[i]);
1186 if (has_initial_cell) {
1187 cell.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size, 1, hidden_size, direction} }));
1188 set_values(cell[i], ref_cell_vec[i]);
1193 std::vector<std::pair<primitive_id, tensor>> input_ids_offsets;
1194 std::vector<primitive_id> lstm_inputs;
1195 std::vector<primitive_id> output_ids_offsets;
1197 topology.add(input_layout("input", input.get_layout()));
// Tracks the crop primitive that feeds the next layer.
1198 cldnn::primitive_id prev_node_id;
// Build one LSTM (plus its constants and a sequence crop) per layer.
1200 for (int i = 0; i < layers; ++i) {
1201 std::string sid = get_string_id(i);
1202 std::string lstm_id = "lstm" + sid;
1203 std::string weights_id = "weights" + sid;
1204 std::string recurrent_id = "recurrent" + sid;
1205 std::string biases_id = "biases" + sid;
1206 std::string hidden_id = "hidden" + sid;
1207 std::string cell_id = "cell" + sid;
1208 std::string output_crop_id = "crop:sequence:" + sid;
1210 topology.add(data(weights_id, weights[i]));
1211 topology.add(data(recurrent_id, recurrent[i]));
1212 if (has_bias) topology.add(data(biases_id, biases[i]));
1213 if (has_initial_hidden) topology.add(input_layout(hidden_id, hidden[i].get_layout()));
1214 if (has_initial_cell) topology.add(input_layout(cell_id, cell[i].get_layout()));
// Layer 0 consumes the concatenated input tensor directly (no split);
// deeper layers consume the previous layer's cropped sequence output.
1216 topology.add(lstm(lstm_id, { "input" }, weights_id, recurrent_id,
1217 has_bias ? biases_id : "", has_initial_hidden ? hidden_id : "", has_initial_cell ? cell_id : "", "",
1218 clip_threshold, input_forget, {}, {},
1219 cldnn_lstm_output::cldnn_lstm_output_sequence_cell, default_offset_type));
1222 topology.add(lstm(lstm_id, { prev_node_id }, weights_id, recurrent_id,
1223 has_bias ? biases_id : "", has_initial_hidden ? hidden_id : "", has_initial_cell ? cell_id : "", "",
1224 clip_threshold, input_forget, {}, {},
1225 cldnn_lstm_output::cldnn_lstm_output_sequence_cell, default_offset_type));
1228 // Crop out the whole output sequence element
1229 topology.add(crop(output_crop_id, lstm_id, {batch_size, sequence_len, hidden_size, direction}, {0, 0, 0, 0}));
1231 // Save the node id to provide it as input to the next lstm layer
1232 prev_node_id = output_crop_id;
1235 network network(engine, topology);
1236 network.set_input_data("input", input);
1237 for (int i = 0; i < layers; ++i) {
1238 std::string sid = get_string_id(i);
1239 if (has_initial_hidden) network.set_input_data("hidden" + sid, hidden[i]);
1240 if (has_initial_cell) network.set_input_data("cell" + sid, cell[i]);
1242 auto outputs = network.execute();
// Only the last layer's crop should remain as a network output.
1244 ASSERT_EQ(outputs.size(), size_t(1));
1245 size_t output_size = outputs.begin()->second.get_memory().size() / sizeof(T);
1246 ASSERT_EQ(output_size, size_t(hidden_size * sequence_len * batch_size * direction));
1248 auto output = outputs.begin()->second.get_memory();
1250 // Get the output tensor
1251 cldnn::layout output_layout = output.get_layout();
1252 cldnn::tensor output_tensor = output_layout.size;
1254 // Compare the output tensor configuration against the reference value
1255 // Output tensor is configured in bfyx format
1256 ASSERT_EQ(batch_size, output_tensor.batch[0]);
1257 ASSERT_EQ(sequence_len, output_tensor.feature[0]);
1258 ASSERT_EQ(direction, output_tensor.spatial[1]);
1259 ASSERT_EQ(hidden_size, output_tensor.spatial[0]);
1261 auto output_ptr = output.pointer<T>();
// Element-wise check of the top layer's sequence; the linear index `i` is
// declared on a line elided from this view.
1263 for (int32_t b = 0; b < batch_size; ++b) {
1264 for (int32_t s = 0; s < sequence_len; ++s) {
1265 for (int32_t d = 0; d < direction; ++d) {
1266 for (int32_t x = 0; x < hidden_size; ++x) {
1267 ASSERT_NEAR(ref_output[layers - 1][b][s][d][x], output_ptr[i++], FERROR);
1275 // This test checks a chained and stacked LSTM topology. The configuration allows creating
1276 // an LSTM topology with multiple layers, and the per-layer chains can also be linked together.
1277 template<typename T>
1278 void lstm_gpu_chain_test(int batch_size, int input_size, int hidden_size,
1279 int directions, size_t layers, size_t chains, int sequence_len,
1280 const cldnn_lstm_output& output_selection)
1282 int min_random = -2, max_random = 2;
1283 bool has_bias = false;
1284 bool has_initial_hidden = false;
1285 bool has_initial_cell = false;
1286 float clip_threshold = 0;
1287 bool input_forget = false;
1289 std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size
1290 << " Sequence Len = " << sequence_len << " Directions = " << directions << " Batch Size = " << batch_size
1291 << " Output selection: " << output_selection << std::endl;
1293 VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
1294 std::vector<std::vector< VVVVF<T>>> ref_weights;
1295 std::vector<std::vector< VVVVF<T>>> ref_recurrent;
1296 std::vector<std::vector< VVVVF<T>>> ref_bias;
1297 std::vector<std::vector< VVVVF<T>>> ref_hidden;
1298 std::vector<std::vector< VVVVF<T>>> ref_cell;
1299 std::vector<std::vector< VVVVF<T>>> ref_output;
1301 // Create the 4 dimensional weight, bias, hidden, cell state and output vectors
1302 for (size_t chain = 0; chain < chains; chain++) {
1304 std::vector<VVVVF<T>> per_chain_ref_weights;
1305 std::vector<VVVVF<T>> per_chain_ref_recurrent;
1306 std::vector<VVVVF<T>> per_chain_ref_bias;
1307 std::vector<VVVVF<T>> per_chain_ref_hidden;
1308 std::vector<VVVVF<T>> per_chain_ref_cell;
1309 std::vector<VVVVF<T>> per_chain_ref_output;
1311 for (size_t layer = 0; layer < layers; layer++) {
1312 per_chain_ref_weights.push_back(generate_random_4d<T>(1, directions, 4 * hidden_size, (layer == 0) ? input_size : hidden_size, min_random, max_random));
1313 per_chain_ref_recurrent.push_back(generate_random_4d<T>(1, directions, 4 * hidden_size, hidden_size, min_random, max_random));
1314 per_chain_ref_bias.push_back(generate_random_4d<T>(1, 1, directions, 4 * hidden_size, min_random, max_random));
1315 per_chain_ref_hidden.push_back(generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random));
1316 per_chain_ref_cell.push_back(generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random));
1317 per_chain_ref_output.push_back(VVVVF<T>(batch_size, VVVF<T>(sequence_len, VVF<T>(directions, VF<T>(hidden_size)))));
1320 ref_weights.push_back(per_chain_ref_weights);
1321 ref_recurrent.push_back(per_chain_ref_recurrent);
1322 ref_bias.push_back(per_chain_ref_bias);
1323 ref_hidden.push_back(per_chain_ref_hidden);
1324 ref_cell.push_back(per_chain_ref_cell);
1325 ref_output.push_back(per_chain_ref_output);
1328 VF<T> ref_input_vec;
1329 std::vector<std::vector< VF<T>>> ref_weights_vec;
1330 std::vector<std::vector< VF<T>>> ref_recurrent_vec;
1331 std::vector<std::vector< VF<T>>> ref_bias_vec;
1332 std::vector<std::vector< VF<T>>> ref_hidden_vec;
1333 std::vector<std::vector< VF<T>>> ref_cell_vec;
1334 std::vector<std::vector< VF<T>>> ref_output_vec;
1336 ref_input_vec = flatten_4d<T>(cldnn::format::bfyx, ref_input);
1338 // flatten all the 4 dimensional vectors across chains and layers
1339 for (size_t chain = 0; chain < chains; chain++) {
1341 std::vector<VF<T>> per_chain_ref_weights;
1342 std::vector<VF<T>> per_chain_ref_recurrent;
1343 std::vector<VF<T>> per_chain_ref_bias;
1344 std::vector<VF<T>> per_chain_ref_hidden;
1345 std::vector<VF<T>> per_chain_ref_cell;
1346 std::vector<VF<T>> per_chain_ref_output;
1348 for (size_t layer = 0; layer < layers; layer++) {
1349 per_chain_ref_weights.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_weights[chain][layer]));
1350 per_chain_ref_recurrent.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_recurrent[chain][layer]));
1351 per_chain_ref_bias.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_bias[chain][layer]));
1352 per_chain_ref_hidden.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_hidden[chain][layer]));
1353 per_chain_ref_cell.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_cell[chain][layer]));
1354 per_chain_ref_output.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_output[chain][layer]));
1357 ref_weights_vec.push_back(per_chain_ref_weights);
1358 ref_recurrent_vec.push_back(per_chain_ref_recurrent);
1359 ref_bias_vec.push_back(per_chain_ref_bias);
1360 ref_hidden_vec.push_back(per_chain_ref_hidden);
1361 ref_cell_vec.push_back(per_chain_ref_cell);
1362 ref_output_vec.push_back(per_chain_ref_output);
1365 std::vector<std::vector<VVVVF<T>>> last_hidden(chains, std::vector<VVVVF<T> >(layers, VVVVF<T>(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))))));
1366 std::vector<std::vector<VVVVF<T>>> last_cell(chains, std::vector<VVVVF<T> >(layers, VVVVF<T>(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))))));
1368 for (size_t chain = 0; chain < chains; chain++) {
1369 lstm_reference(ref_input, ref_hidden[chain][0], ref_cell[chain][0], ref_weights[chain][0],
1370 ref_recurrent[chain][0], ref_bias[chain][0], ref_output[chain][0],
1371 last_hidden[chain][0], last_cell[chain][0], has_bias,
1372 chain == 0 ? has_initial_hidden : true,
1373 chain == 0 ? has_initial_cell : true,
1374 clip_threshold, input_forget, true);
1376 if (chain < chains - 1)
1378 ref_hidden[chain + 1][0] = last_hidden[chain][0];
1379 ref_cell[chain + 1][0] = last_cell[chain][0];
1383 for (size_t layer = 1; layer < layers; ++layer) {
1384 for (size_t chain = 0; chain < chains; chain++) {
1385 lstm_reference(ref_output[chain][layer - 1], ref_hidden[chain][layer], ref_cell[chain][layer],
1386 ref_weights[chain][layer], ref_recurrent[chain][layer], ref_bias[chain][layer],
1387 ref_output[chain][layer], last_hidden[chain][layer], last_cell[chain][layer], has_bias,
1388 chain == 0 ? has_initial_hidden : true,
1389 chain == 0 ? has_initial_cell : true,
1390 clip_threshold, input_forget,
1393 if (chain < chains - 1)
1395 ref_hidden[chain + 1][layer] = last_hidden[chain][layer];
1396 ref_cell[chain + 1][layer] = last_cell[chain][layer];
1401 const auto& engine = get_test_engine();
1402 tensor input_tensor = { batch_size, sequence_len, input_size, 1 };
1403 layout layout = { type_to_data_type<T>::value, cldnn::format::bfyx, input_tensor };
1405 memory input = memory::allocate(engine, layout);
1406 set_values(input, ref_input_vec);
1408 // 2-dim vectors to support chain and layers
1409 std::vector<std::vector<memory>> weights;
1410 std::vector<std::vector<memory>> recurrent;
1411 std::vector<std::vector<memory>> biases;
1412 std::vector<std::vector<memory>> hidden;
1413 std::vector<std::vector<memory>> cell;
1415 for (size_t chain = 0; chain < chains; chain++) {
1416 std::vector<memory> per_chain_weights;
1417 std::vector<memory> per_chain_recurrent;
1418 std::vector<memory> per_chain_biases;
1419 std::vector<memory> per_chain_hidden;
1420 std::vector<memory> per_chain_cell;
1422 for (size_t layer = 0; layer < layers; layer++) {
1423 per_chain_weights.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {1, directions, layer == 0 ? input_size : hidden_size, 4 * hidden_size} }));
1424 set_values(per_chain_weights[layer], ref_weights_vec[chain][layer]);
1426 per_chain_recurrent.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {1, directions, hidden_size, 4 * hidden_size} }));
1427 set_values(per_chain_recurrent[layer], ref_recurrent_vec[chain][layer]);
1431 per_chain_biases.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {1, 1, 4 * hidden_size, directions} }));
1432 set_values(per_chain_biases[layer], ref_bias_vec[chain][layer]);
1435 if (has_initial_hidden)
1437 per_chain_hidden.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {1, 1, hidden_size, directions} }));
1438 set_values(per_chain_hidden[layer], ref_hidden_vec[chain][layer]);
1441 if (has_initial_cell)
1443 per_chain_cell.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {1, 1, hidden_size, directions} }));
1444 set_values(per_chain_cell[layer], ref_cell_vec[chain][layer]);
1448 weights.push_back(per_chain_weights);
1449 recurrent.push_back(per_chain_recurrent);
1450 biases.push_back(per_chain_biases);
1451 hidden.push_back(per_chain_hidden);
1452 cell.push_back(per_chain_cell);
1455 // Start creating the topology
1456 cldnn::topology topology;
1457 std::vector<std::pair<primitive_id, cldnn::tensor>> input_ids_offsets;
1458 std::vector<primitive_id> lstm_inputs;
1459 std::vector<primitive_id> output_ids_offsets;
1461 topology.add(input_layout("input", input.get_layout()));
1463 for (int feature = 0; feature < sequence_len; feature++) {
1464 input_ids_offsets.push_back({ get_string_id(feature), {0, feature, 0, 0} });
1465 lstm_inputs.push_back("inputSplit:" + get_string_id(feature));
1467 topology.add(split("inputSplit", "input", input_ids_offsets));
1469 bool emit_last_hidden = output_selection == cldnn_lstm_output_hidden
1470 || output_selection == cldnn_lstm_output_hidden_cell;
1472 std::vector<cldnn::primitive_id> output_sequence_ids;
1473 std::vector<cldnn::primitive_id> last_hidden_ids;
1474 std::vector<cldnn::primitive_id> last_cell_ids;
1476 for (size_t chain = 0; chain < chains; chain++) {
1478 // Add all the primitives to the network
1479 std::vector<cldnn::primitive_id> prev_output_sequence_ids(output_sequence_ids);
1480 std::vector<cldnn::primitive_id> prev_last_hidden_ids(last_hidden_ids);
1481 std::vector<cldnn::primitive_id> prev_last_cell_ids(last_cell_ids);
1483 // Erase all the temporary primitive id containers
1484 output_sequence_ids.clear();
1485 last_cell_ids.clear();
1486 last_hidden_ids.clear();
1488 for (size_t layer = 0; layer < layers; layer++) {
1489 std::string chain_id = get_string_id(chain);
1490 std::string layer_id = get_string_id(layer);
1491 std::string lstm_id = "lstm:" + chain_id + ":" + layer_id;
1492 std::string weights_id = "weights:" + chain_id + ":" + layer_id;
1493 std::string recurrent_id = "recurrent:" + chain_id + ":" + layer_id;
1494 std::string biases_id = "biases:" + chain_id + ":" + layer_id;
1495 std::string hidden_id = "hidden:" + chain_id + ":" + layer_id;
1496 std::string cell_id = "cell:" + chain_id + ":" + layer_id;
1497 std::string crop_seq_id = "crop:sequence:" + chain_id + ":" + layer_id;
1498 std::string crop_last_cell_id = "crop:last_cell:" + chain_id + ":" + layer_id;
1499 std::string crop_last_hidden_id = "crop:last_hidden:" + chain_id + ":" + layer_id;
1501 primitive_id initial_hidden_id;
1502 primitive_id initial_cell_id;
1503 cldnn_lstm_output output_selection_per_layer;
1505 topology.add(data(weights_id, weights[chain][layer]));
1506 topology.add(data(recurrent_id, recurrent[chain][layer]));
1507 if (has_bias) topology.add(data(biases_id, biases[chain][layer]));
1509 if (chain == 0 && layer == 0)
1511 if (has_initial_hidden) topology.add(input_layout(hidden_id, hidden[chain][layer].get_layout()));
1512 if (has_initial_cell) topology.add(input_layout(cell_id, cell[chain][layer].get_layout()));
1515 // Get the initial hidden and initial cell for each layer for each chain link
1518 initial_hidden_id = has_initial_hidden ? hidden_id : "";
1519 initial_cell_id = has_initial_cell ? cell_id : "";
1523 initial_hidden_id = prev_last_hidden_ids[layer];
1524 initial_cell_id = prev_last_cell_ids[layer];
1527 // Output selection for all the layers except the last layer has to have the sequence,
1528 // last hidden and last cell
1529 if (layer < layers - 1)
1531 output_selection_per_layer = cldnn_lstm_output::cldnn_lstm_output_sequence_cell;
1535 // For the last layer, use the output selection provided by the user
1536 output_selection_per_layer = output_selection;
1541 topology.add(lstm(lstm_id, lstm_inputs, weights_id, recurrent_id,
1542 has_bias ? biases_id : "",
1543 initial_hidden_id, initial_cell_id,
1544 "", clip_threshold, input_forget, {}, {},
1545 output_selection_per_layer, default_offset_type));
1549 topology.add(lstm(lstm_id, { output_sequence_ids[layer - 1] }, weights_id, recurrent_id,
1550 has_bias ? biases_id : "",
1551 initial_hidden_id, initial_cell_id,
1552 "", clip_threshold, input_forget, {}, {},
1553 output_selection_per_layer, default_offset_type));
1556 tensor sequence_tensor{ batch_size, sequence_len, hidden_size, directions };
1557 tensor cell_tensor{ batch_size, 1, hidden_size, directions };
1558 tensor last_hidden_tensor{ batch_size, 1, hidden_size, directions };
1560 // For all the layers except the last layer, we need to crop output sequence,
1561 // last hidden and last cell.
1562 // The output sequence goes into the next layer of lstm in a chain link
1563 // The last cell state and last hidden go to the lstm node in the same layer
1565 topology.add(crop(crop_seq_id, lstm_id, sequence_tensor, tensor{ 0, 0, 0, 0 })); // Add crop to get the sequence
1566 topology.add(crop(crop_last_hidden_id, lstm_id, last_hidden_tensor, tensor{ 0, sequence_len - 1, 0, 0 })); // Add crop to get the last hidden element
1567 topology.add(crop(crop_last_cell_id, lstm_id, cell_tensor, tensor{ 0, sequence_len, 0, 0 })); // Add crop to get the last cell element
1569 // Keep a copy of the sequence, last hidden and last cell primitve id for each layer
1570 output_sequence_ids.push_back(crop_seq_id);
1571 last_hidden_ids.push_back(crop_last_hidden_id);
1572 last_cell_ids.push_back(crop_last_cell_id);
1576 // Creating network out of the above designed topology
1577 cldnn::network network(engine, topology);
1578 network.set_input_data("input", input);
1579 for (size_t layer = 0; layer < layers; layer++) {
1580 std::string sid = get_string_id(layer);
1581 if (has_initial_hidden) network.set_input_data("hidden:000:" + sid, hidden[0][layer]); // 0 is the chain link index
1582 if (has_initial_cell) network.set_input_data("cell:000:" + sid, cell[0][layer]); // 0 is the chain link index
1585 auto outputs = network.execute();
1586 for (auto itr = outputs.begin(); itr != outputs.end(); itr++)
1588 auto output_tensor = itr->second.get_memory().get_layout().size;
1589 primitive_id primitive_name = itr->first;
1591 // Split the primitive id to get the chain id
1592 // Eg: primitive id: crop:last_cell:XXX:YYY
1593 // XXX is the chain id
1594 // YYY is the layer id
1595 std::string chain_str = primitive_name.substr(primitive_name.find(":", primitive_name.find(":") + 1) + 1, 5);
1596 std::string layer_str = primitive_name.substr(primitive_name.find(":", primitive_name.find(":", primitive_name.find(":") + 1) + 1) + 1, 5);
1597 size_t chain_id = stoi(chain_str);
1598 size_t layer_id = stoi(layer_str);
1600 cldnn::memory output_memory = itr->second.get_memory();
1601 int32_t output_size = (int32_t)(itr->second.get_memory().size() / sizeof(T));
1602 cldnn::tensor ref_output_tensor;
1603 VVVVF<T> ref_primitive_output;
1605 int32_t ref_batch_size = batch_size;
1606 int32_t ref_hidden_size = hidden_size;
1607 int32_t ref_directions = directions;
1609 int32_t ref_seq_len = 1;
1611 // Set the reference output against which the primitive's output will be compared
1612 if (primitive_name.find("crop:last_cell") != std::string::npos)
1614 ref_primitive_output = last_cell[chain_id][layer_id];
1616 else if (emit_last_hidden || primitive_name.find("crop:last_hidden") != std::string::npos)
1618 ref_primitive_output = last_hidden[chain_id][layer_id];
1622 ref_seq_len = sequence_len;
1623 ref_primitive_output = ref_output[chain_id][layers - 1];
1626 ref_output_tensor = { ref_batch_size, ref_seq_len, ref_hidden_size, ref_directions };
1627 int32_t ref_output_size = ref_batch_size * ref_seq_len * ref_hidden_size * ref_directions;
1629 // The number of elements in reference should match the number of elements in the primitive's output
1630 ASSERT_EQ(ref_output_size, output_size);
1632 // Compare the output tensor configuration against the reference value
1633 // Output tensor is configured in bfyx format
1634 ASSERT_EQ(ref_batch_size, output_tensor.batch[0]);
1635 ASSERT_EQ(ref_seq_len, output_tensor.feature[0]); // Sequence length should match
1636 ASSERT_EQ(ref_directions, output_tensor.spatial[1]); // directions should match
1637 ASSERT_EQ(ref_hidden_size, output_tensor.spatial[0]); // input size should match
1639 auto output_ptr = output_memory.pointer<T>();
1642 for (int32_t b = 0; b < ref_batch_size; ++b) {
1643 for (int32_t s = 0; s < ref_seq_len; ++s) {
1644 for (int32_t d = 0; d < ref_directions; ++d) {
1645 for (int32_t x = 0; x < ref_hidden_size; ++x) {
1646 ASSERT_NEAR(ref_primitive_output[b][s][d][x], output_ptr[i++], FERROR);
1655 TEST(lstm_gemm_gpu, generic_lstm_gemm_test_f32) {
1656 generic_lstm_gemm_gpu_test<float>(1, 1, 3, 6, 2, true, true);
1659 TEST(lstm_gemm_gpu, generic_lstm_gemm_no_bias_f32) {
1660 generic_lstm_gemm_gpu_test<float>(1, 1, 3, 6, 2, false, true);
1663 TEST(lstm_gemm_gpu, generic_lstm_gemm_no_hidden_f32) {
1664 generic_lstm_gemm_gpu_test<float>(1, 1, 3, 6, 2, true, false);
1667 TEST(lstm_gemm_gpu, generic_lstm_gemm_no_hidden_bias_f32) {
1668 generic_lstm_gemm_gpu_test<float>(1, 1, 3, 6, 2, false, false);
// LSTM GEMM tests exercising the LSTM GEMV kernel implementation
1672 TEST(lstm_gemm_gpu, gemv_bfyx_1x64_lstm_gemm_test_f32) {
1673 generic_lstm_gemm_gpu_test<float>(5, 1, 1, 1024, 1024, true, true);
1676 TEST(lstm_gemm_gpu, gemv_bfyx_1x64_lstm_gemm_no_bias_f32) {
1677 generic_lstm_gemm_gpu_test<float>(1, 1, 1, 256, 2, false, true);
1680 TEST(lstm_gemm_gpu, gemv_bfyx_1x64_lstm_gemm_no_hidden_f32) {
1681 generic_lstm_gemm_gpu_test<float>(1, 1, 1, 64, 2, true, false);
1684 TEST(lstm_gemm_gpu, gemv_bfyx_1x64_lstm_gemm_no_hidden_bias_f32) {
1685 generic_lstm_gemm_gpu_test<float>(1, 1, 1, 64, 2, false, false);
1689 TEST(lstm_elt_gpu, generic_lstm_elt_test_clip_f32) {
1690 generic_lstm_elt_gpu_test<float>(1, 1, 4, 6, 3, true, 0.3f);
1693 TEST(lstm_elt_gpu, generic_lstm_elt_test_input_forget_f32) {
1694 generic_lstm_elt_gpu_test<float>(1, 1, 4, 6, 3, true, 0.f, 1);
1697 TEST(lstm_elt_gpu, generic_lstm_elt_test_clip_input_forget_f32) {
1698 generic_lstm_elt_gpu_test<float>(1, 1, 4, 6, 3, true, 0.5f, 1);
1701 TEST(lstm_elt_gpu, generic_lstm_elt_test_f32) {
1702 generic_lstm_elt_gpu_test<float>(1, 1, 4, 6, 3, true);
1705 TEST(lstm_elt_gpu, generic_lstm_elt_no_cell_f32) {
1706 generic_lstm_elt_gpu_test<float>(1, 1, 4, 6, 3, false);
1709 TEST(lstm_custom_gpu, generic_lstm_custom_f32) {
1710 generic_lstm_custom_gpu_test<float>(3, 1, 3, 3, 2, true, true, true);
1713 TEST(lstm_custom_gpu, generic_lstm_custom_no_biasf32) {
1714 generic_lstm_custom_gpu_test<float>(3, 1, 3, 3, 2, false, true, true);
1717 TEST(lstm_custom_gpu, generic_lstm_custom_no_hidden_f32) {
1718 generic_lstm_custom_gpu_test<float>(3, 1, 3, 3, 2, true, false, true);
1721 TEST(lstm_custom_gpu, generic_lstm_custom_no_bias_hidden_f32) {
1722 generic_lstm_custom_gpu_test<float>(3, 1, 3, 3, 2, false, false, true);
1725 TEST(lstm_custom_gpu, generic_lstm_custom_no_cell_f32) {
1726 generic_lstm_custom_gpu_test<float>(3, 1, 3, 3, 2, true, true, false);
1729 TEST(lstm_custom_gpu, generic_lstm_custom_no_bias_cell_f32) {
1730 generic_lstm_custom_gpu_test<float>(3, 1, 3, 3, 2, false, true, false);
1733 TEST(lstm_custom_gpu, generic_lstm_custom_no_hidden_cell_f32) {
1734 generic_lstm_custom_gpu_test<float>(3, 1, 3, 3, 2, true, false, false);
1737 TEST(lstm_custom_gpu, generic_lstm_custom_no_bias_hidden_cell_f32) {
1738 generic_lstm_custom_gpu_test<float>(3, 1, 3, 3, 2, false, false, false);
// generic_lstm_gpu_test parameters:
// layers, sequence, dir, batch, input, hidden, bias, initial_h, initial_cell, threshold, coupled_input_forget
1743 TEST(lstm_gpu, generic_lstm_f32) {
1744 generic_lstm_gpu_test<float>(1, 7, 1, 3, 3, 2, true, true, true);
1747 TEST(lstm_gpu, generic_lstm_no_bias_f32) {
1748 generic_lstm_gpu_test<float>(1, 7, 1, 3, 3, 2, false, true, true);
1751 TEST(lstm_gpu, generic_lstm_no_hidden_f32) {
1752 generic_lstm_gpu_test<float>(1, 7, 1, 5, 4, 3, true, false, true);
1755 TEST(lstm_gpu, generic_lstm_no_bias_hidden_f32) {
1756 generic_lstm_gpu_test<float>(1, 7, 1, 5, 4, 3, false, false, true);
1759 TEST(lstm_gpu, generic_lstm_no_cell_f32) {
1760 generic_lstm_gpu_test<float>(1, 7, 1, 5, 4, 3, true, true, false);
1763 TEST(lstm_gpu, generic_lstm_no_bias_cell_f32) {
1764 generic_lstm_gpu_test<float>(1, 7, 1, 5, 4, 3, false, true, false);
1767 TEST(lstm_gpu, generic_lstm_no_hidden_cell_f32) {
1768 generic_lstm_gpu_test<float>(1, 7, 1, 5, 4, 3, true, false, false);
1771 TEST(lstm_gpu, generic_lstm_no_bias_hidden_cell_f32) {
1772 generic_lstm_gpu_test<float>(1, 7, 1, 5, 4, 3, false, false, false);
1775 TEST(lstm_gpu, generic_lstm_clip_f32) {
1776 generic_lstm_gpu_test<float>(1, 7, 1, 3, 3, 2, true, true, true, 0.3f, 0);
1779 TEST(lstm_gpu, generic_lstm_input_forget_f32) {
1780 generic_lstm_gpu_test<float>(1, 7, 1, 3, 3, 2, true, true, true, 0.f, 1);
1783 TEST(lstm_gpu, generic_lstm_clip_input_forget_f32) {
1784 generic_lstm_gpu_test<float>(1, 7, 1, 3, 3, 2, true, true, true, 0.3f, 1);
1787 TEST(lstm_gpu, generic_lstm_offset_order_ifoz_f32) {
1788 default_offset_type = cldnn_lstm_offset_order_ifoz;
1789 generic_lstm_gpu_test<float>(1, 7, 1, 3, 3, 2, true, true, true);
1790 default_offset_type = cldnn_lstm_offset_order_iofz;
1793 TEST(lstm_gpu, generic_lstm_canonical_f32) {
1794 generic_lstm_gpu_test<float>(1, 1, 1, 1, 1, 1, true, true, true);
1797 // bidirectional support
1798 TEST(lstm_gpu, generic_lstm_bi_f32) {
1799 generic_lstm_gpu_test<float>(1, 7, 2, 2, 3, 4, false, false, false);
1802 TEST(lstm_gpu, generic_lstm_bi_bias_f32) {
1803 generic_lstm_gpu_test<float>(1, 7, 2, 2, 3, 4, true, false, false);
1806 TEST(lstm_gpu, generic_lstm_bi_bias_hidden_f32) {
1807 generic_lstm_gpu_test<float>(1, 7, 2, 2, 3, 4, true, true, false);
1810 TEST(lstm_gpu, generic_lstm_bi_bias_hidden_cell_f32) {
1811 generic_lstm_gpu_test<float>(1, 7, 2, 2, 3, 4, true, true, true);
1814 // multi-layer support
1815 TEST(lstm_gpu, generic_lstm_stacked_no_seq_f32) {
1816 generic_lstm_gpu_test<float>(4, 1, 1, 3, 3, 2, true, true, true);
1819 TEST(lstm_gpu, generic_lstm_stacked_seq_f32) {
1820 generic_lstm_gpu_test<float>(4, 7, 1, 3, 3, 2, true, true, true);
1823 TEST(lstm_gpu, generic_lstm_stacked_bi_f32) {
1824 generic_lstm_gpu_test<float>(4, 7, 2, 3, 3, 2, true, true, true);
1827 TEST(lstm_gpu, generic_lstm_stacked_seq_bi_f32) {
1828 generic_lstm_gpu_test<float>(4, 7, 2, 3, 3, 2, true, true, true);
1831 // optional outputs support
1832 TEST(lstm_gpu, output_test_sequence_f32) {
1833 lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_sequence, 1);
1836 TEST(lstm_gpu, output_test_hidden_f32) {
1837 lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_hidden, 1);
1840 TEST(lstm_gpu, output_test_hidden_cell_f32) {
1841 lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_hidden_cell, 1);
1844 TEST(lstm_gpu, output_test_sequence_cell_f32) {
1845 lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_sequence_cell, 1);
1848 TEST(lstm_gpu, output_test_sequence_bi_f32) {
1849 lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_sequence, 2);
1852 TEST(lstm_gpu, output_test_hidden_bi_f32) {
1853 lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_hidden, 2);
1856 TEST(lstm_gpu, output_test_hidden_cell_bi_f32) {
1857 lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_hidden_cell, 2);
1860 TEST(lstm_gpu, output_test_sequence_cell_bi_f32) {
1861 lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_sequence_cell, 2);
1865 TEST(lstm_gpu, lstm_gpu_format_bfyx_f32) {
1866 lstm_gpu_format_test<float>(cldnn::format::bfyx, 1);
1869 TEST(lstm_gpu, lstm_gpu_format_bfyx_bi_f32) {
1870 lstm_gpu_format_test<float>(cldnn::format::bfyx, 2);
1873 TEST(lstm_gpu, lstm_gpu_format_fyxb_f32) {
1874 lstm_gpu_format_test<float>(cldnn::format::fyxb, 1);
1877 TEST(lstm_gpu, lstm_gpu_format_fyxb_bi_f32) {
1878 lstm_gpu_format_test<float>(cldnn::format::fyxb, 2);
1881 // test for LSTM users' dependencies
1882 TEST(lstm_gpu, lstm_users_f32) {
1883 lstm_gpu_users_test<float>();
1886 // Test for LSTM with concatenated input
1887 TEST(lstm_gpu, generic_lstm_concatenated_input) {
1888 lstm_gpu_concatenated_input_test<float>(1, 2, 2, 1, 1, 1, true, true, true);
1891 TEST(lstm_gpu, generic_lstm_concatenated_input_multi_layer) {
1892 lstm_gpu_concatenated_input_test<float>(5, 5, 2, 1, 1, 4, true, true, true);
1895 // test for LSTM with chain and stack (multilayer)
1896 TEST(lstm_gpu, generic_lstm_chained_unidirectional_f32) {
1903 // sequence length = 1
1904 // output selection = output sequence and cell
1905 lstm_gpu_chain_test<float>(1, 2, 4, 1, 1, 2, 1, cldnn_lstm_output::cldnn_lstm_output_sequence_cell);
1908 TEST(lstm_gpu, generic_lstm_chained_bidirectional_f32) {
1915 // sequence length = 1
1916 // output selection = output sequence and cell
1917 lstm_gpu_chain_test<float>(1, 2, 4, 2, 1, 1, 1, cldnn_lstm_output::cldnn_lstm_output_sequence_cell);
1920 TEST(lstm_gpu, generic_lstm_chained_no_stack_bidirectional_f32) {
1927 // sequence length = 5
1928 // output selection = output sequence and cell
1929 lstm_gpu_chain_test<float>(2, 2, 4, 2, 1, 2, 5, cldnn_lstm_output::cldnn_lstm_output_sequence_cell);
1932 TEST(lstm_gpu, generic_lstm_chained_stacked_bidirectional_f32) {
1939 // sequence length = 5
1940 // output selection = output sequence and cell
1941 lstm_gpu_chain_test<float>(2, 2, 4, 2, 4, 2, 5, cldnn_lstm_output::cldnn_lstm_output_sequence_cell);
1944 // FP16 Half precision tests
1945 TEST(lstm_gemm_gpu, generic_lstm_gemm_test_f16) {
1946 generic_lstm_gemm_gpu_test<FLOAT16>(1, 1, 3, 6, 2, true, true);
1949 TEST(lstm_gemm_gpu, generic_lstm_gemm_no_bias_f16) {
1950 generic_lstm_gemm_gpu_test<FLOAT16>(1, 1, 3, 6, 2, false, true);
1953 TEST(lstm_gemm_gpu, generic_lstm_gemm_no_hidden_f16) {
1954 generic_lstm_gemm_gpu_test<FLOAT16>(1, 1, 3, 6, 2, true, false);
1957 TEST(lstm_gemm_gpu, generic_lstm_gemm_no_hidden_bias_f16) {
1958 generic_lstm_gemm_gpu_test<FLOAT16>(1, 1, 3, 6, 2, false, false);
1961 TEST(lstm_elt_gpu, generic_lstm_elt_test_clip_f16) {
1962 generic_lstm_elt_gpu_test<FLOAT16>(1, 1, 4, 6, 3, true, 0.3f);
1965 TEST(lstm_elt_gpu, generic_lstm_elt_test_input_forget_f16) {
1966 generic_lstm_elt_gpu_test<FLOAT16>(1, 1, 4, 6, 3, true, 0.f, 1);
1969 TEST(lstm_elt_gpu, generic_lstm_elt_test_clip_input_forget_f16) {
1970 generic_lstm_elt_gpu_test<FLOAT16>(1, 1, 4, 6, 3, true, 0.5f, 1);
1973 TEST(lstm_elt_gpu, generic_lstm_elt_test_f16) {
1974 generic_lstm_elt_gpu_test<FLOAT16>(1, 1, 4, 6, 3, true);
1977 TEST(lstm_elt_gpu, generic_lstm_elt_no_cell_f16) {
1978 generic_lstm_elt_gpu_test<FLOAT16>(1, 1, 4, 6, 3, false);
1981 TEST(lstm_gpu, generic_lstm_f16) {
1982 generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 3, 3, 2, true, true, true);
1985 TEST(lstm_gpu, generic_lstm_no_bias_f16) {
1986 generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 3, 3, 2, false, true, true);
1989 TEST(lstm_gpu, generic_lstm_no_hidden_f16) {
1990 generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 5, 4, 3, true, false, true);
1993 TEST(lstm_gpu, generic_lstm_no_bias_hidden_f16) {
1994 generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 5, 4, 3, false, false, true);
1997 TEST(lstm_gpu, generic_lstm_no_cell_f16) {
1998 generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 5, 4, 3, true, true, false);
2001 TEST(lstm_gpu, generic_lstm_no_bias_cell_f16) {
2002 generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 5, 4, 3, false, true, false);
2005 TEST(lstm_gpu, generic_lstm_no_hidden_cell_f16) {
2006 generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 5, 4, 3, true, false, false);
2009 TEST(lstm_gpu, generic_lstm_no_bias_hidden_cell_f16) {
2010 generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 5, 4, 3, false, false, false);
2013 TEST(lstm_gpu, generic_lstm_clip_f16) {
2014 generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 3, 3, 2, true, true, true, 0.3f, 0);
2017 TEST(lstm_gpu, generic_lstm_input_forget_f16) {
2018 generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 3, 3, 2, true, true, true, 0.f, 1);
2021 TEST(lstm_gpu, generic_lstm_clip_input_forget_f16) {
2022 generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 3, 3, 2, true, true, true, 0.3f, 1);
2025 TEST(lstm_gpu, generic_lstm_offset_order_ifoz_f16) {
2026 default_offset_type = cldnn_lstm_offset_order_ifoz;
2027 generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 3, 3, 2, true, true, true);
2028 default_offset_type = cldnn_lstm_offset_order_iofz;
2031 TEST(lstm_gpu, generic_lstm_canonical_f16) {
2032 generic_lstm_gpu_test<FLOAT16>(1, 1, 1, 1, 1, 1, true, true, true);
2035 // bidirectional support
2036 TEST(lstm_gpu, generic_lstm_bi_bias_f16) {
2037 generic_lstm_gpu_test<FLOAT16>(1, 7, 2, 2, 3, 4, true, false, false);
2040 TEST(lstm_gpu, generic_lstm_bi_bias_hidden_f16) {
2041 generic_lstm_gpu_test<FLOAT16>(1, 7, 2, 2, 3, 4, true, true, false);
2044 TEST(lstm_gpu, generic_lstm_bi_bias_hidden_cell_f16) {
2045 generic_lstm_gpu_test<FLOAT16>(1, 7, 2, 2, 3, 4, true, true, true);
2048 // multi-layer support
2049 TEST(lstm_gpu, generic_lstm_stacked_seq_f16) {
2050 generic_lstm_gpu_test<FLOAT16>(4, 7, 1, 3, 3, 2, true, true, true);
2053 TEST(lstm_gpu, generic_lstm_stacked_bi_f16) {
2054 generic_lstm_gpu_test<FLOAT16>(4, 7, 2, 3, 3, 2, true, true, true);
2057 // TODO: Add tests for the following:
2058 // integration testing using multi-layer and chained LSTMs
2059 // LSTMs single input
2060 // optional activation list