[multi head attention] added unittest
author hyeonseok lee <hs89.lee@samsung.com>
Fri, 15 Jul 2022 05:51:02 +0000 (14:51 +0900)
committer Jijoong Moon <jijoong.moon@samsung.com>
Wed, 7 Sep 2022 12:44:00 +0000 (21:44 +0900)
 - Added layer/model unittests for multi head attention
 - Changed the == operator overload so the comparison passes when both tensors have a NaN value at the same index

Signed-off-by: hyeonseok lee <hs89.lee@samsung.com>
nntrainer/tensor/tensor.cpp
packaging/unittest_layers_v2.tar.gz
packaging/unittest_models_v2.tar.gz
test/input_gen/genLayerTests.py
test/input_gen/genModelTests_v2.py
test/input_gen/recorder_v2.py
test/input_gen/transLayer.py
test/input_gen/transLayer_v2.py
test/unittest/layers/meson.build
test/unittest/layers/unittest_layers_multi_head_attention.cpp [new file with mode: 0644]
test/unittest/models/unittest_models.cpp

index daa4f89..7addada 100644 (file)
@@ -227,7 +227,8 @@ bool Tensor::operator==(const Tensor &rhs) const {
   for (size_t i = 0; i < len; ++i) {
     /** not checking sign change is intentional to avoid float calculation
      * errors around 0 */
-    if (std::isnan(data[i]) || std::isnan(rdata[i]) ||
+    if ((std::isnan(data[i]) && !std::isnan(rdata[i])) ||
+        (!std::isnan(data[i]) && std::isnan(rdata[i])) ||
         std::fabs(data[i] - rdata[i]) > epsilon)
       return false;
   }
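
For clarity, the element-comparison rule above as a standalone Python sketch (illustrative only, not part of the patch; epsilon stands in for the tensor's tolerance constant):

import math

def elements_equal(a, b, epsilon=1e-5):
    # both NaN: treated as equal, matching the new operator== behaviour
    if math.isnan(a) and math.isnan(b):
        return True
    # exactly one NaN: not equal
    if math.isnan(a) or math.isnan(b):
        return False
    # otherwise compare within the tolerance
    return abs(a - b) <= epsilon

assert elements_equal(float("nan"), float("nan"))
assert not elements_equal(float("nan"), 0.0)
assert elements_equal(1.0, 1.0 + 1e-7)
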
index 7529d01..1e5b099 100644 (file)
Binary files a/packaging/unittest_layers_v2.tar.gz and b/packaging/unittest_layers_v2.tar.gz differ
index 42d3dbe..ced1a23 100644 (file)
Binary files a/packaging/unittest_models_v2.tar.gz and b/packaging/unittest_models_v2.tar.gz differ
index 3d88ab8..405bb2c 100644 (file)
@@ -98,6 +98,21 @@ if __name__ == "__main__":
     record_single(attention, [(2, 5, 7), (2, 3, 7), (2, 3, 7)],
                  "attention_batched", {}, input_type='float')
 
+    # use float data to generate input here
+    multi_head_attention = K.layers.MultiHeadAttention(num_heads=2, key_dim=3)
+    record_single(multi_head_attention, [(1, 5, 7), (1, 3, 7), (1, 3, 7)],
+                 "multi_head_attention_single_batch", {}, input_type='float')
+    record_single(multi_head_attention, [(2, 5, 7), (2, 3, 7), (2, 3, 7)],
+                 "multi_head_attention", {}, input_type='float')
+    record_single(multi_head_attention, [(2, 5, 7), (2, 3, 7), (2, 3, 7)],
+                 "multi_head_attention_return_attention_scores", {"return_attention_scores":True}, input_type='float')
+    multi_head_attention = K.layers.MultiHeadAttention(num_heads=2, key_dim=3, value_dim=5)
+    record_single(multi_head_attention, [(2, 5, 7), (2, 3, 7), (2, 3, 7)],
+                 "multi_head_attention_value_dim", {}, input_type='float')
+    multi_head_attention = K.layers.MultiHeadAttention(num_heads=2, key_dim=3, output_shape=5)
+    record_single(multi_head_attention, [(2, 5, 7), (2, 3, 7), (2, 3, 7)],
+                 "multi_head_attention_output_shape", {}, input_type='float')
+
     rnn = K.layers.SimpleRNN(units=5,
                          activation="tanh",
                          return_sequences=False,
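
For reference, the shapes recorded above can be cross-checked against Keras directly; a standalone snippet (illustrative only, not part of the generator):

import numpy as np
import tensorflow as tf

layer = tf.keras.layers.MultiHeadAttention(num_heads=2, key_dim=3)
query = np.random.rand(2, 5, 7).astype(np.float32)
key = value = np.random.rand(2, 3, 7).astype(np.float32)
output, scores = layer(query, value, key, return_attention_scores=True)
print(output.shape)  # (2, 5, 7): output keeps the query feature dim by default
print(scores.shape)  # (2, 2, 5, 3): (batch, num_heads, query_len, key_len)
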
index 9ebd599..842d036 100644 (file)
@@ -8,7 +8,7 @@
 # @brief Generate model tcs
 # @author Parichay Kapoor <pk.kapoor@samsung.com>
 
-from recorder_v2 import record_v2, inspect_file
+from recorder_v2 import record_v2, inspect_file, _rand_like
 import torch
 
 class ReduceMeanLast(torch.nn.Module):
@@ -72,6 +72,53 @@ class MolAttention(torch.nn.Module):
 
         return (output, kappa), loss
 
+class MultiHeadAttention(torch.nn.Module):
+    def __init__(self, embed_dim, num_heads, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None, need_weights=True, provide_attention_mask=False):
+        super(MultiHeadAttention, self).__init__()
+        self.multi_head_attention = torch.nn.MultiheadAttention(embed_dim, num_heads, dropout, bias, add_bias_kv, add_zero_attn, kdim, vdim, batch_first=True)
+        self.loss = torch.nn.MSELoss()
+        self.need_weights = need_weights
+        self.provide_attention_mask = provide_attention_mask
+
+    def forward(self, inputs, labels):
+        inputs, attn_mask = (inputs[:-1], inputs[-1]) if self.provide_attention_mask else (inputs, None)
+        query, *left = inputs
+        if len(left) == 0:
+            key = value = query
+        else:
+            key, value = left
+
+        output, attention_weight = self.multi_head_attention(query, key, value, need_weights=self.need_weights, attn_mask=attn_mask)
+        loss = self.loss(output, labels[0])
+        if attention_weight is not None:
+            output = [output, attention_weight]
+
+        return output, loss
+
+    def input_label_reader(input_dims, label_dims, input_dtype):
+        query_dim, key_dim, value_dim, *left_dim = input_dims
+        query_dtype, key_dtype, value_dtype, *left_dtype = input_dtype
+        assert(query_dtype == key_dtype == value_dtype)
+        if left_dim != []:
+            mask_dim = left_dim[0]
+            mask_dtype = left_dtype[0]
+            if mask_dtype == bool:
+                # Since nntrainer does not support bool type tensor yet, convert mask to float type
+                # todo: return bool type mask tensor
+                mask = torch.randn(mask_dim) > 0.5
+                new_attn_mask = torch.zeros_like(mask, dtype=torch.float32)
+                new_attn_mask.masked_fill_(mask, float("-inf"))
+                mask = [new_attn_mask]
+            elif mask_dtype == int:
+                mask = [torch.randint(0, 1, mask_dim, dtype=torch.int32)]
+            else:
+                mask = _rand_like([mask_dim], -1e9, mask_dtype)
+        else:
+            mask = []
+        inputs = _rand_like([query_dim, key_dim, value_dim], dtype=input_dtype if input_dtype is not None else float) + mask
+        labels = _rand_like(label_dims, dtype=float)
+        return inputs, labels
+
 class FCRelu(torch.nn.Module):
     def __init__(self, decay=False):
         super().__init__()
@@ -129,6 +176,63 @@ if __name__ == "__main__":
         name="mol_attention",
     )
 
+    record_v2(
+        MultiHeadAttention(embed_dim=6, num_heads=2, bias=False, need_weights=False),
+        iteration=2,
+        input_dims=[(3,3,6), (3,2,6), (3,2,6)],
+        label_dims=[(3,3,6)],
+        input_dtype=[float, float, float],
+        name="multi_head_attention_disable_need_weights",
+    )
+
+    record_v2(
+        MultiHeadAttention(embed_dim=6, num_heads=2),
+        iteration=2,
+        input_dims=[(3,3,6), (3,2,6), (3,2,6)],
+        label_dims=[(3,3,6), (3,3,2)],
+        input_dtype=[float, float, float],
+        name="multi_head_attention",
+    )
+
+    record_v2(
+        MultiHeadAttention(embed_dim=6, num_heads=2, kdim=4, vdim=5),
+        iteration=2,
+        input_dims=[(3,3,6), (3,2,4), (3,2,5)],
+        label_dims=[(3,3,6), (3,3,2)],
+        input_dtype=[float, float, float],
+        name="multi_head_attention_kdim_vdim",
+    )
+
+    record_v2(
+        MultiHeadAttention(embed_dim=6, num_heads=2, provide_attention_mask=True),
+        iteration=2,
+        input_dims=[(3,3,6), (3,2,6), (3,2,6), (6,3,2)],
+        label_dims=[(3,3,6), (3,3,2)],
+        input_dtype=[float, float, float, float],
+        input_label_reader=MultiHeadAttention.input_label_reader,
+        name="multi_head_attention_float_attn_mask",
+    )
+
+    # @todo: change this pseudo bool type tensor to actual bool tensor
+    record_v2(
+        MultiHeadAttention(embed_dim=6, num_heads=2, provide_attention_mask=True),
+        iteration=2,
+        input_dims=[(3,3,6), (3,2,6), (3,2,6), (6,3,2)],
+        label_dims=[(3,3,6), (3,3,2)],
+        input_dtype=[float, float, float, bool],
+        input_label_reader=MultiHeadAttention.input_label_reader,
+        name="multi_head_attention_pseudo_bool_attn_mask",
+    )
+
+    record_v2(
+        MultiHeadAttention(embed_dim=6, num_heads=2),
+        iteration=2,
+        input_dims=[(3,3,6)],
+        label_dims=[(3,3,6), (3,3,3)],
+        input_dtype=[float],
+        name="multi_head_attention_self_attention",
+    )
+
     fc_relu_decay = FCRelu(decay=True)
     record_v2(
         fc_relu_decay,
index 5a58b9d..9bc219c 100644 (file)
@@ -79,18 +79,19 @@ def record_v2(model, iteration, input_dims, label_dims, name, clip=False,
 
     def record_iteration(write_fn):
         if input_label_reader != None:
-            inputs, labels = input_label_reader(input_dims, label_dims)
+            inputs, labels = input_label_reader(input_dims, label_dims, input_dtype)
         else:
             inputs = _rand_like(input_dims, dtype=input_dtype if input_dtype is not None else float)
             labels = _rand_like(label_dims, dtype=float)
         write_fn(inputs)
         write_fn(labels)
         write_fn(list(t for _, t in params_translated(model)))
-        output, loss = model(inputs, labels)
+        output, *losses = model(inputs, labels)
         write_fn(output)
 
         optimizer.zero_grad()
-        loss.backward()
+        for loss in losses:
+            loss.backward()
         if clip:
             norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 0.0001)
         optimizer.step()
index 9af1b42..77a0592 100644 (file)
@@ -257,7 +257,6 @@ class GRUCellTransLayer(IdentityTransLayer):
         input = inputs[0]
         states = inputs[1:]
         output, states = self.tf_layer.call(input, states)
-        # print(output)
         return output
 
     def to_nntr_weights(self, tensorOrList):
@@ -271,6 +270,23 @@ class GRUCellTransLayer(IdentityTransLayer):
     def to_nntr_trainable_weights(self, tensorOrList):
         return self.to_nntr_weights(tensorOrList)
 
+class MultiHeadAttentionTransLayer(IdentityTransLayer):
+    def build(self, input_shape):
+        if not self.built:
+            query = tf.random.normal(input_shape[0])
+            key = tf.random.normal(input_shape[1]) if len(input_shape) == 3 else None
+            value = tf.random.normal(input_shape[2 if len(input_shape) == 3 else 1])
+            self.tf_layer(query, value, key)
+
+    ##
+    # @brief call function
+    # @param inputs input with nntrainer layout
+    def call(self, inputs, provide_attention_mask=False, return_attention_scores=False):
+        inputs, mask = (inputs[:-1], inputs[-1]) if provide_attention_mask else (inputs, None)
+        query, key, value = (inputs[0], inputs[1], inputs[2]) if len(inputs) == 3 else (inputs[0], None, inputs[1])
+        output = self.tf_layer.call(query, value, key, mask, return_attention_scores=return_attention_scores)
+        return [output[0], output[1]] if return_attention_scores else output
+
 ##
 # @brief A factory function to attach translayer to existing layer
 # if nothing should be attached, it does not attach the layer
@@ -292,4 +308,7 @@ def attach_trans_layer(layer):
     if isinstance(layer, K.layers.GRUCell):
         return GRUCellTransLayer(layer)
 
+    if isinstance(layer, K.layers.MultiHeadAttention):
+        return MultiHeadAttentionTransLayer(layer)
+
     return layer
index ca0b621..691afc4 100644 (file)
@@ -115,6 +115,45 @@ def gru_translate(model):
 
     yield from new_params
 
+@register_for_((torch.nn.MultiheadAttention))
+def multi_head_attention_translate(model):
+    def transpose_(weight):
+        return (weight[0], weight[1].transpose(1, 0))
+
+    params = [(name, tensor.detach()) for name, tensor in model.named_parameters()]
+
+    getParamByName = lambda name: list(filter(lambda param: param[0] == name, params))[0]
+
+    if model._qkv_same_embed_dim:
+        in_proj_weight = getParamByName('in_proj_weight')
+        w_q, w_k, w_v = in_proj_weight[1].chunk(3)
+        q_proj_weight = ('q_proj_weight', w_q)
+        k_proj_weight = ('k_proj_weight', w_k)
+        v_proj_weight = ('v_proj_weight', w_v)
+    else:
+        q_proj_weight = getParamByName('q_proj_weight')
+        k_proj_weight = getParamByName('k_proj_weight')
+        v_proj_weight = getParamByName('v_proj_weight')
+
+    if model.in_proj_bias is not None:
+        in_proj_bias = getParamByName('in_proj_bias')
+        w_q, w_k, w_v = in_proj_bias[1].chunk(3)
+        q_proj_bias = ('q_proj_bias', w_q)
+        k_proj_bias = ('k_proj_bias', w_k)
+        v_proj_bias = ('v_proj_bias', w_v)
+
+    out_proj_weight = getParamByName('out_proj.weight')
+
+    if model.in_proj_bias is not None:
+        out_proj_bias = getParamByName('out_proj.bias')
+
+    if model.in_proj_bias is None:
+        new_params = [transpose_(q_proj_weight), transpose_(k_proj_weight), transpose_(v_proj_weight), transpose_(out_proj_weight)]
+    else:
+        new_params = [transpose_(q_proj_weight), q_proj_bias, transpose_(k_proj_weight), k_proj_bias, transpose_(v_proj_weight), v_proj_bias, transpose_(out_proj_weight), out_proj_bias]
+
+    yield from new_params
+
 def translate(model):
     for child in model.children():
         for registered_classes, fn in handler_book:
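
The translator above relies on torch packing the q/k/v projections into a single in_proj_weight when query, key and value share the embedding dim; a standalone sanity check of that layout (illustrative only, arbitrary dimensions):

import torch

mha = torch.nn.MultiheadAttention(embed_dim=6, num_heads=2, batch_first=True)
assert mha._qkv_same_embed_dim
assert mha.in_proj_weight.shape == (3 * 6, 6)  # q, k, v stacked along dim 0

w_q, w_k, w_v = mha.in_proj_weight.detach().chunk(3)
# each projection weight is (embed_dim, embed_dim) before the transpose
assert w_q.shape == w_k.shape == w_v.shape == (6, 6)
# nntrainer expects the transposed (in, out) layout, hence transpose_ above
assert w_q.transpose(1, 0).shape == (6, 6)
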
index 502f60a..b6f43c5 100644 (file)
@@ -60,6 +60,7 @@ test_target = [
   'unittest_layers_dropout.cpp',
   'unittest_layers_reshape.cpp',
   # 'unittest_layers_mol_attention.cpp',
+  'unittest_layers_multi_head_attention.cpp',
 ]
 
 if get_option('enable-tflite-backbone')
diff --git a/test/unittest/layers/unittest_layers_multi_head_attention.cpp b/test/unittest/layers/unittest_layers_multi_head_attention.cpp
new file mode 100644 (file)
index 0000000..995d628
--- /dev/null
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2022 hyeonseok Lee <hs89.lee@samsung.com>
+ *
+ * @file unittest_layers_multi_head_attention.cpp
+ * @date 13 July 2022
+ * @brief Multi Head Attention Layer Test
+ * @see        https://github.com/nnstreamer/nntrainer
+ * @author hyeonseok Lee <hs89.lee@samsung.com>
+ * @bug No known bugs except for NYI items
+ */
+#include <tuple>
+
+#include <gtest/gtest.h>
+
+#include <layers_common_tests.h>
+#include <multi_head_attention_layer.h>
+
+auto semantic_multi_head_attention = LayerSemanticsParamType(
+  nntrainer::createLayer<nntrainer::MultiHeadAttentionLayer>,
+  nntrainer::MultiHeadAttentionLayer::type,
+  {"num_heads=1", "projected_key_dim=1"}, 0, false, 3);
+
+auto semantic_multi_head_attention_with_mask = LayerSemanticsParamType(
+  nntrainer::createLayer<nntrainer::MultiHeadAttentionLayer>,
+  nntrainer::MultiHeadAttentionLayer::type,
+  {"num_heads=1", "projected_key_dim=1"}, 0, false, 4);
+
+GTEST_PARAMETER_TEST(
+  MultiHeadAttention, LayerSemantics,
+  ::testing::Values(semantic_multi_head_attention,
+                    semantic_multi_head_attention_with_mask));
+
+auto multi_head_attention_single_batch = LayerGoldenTestParamType(
+  nntrainer::createLayer<nntrainer::MultiHeadAttentionLayer>,
+  {"num_heads=2", "projected_key_dim=3"}, "1:1:5:7,1:1:3:7,1:1:3:7",
+  "multi_head_attention_single_batch.nnlayergolden",
+  LayerGoldenTestParamOptions::DEFAULT);
+
+auto multi_head_attention = LayerGoldenTestParamType(
+  nntrainer::createLayer<nntrainer::MultiHeadAttentionLayer>,
+  {"num_heads=2", "projected_key_dim=3"}, "2:1:5:7,2:1:3:7,2:1:3:7",
+  "multi_head_attention.nnlayergolden", LayerGoldenTestParamOptions::DEFAULT);
+
+auto multi_head_attention_return_attention_scores = LayerGoldenTestParamType(
+  nntrainer::createLayer<nntrainer::MultiHeadAttentionLayer>,
+  {"num_heads=2", "projected_key_dim=3", "return_attention_weight=before",
+   "average_attention_weight=false"},
+  "2:1:5:7,2:1:3:7,2:1:3:7",
+  "multi_head_attention_return_attention_scores.nnlayergolden",
+  LayerGoldenTestParamOptions::DEFAULT);
+
+auto multi_head_attention_value_dim = LayerGoldenTestParamType(
+  nntrainer::createLayer<nntrainer::MultiHeadAttentionLayer>,
+  {"num_heads=2", "projected_key_dim=3", "projected_value_dim=5"},
+  "2:1:5:7,2:1:3:7,2:1:3:7", "multi_head_attention_value_dim.nnlayergolden",
+  LayerGoldenTestParamOptions::DEFAULT);
+
+auto multi_head_attention_output_shape = LayerGoldenTestParamType(
+  nntrainer::createLayer<nntrainer::MultiHeadAttentionLayer>,
+  {"num_heads=2", "projected_key_dim=3", "output_shape=5"},
+  "2:1:5:7,2:1:3:7,2:1:3:7", "multi_head_attention_output_shape.nnlayergolden",
+  LayerGoldenTestParamOptions::DEFAULT);
+
+GTEST_PARAMETER_TEST(
+  MultiHeadAttention, LayerGoldenTest,
+  ::testing::Values(multi_head_attention_single_batch, multi_head_attention,
+                    multi_head_attention_return_attention_scores,
+                    multi_head_attention_value_dim,
+                    multi_head_attention_output_shape));
index 9079b82..2e3ab1b 100644 (file)
@@ -97,6 +97,134 @@ static std::unique_ptr<NeuralNetwork> makeMolAttentionMasked() {
   return nn;
 }
 
+static std::unique_ptr<NeuralNetwork>
+makeMultiHeadAttention_disable_need_weights() {
+  std::unique_ptr<NeuralNetwork> nn(new NeuralNetwork());
+  nn->setProperty({"batch_size=3"});
+
+  auto outer_graph = makeGraph({
+    {"input", {"name=input_0", "input_shape=1:3:6"}},
+    {"input", {"name=input_1", "input_shape=1:2:6"}},
+    {"input", {"name=input_2", "input_shape=1:2:6"}},
+    {"multi_head_attention",
+     {"name=multi_head_attention", "input_layers=input_0, input_1, input_2",
+      "disable_bias=true", "num_heads=2"}},
+    {"mse", {"name=loss", "input_layers=multi_head_attention"}},
+  });
+
+  for (auto &node : outer_graph) {
+    nn->addLayer(node);
+  }
+
+  nn->setOptimizer(ml::train::createOptimizer("sgd", {"learning_rate = 0.1"}));
+  nn->setProperty({"input_layers=input_0, input_1, input_2"});
+
+  return nn;
+}
+
+static std::unique_ptr<NeuralNetwork> makeMultiHeadAttention() {
+  std::unique_ptr<NeuralNetwork> nn(new NeuralNetwork());
+  nn->setProperty({"batch_size=3"});
+
+  auto outer_graph = makeGraph({
+    {"input", {"name=input_0", "input_shape=1:3:6"}},
+    {"input", {"name=input_1", "input_shape=1:2:6"}},
+    {"input", {"name=input_2", "input_shape=1:2:6"}},
+    {"multi_head_attention",
+     {"name=multi_head_attention", "input_layers=input_0, input_1, input_2",
+      "num_heads=2", "return_attention_weight=after"}},
+    {"mse", {"name=loss1", "input_layers=multi_head_attention(0)"}},
+    {"mse", {"name=loss2", "input_layers=multi_head_attention(1)"}},
+  });
+
+  for (auto &node : outer_graph) {
+    nn->addLayer(node);
+  }
+
+  nn->setOptimizer(ml::train::createOptimizer("sgd", {"learning_rate = 0.1"}));
+  nn->setProperty(
+    {"input_layers=input_0, input_1, input_2", "label_layers=loss1, loss2"});
+
+  return nn;
+}
+
+static std::unique_ptr<NeuralNetwork> makeMultiHeadAttention_kdim_vdim() {
+  std::unique_ptr<NeuralNetwork> nn(new NeuralNetwork());
+  nn->setProperty({"batch_size=3"});
+
+  auto outer_graph = makeGraph({
+    {"input", {"name=input_0", "input_shape=1:3:6"}},
+    {"input", {"name=input_1", "input_shape=1:2:4"}},
+    {"input", {"name=input_2", "input_shape=1:2:5"}},
+    {"multi_head_attention",
+     {"name=multi_head_attention", "input_layers=input_0, input_1, input_2",
+      "num_heads=2", "return_attention_weight=after"}},
+    {"mse", {"name=loss1", "input_layers=multi_head_attention(0)"}},
+    {"mse", {"name=loss2", "input_layers=multi_head_attention(1)"}},
+  });
+
+  for (auto &node : outer_graph) {
+    nn->addLayer(node);
+  }
+
+  nn->setOptimizer(ml::train::createOptimizer("sgd", {"learning_rate = 0.1"}));
+  nn->setProperty(
+    {"input_layers=input_0, input_1, input_2", "label_layers=loss1, loss2"});
+
+  return nn;
+}
+
+static std::unique_ptr<NeuralNetwork> makeMultiHeadAttention_float_attn_mask() {
+  std::unique_ptr<NeuralNetwork> nn(new NeuralNetwork());
+  nn->setProperty({"batch_size=3"});
+
+  auto outer_graph = makeGraph({
+    {"input", {"name=input_0", "input_shape=1:3:6"}},
+    {"input", {"name=input_1", "input_shape=1:2:6"}},
+    {"input", {"name=input_2", "input_shape=1:2:6"}},
+    {"input", {"name=input_3", "input_shape=2:3:2"}},
+    {"multi_head_attention",
+     {"name=multi_head_attention",
+      "input_layers=input_0, input_1, input_2, input_3", "num_heads=2",
+      "return_attention_weight=after"}},
+    {"mse", {"name=loss1", "input_layers=multi_head_attention(0)"}},
+    {"mse", {"name=loss2", "input_layers=multi_head_attention(1)"}},
+  });
+
+  for (auto &node : outer_graph) {
+    nn->addLayer(node);
+  }
+
+  nn->setOptimizer(ml::train::createOptimizer("sgd", {"learning_rate = 0.1"}));
+  nn->setProperty({"input_layers=input_0, input_1, input_2, input_3",
+                   "label_layers=loss1, loss2"});
+
+  return nn;
+}
+
+static std::unique_ptr<NeuralNetwork> makeMultiHeadAttention_self_attention() {
+  std::unique_ptr<NeuralNetwork> nn(new NeuralNetwork());
+  nn->setProperty({"batch_size=3"});
+
+  auto outer_graph = makeGraph({
+    {"input", {"name=input_0", "input_shape=1:3:6"}},
+    {"multi_head_attention",
+     {"name=multi_head_attention", "input_layers=input_0, input_0, input_0",
+      "num_heads=2", "return_attention_weight=after"}},
+    {"mse", {"name=loss1", "input_layers=multi_head_attention(0)"}},
+    {"mse", {"name=loss2", "input_layers=multi_head_attention(1)"}},
+  });
+
+  for (auto &node : outer_graph) {
+    nn->addLayer(node);
+  }
+
+  nn->setOptimizer(ml::train::createOptimizer("sgd", {"learning_rate = 0.1"}));
+  nn->setProperty({"input_layers=input_0", "label_layers=loss1, loss2"});
+
+  return nn;
+}
+
 GTEST_PARAMETER_TEST(
   model, nntrainerModelTest,
   ::testing::ValuesIn({
@@ -106,6 +234,23 @@ GTEST_PARAMETER_TEST(
                  ModelTestOption::COMPARE_V2),
     mkModelTc_V2(makeMolAttentionMasked, "mol_attention_masked",
                  ModelTestOption::COMPARE_RUN_V2),
+    mkModelTc_V2(makeMultiHeadAttention_disable_need_weights,
+                 "multi_head_attention_disable_need_weights",
+                 ModelTestOption::ALL_V2),
+    mkModelTc_V2(makeMultiHeadAttention, "multi_head_attention",
+                 ModelTestOption::ALL_V2),
+    mkModelTc_V2(makeMultiHeadAttention_kdim_vdim,
+                 "multi_head_attention_kdim_vdim", ModelTestOption::ALL_V2),
+    mkModelTc_V2(makeMultiHeadAttention_float_attn_mask,
+                 "multi_head_attention_float_attn_mask",
+                 ModelTestOption::ALL_V2),
+    /** @todo:change model if bool type tensor is supported */
+    mkModelTc_V2(makeMultiHeadAttention_float_attn_mask,
+                 "multi_head_attention_pseudo_bool_attn_mask",
+                 ModelTestOption::ALL_V2),
+    mkModelTc_V2(makeMultiHeadAttention_self_attention,
+                 "multi_head_attention_self_attention",
+                 ModelTestOption::ALL_V2),
     mkModelIniTc(fc_relu_decay, DIM_UNUSED, NOT_USED_,
                  ModelTestOption::COMPARE_V2),
   }),