for (size_t i = 0; i < len; ++i) {
/** not checking sign change is intentional to avoid float calculation
* errors around 0 */
- if (std::isnan(data[i]) || std::isnan(rdata[i]) ||
+ if ((std::isnan(data[i]) && !std::isnan(rdata[i])) ||
+ (!std::isnan(data[i]) && std::isnan(rdata[i])) ||
std::fabs(data[i] - rdata[i]) > epsilon)
return false;
}
record_single(attention, [(2, 5, 7), (2, 3, 7), (2, 3, 7)],
"attention_batched", {}, input_type='float')
+ # use float data to generate input here
+ multi_head_attention = K.layers.MultiHeadAttention(num_heads=2, key_dim=3)
+ record_single(multi_head_attention, [(1, 5, 7), (1, 3, 7), (1, 3, 7)],
+ "multi_head_attention_single_batch", {}, input_type='float')
+ record_single(multi_head_attention, [(2, 5, 7), (2, 3, 7), (2, 3, 7)],
+ "multi_head_attention", {}, input_type='float')
+ record_single(multi_head_attention, [(2, 5, 7), (2, 3, 7), (2, 3, 7)],
+ "multi_head_attention_return_attention_scores", {"return_attention_scores":True}, input_type='float')
+ multi_head_attention = K.layers.MultiHeadAttention(num_heads=2, key_dim=3, value_dim=5)
+ record_single(multi_head_attention, [(2, 5, 7), (2, 3, 7), (2, 3, 7)],
+ "multi_head_attention_value_dim", {}, input_type='float')
+ multi_head_attention = K.layers.MultiHeadAttention(num_heads=2, key_dim=3, output_shape=5)
+ record_single(multi_head_attention, [(2, 5, 7), (2, 3, 7), (2, 3, 7)],
+ "multi_head_attention_output_shape", {}, input_type='float')
+
rnn = K.layers.SimpleRNN(units=5,
activation="tanh",
return_sequences=False,
# @brief Generate model tcs
# @author Parichay Kapoor <pk.kapoor@samsung.com>
-from recorder_v2 import record_v2, inspect_file
+from recorder_v2 import record_v2, inspect_file, _rand_like
import torch
class ReduceMeanLast(torch.nn.Module):
return (output, kappa), loss
+class MultiHeadAttention(torch.nn.Module):
+ def __init__(self, embed_dim, num_heads, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None, need_weights=True, provide_attention_mask=False):
+ super(MultiHeadAttention, self).__init__()
+ self.multi_head_attention = torch.nn.MultiheadAttention(embed_dim, num_heads, dropout, bias, add_bias_kv, add_zero_attn, kdim, vdim, batch_first=True)
+ self.loss = torch.nn.MSELoss()
+ self.need_weights = need_weights
+ self.provide_attention_mask = provide_attention_mask
+
+ def forward(self, inputs, labels):
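+ # the last input is the attention mask when provide_attention_mask is set;
+ # with only a query given, fall back to self attention (key = value = query)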
+ inputs, attn_mask = (inputs[:-1], inputs[-1]) if self.provide_attention_mask else (inputs, None)
+ query, *left = inputs
+ if len(left) == 0:
+ key = value = query
+ else:
+ key, value = left
+
+ output, attention_weight = self.multi_head_attention(query, key, value, need_weights=self.need_weights, attn_mask=attn_mask)
+ loss = self.loss(output, labels[0])
+ if attention_weight is not None:
+ output = [output, attention_weight]
+
+ return output, loss
+
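+ # custom reader for record_v2: generates random query/key/value inputs and,
+ # when a mask shape is given, an attention mask whose representation depends
+ # on the requested dtype (bool masks are emulated with 0 / -inf float values)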
+ def input_label_reader(input_dims, label_dims, input_dtype):
+ query_dim, key_dim, value_dim, *left_dim = input_dims
+ query_dtype, key_dtype, value_dtype, *left_dtype = input_dtype
+ assert(query_dtype == key_dtype == value_dtype)
+ if left_dim != []:
+ mask_dim = left_dim[0]
+ mask_dtype = left_dtype[0]
+ if mask_dtype == bool:
+ # Since nntrainer does not support bool type tensor yet, convert mask to float type
+ # todo: return bool type mask tensor
+ mask = torch.randn(mask_dim) > 0.5
+ new_attn_mask = torch.zeros_like(mask, dtype=torch.float32)
+ new_attn_mask.masked_fill_(mask, float("-inf"))
+ mask = [new_attn_mask]
+ elif mask_dtype == int:
+ mask = [torch.randint(0, 2, mask_dim, dtype=torch.int32)]
+ else:
+ mask = _rand_like([mask_dim], -1e9, mask_dtype)
+ else:
+ mask = []
+ inputs = _rand_like([query_dim, key_dim, value_dim], dtype=input_dtype if input_dtype is not None else float) + mask
+ labels = _rand_like(label_dims, dtype=float)
+ return inputs, labels
+
class FCRelu(torch.nn.Module):
def __init__(self, decay=False):
super().__init__()
name="mol_attention",
)
+ record_v2(
+ MultiHeadAttention(embed_dim=6, num_heads=2, bias=False, need_weights=False),
+ iteration=2,
+ input_dims=[(3,3,6), (3,2,6), (3,2,6)],
+ label_dims=[(3,3,6)],
+ input_dtype=[float, float, float],
+ name="multi_head_attention_disable_need_weights",
+ )
+
+ record_v2(
+ MultiHeadAttention(embed_dim=6, num_heads=2),
+ iteration=2,
+ input_dims=[(3,3,6), (3,2,6), (3,2,6)],
+ label_dims=[(3,3,6), (3,3,2)],
+ input_dtype=[float, float, float],
+ name="multi_head_attention",
+ )
+
+ record_v2(
+ MultiHeadAttention(embed_dim=6, num_heads=2, kdim=4, vdim=5),
+ iteration=2,
+ input_dims=[(3,3,6), (3,2,4), (3,2,5)],
+ label_dims=[(3,3,6), (3,3,2)],
+ input_dtype=[float, float, float],
+ name="multi_head_attention_kdim_vdim",
+ )
+
+ record_v2(
+ MultiHeadAttention(embed_dim=6, num_heads=2, provide_attention_mask=True),
+ iteration=2,
+ input_dims=[(3,3,6), (3,2,6), (3,2,6), (6,3,2)],
+ label_dims=[(3,3,6), (3,3,2)],
+ input_dtype=[float, float, float, float],
+ input_label_reader=MultiHeadAttention.input_label_reader,
+ name="multi_head_attention_float_attn_mask",
+ )
+
+ # @todo: change this pseudo bool type tensor to actual bool tensor
+ record_v2(
+ MultiHeadAttention(embed_dim=6, num_heads=2, provide_attention_mask=True),
+ iteration=2,
+ input_dims=[(3,3,6), (3,2,6), (3,2,6), (6,3,2)],
+ label_dims=[(3,3,6), (3,3,2)],
+ input_dtype=[float, float, float, bool],
+ input_label_reader=MultiHeadAttention.input_label_reader,
+ name="multi_head_attention_pseudo_bool_attn_mask",
+ )
+
+ record_v2(
+ MultiHeadAttention(embed_dim=6, num_heads=2),
+ iteration=2,
+ input_dims=[(3,3,6)],
+ label_dims=[(3,3,6), (3,3,3)],
+ input_dtype=[float],
+ name="multi_head_attention_self_attention",
+ )
+
fc_relu_decay = FCRelu(decay=True)
record_v2(
fc_relu_decay,
def record_iteration(write_fn):
if input_label_reader != None:
- inputs, labels = input_label_reader(input_dims, label_dims)
+ inputs, labels = input_label_reader(input_dims, label_dims, input_dtype)
else:
inputs = _rand_like(input_dims, dtype=input_dtype if input_dtype is not None else float)
labels = _rand_like(label_dims, dtype=float)
write_fn(inputs)
write_fn(labels)
write_fn(list(t for _, t in params_translated(model)))
- output, loss = model(inputs, labels)
+ output, *losses = model(inputs, labels)
write_fn(output)
optimizer.zero_grad()
- loss.backward()
+ for loss in losses:
+ loss.backward()
if clip:
norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 0.0001)
optimizer.step()
input = inputs[0]
states = inputs[1:]
output, states = self.tf_layer.call(input, states)
- # print(output)
return output
def to_nntr_weights(self, tensorOrList):
def to_nntr_trainable_weights(self, tensorOrList):
return self.to_nntr_weights(tensorOrList)
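+##
+# @brief Translayer for the keras MultiHeadAttention layer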
+class MultiHeadAttentionTransLayer(IdentityTransLayer):
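+ ##
+ # @brief build the wrapped tf layer by calling it once so that its weights are created
+ # @param input_shape list of input shapes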
+ def build(self, input_shape):
+ if not self.built:
+ query = tf.random.normal(input_shape[0])
+ key = tf.random.normal(input_shape[1]) if len(input_shape) == 3 else None
+ value = tf.random.normal(input_shape[2 if len(input_shape) == 3 else 1])
+ self.tf_layer(query, value, key)
+
+ ##
+ # @brief call function
+ # @param inputs input with nntrainer layout
+ def call(self, inputs, provide_attention_mask=False, return_attention_scores=False):
+ inputs, mask = (inputs[:-1], inputs[-1]) if provide_attention_mask else (inputs, None)
+ query, key, value = (inputs[0], inputs[1], inputs[2]) if len(inputs) == 3 else (inputs[0], None, inputs[1])
+ output = self.tf_layer.call(query, value, key, mask, return_attention_scores=return_attention_scores)
+ return [output[0], output[1]] if return_attention_scores else output
+
##
# @brief A factory function to attach translayer to existing layer
# if nothing should be attached, it does not attach the layer
if isinstance(layer, K.layers.GRUCell):
return GRUCellTransLayer(layer)
+ if isinstance(layer, K.layers.MultiHeadAttention):
+ return MultiHeadAttentionTransLayer(layer)
+
return layer
yield from new_params
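+##
+# @brief translate torch MultiheadAttention parameters to the nntrainer layout
+# @note torch packs the q/k/v projection weights into one in_proj_weight when their
+#       embedding dims match, so it is split with chunk(3); projection weights are
+#       transposed since torch stores Linear weights as (out_features, in_features)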
+@register_for_((torch.nn.MultiheadAttention))
+def multi_head_attention_translate(model):
+ def transpose_(weight):
+ return (weight[0], weight[1].transpose(1, 0))
+
+ params = [(name, tensor.detach()) for name, tensor in model.named_parameters()]
+
+ getParamByName = lambda name: list(filter(lambda param: param[0] == name, params))[0]
+
+ if model._qkv_same_embed_dim:
+ in_proj_weight = getParamByName('in_proj_weight')
+ w_q, w_k, w_v = in_proj_weight[1].chunk(3)
+ q_proj_weight = ('q_proj_weight', w_q)
+ k_proj_weight = ('k_proj_weight', w_k)
+ v_proj_weight = ('v_proj_weight', w_v)
+ else:
+ q_proj_weight = getParamByName('q_proj_weight')
+ k_proj_weight = getParamByName('k_proj_weight')
+ v_proj_weight = getParamByName('v_proj_weight')
+
+ if model.in_proj_bias is not None:
+ in_proj_bias = getParamByName('in_proj_bias')
+ b_q, b_k, b_v = in_proj_bias[1].chunk(3)
+ q_proj_bias = ('q_proj_bias', b_q)
+ k_proj_bias = ('k_proj_bias', b_k)
+ v_proj_bias = ('v_proj_bias', b_v)
+
+ out_proj_weight = getParamByName('out_proj.weight')
+
+ if model.in_proj_bias is not None:
+ out_proj_bias = getParamByName('out_proj.bias')
+
+ if model.in_proj_bias is None:
+ new_params = [transpose_(q_proj_weight), transpose_(k_proj_weight), transpose_(v_proj_weight), transpose_(out_proj_weight)]
+ else:
+ new_params = [transpose_(q_proj_weight), q_proj_bias, transpose_(k_proj_weight), k_proj_bias, transpose_(v_proj_weight), v_proj_bias, transpose_(out_proj_weight), out_proj_bias]
+
+ yield from new_params
+
def translate(model):
for child in model.children():
for registered_classes, fn in handler_book:
'unittest_layers_dropout.cpp',
'unittest_layers_reshape.cpp',
# 'unittest_layers_mol_attention.cpp',
+ 'unittest_layers_multi_head_attention.cpp',
]
if get_option('enable-tflite-backbone')
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2022 hyeonseok Lee <hs89.lee@samsung.com>
+ *
+ * @file unittest_layers_multi_head_attention.cpp
+ * @date 13 July 2022
+ * @brief Multi Head Attention Layer Test
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author hyeonseok Lee <hs89.lee@samsung.com>
+ * @bug No known bugs except for NYI items
+ */
+#include <tuple>
+
+#include <gtest/gtest.h>
+
+#include <layers_common_tests.h>
+#include <multi_head_attention_layer.h>
+
+auto semantic_multi_head_attention = LayerSemanticsParamType(
+ nntrainer::createLayer<nntrainer::MultiHeadAttentionLayer>,
+ nntrainer::MultiHeadAttentionLayer::type,
+ {"num_heads=1", "projected_key_dim=1"}, 0, false, 3);
+
+auto semantic_multi_head_attention_with_mask = LayerSemanticsParamType(
+ nntrainer::createLayer<nntrainer::MultiHeadAttentionLayer>,
+ nntrainer::MultiHeadAttentionLayer::type,
+ {"num_heads=1", "projected_key_dim=1"}, 0, false, 4);
+
+GTEST_PARAMETER_TEST(
+ MultiHeadAttention, LayerSemantics,
+ ::testing::Values(semantic_multi_head_attention,
+ semantic_multi_head_attention_with_mask));
+
+auto multi_head_attention_single_batch = LayerGoldenTestParamType(
+ nntrainer::createLayer<nntrainer::MultiHeadAttentionLayer>,
+ {"num_heads=2", "projected_key_dim=3"}, "1:1:5:7,1:1:3:7,1:1:3:7",
+ "multi_head_attention_single_batch.nnlayergolden",
+ LayerGoldenTestParamOptions::DEFAULT);
+
+auto multi_head_attention = LayerGoldenTestParamType(
+ nntrainer::createLayer<nntrainer::MultiHeadAttentionLayer>,
+ {"num_heads=2", "projected_key_dim=3"}, "2:1:5:7,2:1:3:7,2:1:3:7",
+ "multi_head_attention.nnlayergolden", LayerGoldenTestParamOptions::DEFAULT);
+
+auto multi_head_attention_return_attention_scores = LayerGoldenTestParamType(
+ nntrainer::createLayer<nntrainer::MultiHeadAttentionLayer>,
+ {"num_heads=2", "projected_key_dim=3", "return_attention_weight=before",
+ "average_attention_weight=false"},
+ "2:1:5:7,2:1:3:7,2:1:3:7",
+ "multi_head_attention_return_attention_scores.nnlayergolden",
+ LayerGoldenTestParamOptions::DEFAULT);
+
+auto multi_head_attention_value_dim = LayerGoldenTestParamType(
+ nntrainer::createLayer<nntrainer::MultiHeadAttentionLayer>,
+ {"num_heads=2", "projected_key_dim=3", "projected_value_dim=5"},
+ "2:1:5:7,2:1:3:7,2:1:3:7", "multi_head_attention_value_dim.nnlayergolden",
+ LayerGoldenTestParamOptions::DEFAULT);
+
+auto multi_head_attention_output_shape = LayerGoldenTestParamType(
+ nntrainer::createLayer<nntrainer::MultiHeadAttentionLayer>,
+ {"num_heads=2", "projected_key_dim=3", "output_shape=5"},
+ "2:1:5:7,2:1:3:7,2:1:3:7", "multi_head_attention_output_shape.nnlayergolden",
+ LayerGoldenTestParamOptions::DEFAULT);
+
+GTEST_PARAMETER_TEST(
+ MultiHeadAttention, LayerGoldenTest,
+ ::testing::Values(multi_head_attention_single_batch, multi_head_attention,
+ multi_head_attention_return_attention_scores,
+ multi_head_attention_value_dim,
+ multi_head_attention_output_shape));
return nn;
}
+static std::unique_ptr<NeuralNetwork>
+makeMultiHeadAttention_disable_need_weights() {
+ std::unique_ptr<NeuralNetwork> nn(new NeuralNetwork());
+ nn->setProperty({"batch_size=3"});
+
+ auto outer_graph = makeGraph({
+ {"input", {"name=input_0", "input_shape=1:3:6"}},
+ {"input", {"name=input_1", "input_shape=1:2:6"}},
+ {"input", {"name=input_2", "input_shape=1:2:6"}},
+ {"multi_head_attention",
+ {"name=multi_head_attention", "input_layers=input_0, input_1, input_2",
+ "disable_bias=true", "num_heads=2"}},
+ {"mse", {"name=loss", "input_layers=multi_head_attention"}},
+ });
+
+ for (auto &node : outer_graph) {
+ nn->addLayer(node);
+ }
+
+ nn->setOptimizer(ml::train::createOptimizer("sgd", {"learning_rate = 0.1"}));
+ nn->setProperty({"input_layers=input_0, input_1, input_2"});
+
+ return nn;
+}
+
+static std::unique_ptr<NeuralNetwork> makeMultiHeadAttention() {
+ std::unique_ptr<NeuralNetwork> nn(new NeuralNetwork());
+ nn->setProperty({"batch_size=3"});
+
+ auto outer_graph = makeGraph({
+ {"input", {"name=input_0", "input_shape=1:3:6"}},
+ {"input", {"name=input_1", "input_shape=1:2:6"}},
+ {"input", {"name=input_2", "input_shape=1:2:6"}},
+ {"multi_head_attention",
+ {"name=multi_head_attention", "input_layers=input_0, input_1, input_2",
+ "num_heads=2", "return_attention_weight=after"}},
+ {"mse", {"name=loss1", "input_layers=multi_head_attention(0)"}},
+ {"mse", {"name=loss2", "input_layers=multi_head_attention(1)"}},
+ });
+
+ for (auto &node : outer_graph) {
+ nn->addLayer(node);
+ }
+
+ nn->setOptimizer(ml::train::createOptimizer("sgd", {"learning_rate = 0.1"}));
+ nn->setProperty(
+ {"input_layers=input_0, input_1, input_2", "label_layers=loss1, loss2"});
+
+ return nn;
+}
+
+static std::unique_ptr<NeuralNetwork> makeMultiHeadAttention_kdim_vdim() {
+ std::unique_ptr<NeuralNetwork> nn(new NeuralNetwork());
+ nn->setProperty({"batch_size=3"});
+
+ auto outer_graph = makeGraph({
+ {"input", {"name=input_0", "input_shape=1:3:6"}},
+ {"input", {"name=input_1", "input_shape=1:2:4"}},
+ {"input", {"name=input_2", "input_shape=1:2:5"}},
+ {"multi_head_attention",
+ {"name=multi_head_attention", "input_layers=input_0, input_1, input_2",
+ "num_heads=2", "return_attention_weight=after"}},
+ {"mse", {"name=loss1", "input_layers=multi_head_attention(0)"}},
+ {"mse", {"name=loss2", "input_layers=multi_head_attention(1)"}},
+ });
+
+ for (auto &node : outer_graph) {
+ nn->addLayer(node);
+ }
+
+ nn->setOptimizer(ml::train::createOptimizer("sgd", {"learning_rate = 0.1"}));
+ nn->setProperty(
+ {"input_layers=input_0, input_1, input_2", "label_layers=loss1, loss2"});
+
+ return nn;
+}
+
+static std::unique_ptr<NeuralNetwork> makeMultiHeadAttention_float_attn_mask() {
+ std::unique_ptr<NeuralNetwork> nn(new NeuralNetwork());
+ nn->setProperty({"batch_size=3"});
+
+ auto outer_graph = makeGraph({
+ {"input", {"name=input_0", "input_shape=1:3:6"}},
+ {"input", {"name=input_1", "input_shape=1:2:6"}},
+ {"input", {"name=input_2", "input_shape=1:2:6"}},
+ {"input", {"name=input_3", "input_shape=2:3:2"}},
+ {"multi_head_attention",
+ {"name=multi_head_attention",
+ "input_layers=input_0, input_1, input_2, input_3", "num_heads=2",
+ "return_attention_weight=after"}},
+ {"mse", {"name=loss1", "input_layers=multi_head_attention(0)"}},
+ {"mse", {"name=loss2", "input_layers=multi_head_attention(1)"}},
+ });
+
+ for (auto &node : outer_graph) {
+ nn->addLayer(node);
+ }
+
+ nn->setOptimizer(ml::train::createOptimizer("sgd", {"learning_rate = 0.1"}));
+ nn->setProperty({"input_layers=input_0, input_1, input_2, input_3",
+ "label_layers=loss1, loss2"});
+
+ return nn;
+}
+
+static std::unique_ptr<NeuralNetwork> makeMultiHeadAttention_self_attention() {
+ std::unique_ptr<NeuralNetwork> nn(new NeuralNetwork());
+ nn->setProperty({"batch_size=3"});
+
+ auto outer_graph = makeGraph({
+ {"input", {"name=input_0", "input_shape=1:3:6"}},
+ {"multi_head_attention",
+ {"name=multi_head_attention", "input_layers=input_0, input_0, input_0",
+ "num_heads=2", "return_attention_weight=after"}},
+ {"mse", {"name=loss1", "input_layers=multi_head_attention(0)"}},
+ {"mse", {"name=loss2", "input_layers=multi_head_attention(1)"}},
+ });
+
+ for (auto &node : outer_graph) {
+ nn->addLayer(node);
+ }
+
+ nn->setOptimizer(ml::train::createOptimizer("sgd", {"learning_rate = 0.1"}));
+ nn->setProperty({"input_layers=input_0", "label_layers=loss1, loss2"});
+
+ return nn;
+}
+
GTEST_PARAMETER_TEST(
model, nntrainerModelTest,
::testing::ValuesIn({
ModelTestOption::COMPARE_V2),
mkModelTc_V2(makeMolAttentionMasked, "mol_attention_masked",
ModelTestOption::COMPARE_RUN_V2),
+ mkModelTc_V2(makeMultiHeadAttention_disable_need_weights,
+ "multi_head_attention_disable_need_weights",
+ ModelTestOption::ALL_V2),
+ mkModelTc_V2(makeMultiHeadAttention, "multi_head_attention",
+ ModelTestOption::ALL_V2),
+ mkModelTc_V2(makeMultiHeadAttention_kdim_vdim,
+ "multi_head_attention_kdim_vdim", ModelTestOption::ALL_V2),
+ mkModelTc_V2(makeMultiHeadAttention_float_attn_mask,
+ "multi_head_attention_float_attn_mask",
+ ModelTestOption::ALL_V2),
+ /** @todo: change model if bool type tensor is supported */
+ mkModelTc_V2(makeMultiHeadAttention_float_attn_mask,
+ "multi_head_attention_pseudo_bool_attn_mask",
+ ModelTestOption::ALL_V2),
+ mkModelTc_V2(makeMultiHeadAttention_self_attention,
+ "multi_head_attention_self_attention",
+ ModelTestOption::ALL_V2),
mkModelIniTc(fc_relu_decay, DIM_UNUSED, NOT_USED_,
ModelTestOption::COMPARE_V2),
}),