throw std::runtime_error("Attention layer needs 2-3 inputs.");
auto const &all_dims = context.getInputDimensions();
+ auto const &query_dim = all_dims[AttentionParams::query];
auto const &value_dim = all_dims[AttentionParams::value];
+ NNTR_THROW_IF(query_dim.width() != value_dim.width(), std::invalid_argument)
+ << "Query and Value dimension mismatch for layer " << context.getName();
+
wt_idx[AttentionParams::query] = AttentionParams::query;
wt_idx[AttentionParams::value] = AttentionParams::value;
wt_idx[AttentionParams::key] = AttentionParams::value;
* references, which lead to nasty bugs. This validation ensures that the
* tensors are not set mistakenly by verifying their unique names
*/
+ bool ret = true;
#ifdef DEBUG
+ std::function<bool(const Var_Grad *, bool)> matcher;
+
if (tensor_map.empty() || !tensor_map[inputs[0]->getName()]) {
auto filler = [this](const auto &vec) {
for (auto const &val : vec) {
filler(inputs);
filler(outputs);
filler(tensors);
- } else {
- auto matcher = [this](const Var_Grad *val, bool skip_grad = false) {
- if (val->getName().empty() ||
- (val->hasGradient() && val->getGradientName().empty()))
- return false;
-
- if (tensor_map.find(val->getName()) == tensor_map.end())
- /**
- * Disabled because of in-place input layer. Enable this later.
- * tensor_map[val->getName()] != val->getVariableRef().getData())
- */
- return false;
-
- if (skip_grad &&
- (tensor_map.find(val->getGradientName()) == tensor_map.end()))
- return false;
-
- return true;
- };
-
- auto matcher_w = [this, matcher](const std::vector<Weight *> &vec) {
- return std::all_of(vec.begin(), vec.end(), matcher);
- };
-
- auto matcher_vw = [this, matcher](const std::vector<Var_Grad *> &vec,
- bool skip_grad = false) {
- auto ret = true;
- for (auto const &val : vec)
- ret &= matcher(val, skip_grad);
- return ret;
- };
-
- /** match the tensor map from the next validations */
-
- auto ret = matcher_w(weights) & matcher_vw(tensors) &
- matcher_vw(outputs, skip_label);
- if (!skip_input)
- ret &= matcher_vw(inputs);
-
- return ret;
}
+
+ matcher = [this](const Var_Grad *val, bool skip_grad) -> bool {
+ if (val->getName().empty() ||
+ (val->hasGradient() && val->getGradientName().empty()))
+ return false;
+
+ if (tensor_map.find(val->getName()) == tensor_map.end())
+ /**
+ * Disabled because of in-place input layer. Enable this later.
+ * tensor_map[val->getName()] != val->getVariableRef().getData())
+ */
+ return false;
+
+ if (skip_grad &&
+ (tensor_map.find(val->getGradientName()) == tensor_map.end()))
+ return false;
+
+ return true;
+ };
+
+ auto matcher_w = [this, &matcher](const std::vector<Weight *> &vec) {
+ return std::all_of(vec.begin(), vec.end(),
+ std::bind(matcher, std::placeholders::_1, false));
+ };
+
+ auto matcher_vw = [this, &matcher](const std::vector<Var_Grad *> &vec,
+ bool skip_grad = false) {
+ return std::all_of(vec.begin(), vec.end(),
+ std::bind(matcher, std::placeholders::_1, skip_grad));
+ };
+
+ /** match the tensor map from the next validations */
+ ret =
+ matcher_w(weights) & matcher_vw(tensors) & matcher_vw(outputs, skip_label);
+ if (!skip_input)
+ ret &= matcher_vw(inputs);
#endif
- return true;
+ return ret;
}
} // namespace nntrainer
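The refactored `validate()` hoists `matcher` into a `std::function` so the helper lambdas can bind it outside the branch. A minimal, self-contained sketch of that predicate pattern, assuming hypothetical `Item`/`check_all` names (not nntrainer types):

```cpp
#include <algorithm>
#include <functional>
#include <string>
#include <vector>

struct Item {
  std::string name; // an empty name marks an invalid entry
};

int main() {
  // predicate declared as std::function so several helpers can reuse it
  std::function<bool(const Item *, bool)> matcher =
    [](const Item *item, bool strict) -> bool {
      if (item->name.empty())
        return false;
      // in strict mode require a longer name (stand-in for the gradient-name check)
      return !strict || item->name.size() > 1;
    };

  auto check_all = [&matcher](const std::vector<Item *> &vec, bool strict) {
    // bind fixes the second argument so std::all_of sees a unary predicate
    return std::all_of(vec.begin(), vec.end(),
                       std::bind(matcher, std::placeholders::_1, strict));
  };

  Item a{"query"}, b{"value"};
  std::vector<Item *> items{&a, &b};
  return check_all(items, true) ? 0 : 1; // exits 0 when every item passes
}
```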
auto const &all_dims = context.getInputDimensions();
auto const &query_dim = all_dims[AttentionParams::query];
auto const &value_dim = all_dims[AttentionParams::value];
+ auto const &state_dim = all_dims[AttentionParams::state];
wt_idx[AttentionParams::query] = AttentionParams::query;
wt_idx[AttentionParams::value] = AttentionParams::value;
tanh.setActiFunc(ActivationType::ACT_TANH);
sigmoid.setActiFunc(ActivationType::ACT_SIGMOID);
+ NNTR_THROW_IF(query_dim.width() != value_dim.width(), std::invalid_argument)
+ << "Query and Value dimension mismatch for layer " << context.getName();
+
NNTR_THROW_IF(std::get<props::Unit>(mol_props).empty(), std::invalid_argument)
<< "Number of units not provided for layer " << context.getName();
auto unit = std::get<props::Unit>(mol_props).get();
<< "MoL_K property not provided for layer " << context.getName();
auto mol_k = std::get<props::MoL_K>(mol_props).get();
+ NNTR_THROW_IF(mol_k != state_dim.width(), std::invalid_argument)
+ << "MoL_K property mismatches the provided state dimension for layer"
+ << context.getName();
+
auto &weight_regularizer =
std::get<props::WeightRegularizer>(*layer_impl_props);
auto &weight_regularizer_constant =
false, TensorLifespan::ITERATION_LIFESPAN);
TensorDim fc_proj_out_dim = fc_out_dim;
- fc_out_dim.width(fc_proj_w_dim.width());
+ fc_proj_out_dim.width(fc_proj_w_dim.width());
wt_idx[AttentionParams::fc_proj_out] = context.requestTensor(
fc_proj_out_dim, "fc_proj_out", Tensor::Initializer::NONE, false,
TensorLifespan::ITERATION_LIFESPAN);
/** reset helper state */
helper_exec = false;
- fc_out = query.dot(fc_w);
+ query.dot(fc_w, fc_out);
fc_out.add_i(fc_bias);
tanh.run_fn(fc_out, fc_tanh);
- fc_proj_out = fc_tanh.dot(fc_proj_w);
+ fc_tanh.dot(fc_proj_w, fc_proj_out);
Tensor kappa_src, beta_src, alpha_src;
kappa_src.copy_with_stride(
void MoLAttentionLayer::calcDerivativeHelper(RunLayerContext &context,
Tensor &dstate) {
+ /** @todo optimize temporary tensor usage here */
Tensor &query = context.getInput(wt_idx[AttentionParams::query]);
Tensor &value = context.getInput(wt_idx[AttentionParams::value]);
void MoLAttentionLayer::setProperty(const std::vector<std::string> &values) {
auto remain_props = loadProperties(values, mol_props);
- AttentionLayer::setProperty(remain_props);
+ LayerImpl::setProperty(remain_props);
}
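The forwarding hunk above switches from `fc_out = query.dot(fc_w)` to `query.dot(fc_w, fc_out)`, i.e. from assigning a freshly allocated result to writing into the tensor the run context already requested. A rough sketch of that output-parameter idea, using a hypothetical `MatBuf` type rather than nntrainer's `Tensor`:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Hypothetical row-major matrix buffer; not nntrainer's Tensor API.
struct MatBuf {
  size_t rows = 0, cols = 0;
  std::vector<float> data;
  MatBuf(size_t r, size_t c) : rows(r), cols(c), data(r * c, 0.0f) {}
};

// Returning by value allocates a new buffer on every call.
MatBuf dot_alloc(const MatBuf &a, const MatBuf &b) {
  MatBuf out(a.rows, b.cols);
  for (size_t i = 0; i < a.rows; ++i)
    for (size_t k = 0; k < a.cols; ++k)
      for (size_t j = 0; j < b.cols; ++j)
        out.data[i * b.cols + j] += a.data[i * a.cols + k] * b.data[k * b.cols + j];
  return out;
}

// Writing into a preallocated output keeps the existing buffer in use.
void dot_into(const MatBuf &a, const MatBuf &b, MatBuf &out) {
  std::fill(out.data.begin(), out.data.end(), 0.0f);
  for (size_t i = 0; i < a.rows; ++i)
    for (size_t k = 0; k < a.cols; ++k)
      for (size_t j = 0; j < b.cols; ++j)
        out.data[i * b.cols + j] += a.data[i * a.cols + k] * b.data[k * b.cols + j];
}

int main() {
  MatBuf a(2, 3), b(3, 2), out(2, 2);
  MatBuf tmp = dot_alloc(a, b); // allocates a fresh 2x2 result
  dot_into(a, b, out);          // reuses the preallocated 2x2 buffer
  return tmp.data.size() == out.data.size() ? 0 : 1;
}
```

The likely benefit of the second form is that the buffer planned by `requestTensor()` for `fc_out` keeps being reused across iterations instead of being shadowed by a temporary.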
void MoLAttentionLayer::setBatch(RunLayerContext &context, unsigned int batch) {
void MoLAttentionLayer::exportTo(Exporter &exporter,
const ExportMethods &method) const {
- AttentionLayer::exportTo(exporter, method);
LayerImpl::exportTo(exporter, method);
exporter.saveResult(mol_props, method, this);
}
* @class MoL Attention Layer
* @brief Mixture of Logistics Attention Layer
*/
-class MoLAttentionLayer : public AttentionLayer, public LayerImpl {
+class MoLAttentionLayer : public LayerImpl {
public:
/**
* @brief Constructor of MoL Attention Layer
Tensor in = m.transpose("1:0:2");
in.reshape({dim[2], dim[1], dim[0], dim[3]});
m.reshape(dim);
+ in.setName(m.getName() + "_trans");
return in;
}
// TODO: This transposed Input Tensor could be reused for backwarding
Tensor in = transposeTensor(input_);
- Tensor out = Tensor({ho_dim[2], 1, ho_dim[0], ho_dim[3]}, true);
+ Tensor out =
+ Tensor({ho_dim[2], 1, ho_dim[0], ho_dim[3]}, true,
+ Tensor::Initializer::NONE, context.getName() + ":inter_output");
TensorDim i_dim = in_dim;
i_dim.channel(1);
//
Tensor label_iter;
- Tensor in_iter =
- in.getSharedDataTensor(i_dim, i * in_dim.batch() * in_dim.width());
- Tensor out_iter =
- out.getSharedDataTensor(h_dim, i * ho_dim.batch() * ho_dim.width());
+ Tensor in_iter = in.getSharedDataTensor(
+ i_dim, i * in_dim.batch() * in_dim.width(), true, in.getName());
+ Tensor out_iter = out.getSharedDataTensor(
+ h_dim, i * ho_dim.batch() * ho_dim.width(), true, out.getName());
in_var.initializeVariable(in_iter);
out_var.initializeVariable(out_iter);
if (dist_layer->requireLabel() &&
context.isLabelAvailable(SINGLE_INOUT_IDX)) {
- label_iter =
- h_g.getSharedDataTensor(h_dim, i * ho_dim.batch() * ho_dim.width());
+ label_iter = h_g.getSharedDataTensor(
+ h_dim, i * ho_dim.batch() * ho_dim.width(), true, h_g.getName());
out_var.initializeGradient(label_iter);
}
fillTensorsFromContext(context);
for (unsigned int i = 0; i < der_dim[0]; ++i) {
- Tensor ret_iter =
- ret_.getSharedDataTensor(r_dim, i * r_dim.batch() * r_dim.width());
- Tensor in_iter =
- input_.getSharedDataTensor(r_dim, i * r_dim.batch() * r_dim.width());
- Tensor d_iter =
- derivative_.getSharedDataTensor(d_dim, i * d_dim.batch() * d_dim.width());
- Tensor hval_iter =
- hval_.getSharedDataTensor(d_dim, i * d_dim.batch() * d_dim.width());
+ Tensor ret_iter = ret_.getSharedDataTensor(
+ r_dim, i * r_dim.batch() * r_dim.width(), true, ret_.getName());
+ Tensor in_iter = input_.getSharedDataTensor(
+ r_dim, i * r_dim.batch() * r_dim.width(), true, input_.getName());
+ Tensor d_iter = derivative_.getSharedDataTensor(
+ d_dim, i * d_dim.batch() * d_dim.width(), true, derivative_.getName());
+ Tensor hval_iter = hval_.getSharedDataTensor(
+ d_dim, i * d_dim.batch() * d_dim.width(), true, hval_.getName());
in_var.initializeGradient(ret_iter);
in_var.initializeVariable(in_iter);
fillTensorsFromContext(context);
for (unsigned int i = 0; i < der_dim[0]; ++i) {
- Tensor in_iter =
- input_.getSharedDataTensor(i_dim, i * i_dim.batch() * i_dim.width());
- Tensor d_iter =
- derivative_.getSharedDataTensor(d_dim, i * d_dim.batch() * d_dim.width());
+ Tensor in_iter = input_.getSharedDataTensor(
+ i_dim, i * i_dim.batch() * i_dim.width(), true, input_.getName());
+ Tensor d_iter = derivative_.getSharedDataTensor(
+ d_dim, i * d_dim.batch() * d_dim.width(), true, derivative_.getName());
Var_Grad in_var(i_dim, Tensor::Initializer::NONE, true, false, "input");
Var_Grad out_var(d_dim, Tensor::Initializer::NONE, true, false, "output");
label_dims=[(3, 1, 6)],
name="mol_attention",
)
+
+ # inspect_file("mol_attention.nnmodelgolden")
input_dims=[(3, 2)],
label_dims=[(3, 2, 2)],
name="grucell_stacked",
+ )
# inspect_file("lstm_single.nnmodelgolden")
vg.reserve(dims.size());
for (auto &dim : dims) {
- vg.emplace_back(dim, Tensor::Initializer::NONE, true, true);
+ vg.emplace_back(dim, Tensor::Initializer::NONE, true, true, "golden");
sizeCheckedReadTensor(vg.back().getVariableRef(), file,
vg.back().getName());
}
auto lnode = nntrainer::createLayerNode(nntrainer::FullyConnectedLayer::type);
lnode->setProperty({"input_shape=1:1:1", "name=abc", "unit=4"});
EXPECT_NO_THROW(lnode->finalize());
- nntrainer::Var_Grad input =
- nntrainer::Var_Grad(nntrainer::TensorDim({1, 1, 1, 1}));
+ nntrainer::Var_Grad input = nntrainer::Var_Grad(
+ nntrainer::TensorDim({1, 1, 1, 1}), nntrainer::Tensor::Initializer::NONE,
+ true, false, "dummy");
lnode->configureRunContext({}, {&input}, {}, {});
EXPECT_ANY_THROW(lnode->finalize());
}
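With validation now keyed on unique tensor names, a default-constructed (unnamed) `Var_Grad` can no longer be tracked, which is presumably why the test above builds the dummy input with an explicit `"dummy"` name. A small sketch of name-keyed registration, assuming a hypothetical `Registry` type (not nntrainer's tensor_map):

```cpp
#include <cassert>
#include <string>
#include <unordered_map>

// Hypothetical registry keyed by tensor name.
struct Registry {
  std::unordered_map<std::string, const void *> entries;

  bool add(const std::string &name, const void *ptr) {
    if (name.empty())
      return false; // unnamed entries cannot be validated later
    return entries.emplace(name, ptr).second; // rejects duplicate names
  }

  bool contains(const std::string &name) const {
    return entries.find(name) != entries.end();
  }
};

int main() {
  Registry reg;
  int payload = 0;
  assert(reg.add("dummy", &payload)); // named entry registers fine
  assert(!reg.add("", &payload));     // empty name is rejected up front
  assert(reg.contains("dummy"));
  return 0;
}
```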
auto semantic_mol_attention = LayerSemanticsParamType(
nntrainer::createLayer<nntrainer::MoLAttentionLayer>,
- nntrainer::MoLAttentionLayer::type, {"unit=5", "mol_k=4"}, 0, false, 3);
+ nntrainer::MoLAttentionLayer::type, {"unit=5", "mol_k=1"}, 0, false, 3);
INSTANTIATE_TEST_CASE_P(MoLAttention, LayerSemantics,
::testing::Values(semantic_mol_attention));