beta.setZero();
setParamSize(4);
- paramsAt(0) = {std::move(mu), Tensor(dim), "BN:moving_average"};
- paramsAt(1) = {std::move(var), Tensor(dim), "BN:moving_variance"};
- paramsAt(2) = {std::move(gamma), Tensor(dim), "BN:gamma"};
- paramsAt(3) = {std::move(beta), Tensor(dim), "BN:beta"};
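+ // gradient tensors are allocated with each parameter's own dimensions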
+ paramsAt(0) = {std::move(mu), Tensor(mu.getDim()), "BN:moving_average"};
+ paramsAt(1) = {std::move(var), Tensor(var.getDim()), "BN:moving_variance"};
+ paramsAt(2) = {std::move(gamma), Tensor(gamma.getDim()), "BN:gamma"};
+ paramsAt(3) = {std::move(beta), Tensor(beta.getDim()), "BN:beta"};
return status;
}
delK = delK.chain()
.applyIf(this->isWeightDecayL2Norm(), _LIFT(add_i), filter,
weight_decay.lambda)
- .run();
+ .run().average(0);
+ }
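+ // the bias gradients are averaged over the batch axis as well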
+ for (unsigned int i = filter_size; i < 2 * filter_size; ++i) {
+ Tensor &delBias = paramsAt(i).grad;
+ delBias = delBias.average(0);
}
opt.apply_gradients(params, param_size, iteration);
}
setParamSize(2);
- paramsAt(0) = {std::move(weight), Tensor(dim), "FC:weight"};
- paramsAt(1) = {std::move(bias), Tensor(1, unit), "FC:bias"};
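+ // gradient tensors take the same dimensions as the weight and bias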
+ paramsAt(0) = {std::move(weight), Tensor(weight.getDim()), "FC:weight"};
+ paramsAt(1) = {std::move(bias), Tensor(bias.getDim()), "FC:bias"};
return status;
}
Tensor &djdb = paramsAt(bias_idx).grad;
Tensor ret = derivative.dot(weight.transpose("0:2:1"));
- djdb = derivative;
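+ // average the bias gradient over the batch (axis 0)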
+ djdb = derivative.average(0);
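+ // the weight gradient is also averaged over the batch, after the weight-decay term is added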
djdw = input.chain()
.transpose("0:2:1")
.dot(derivative)
.applyIf(this->isWeightDecayL2Norm(), _LIFT(add_i), weight,
weight_decay.lambda)
- .run();
+ .run().average(0);
if (trainable) {
opt.apply_gradients(params, param_size, iteration);
UpdatableParam &param = param_data[i];
Tensor &x = param.weight;
- /// @fixme: #280 and use const Tensor &x_grad once fixed.
- /// @note: that current implementation does not update grad since updating
- /// grad changes it's dimension
- Tensor x_grad = param.grad;
- x_grad = x_grad.average(0);
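+ // each layer now averages its gradient over the batch, so grad can be read through a const reference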
+ const Tensor &x_grad = param.grad;
switch (type) {
case OptType::sgd:
x.add_i(x_grad, -ll);