Rework bn forward & backward
authorJihoon Lee <jhoon.it.lee@samsung.com>
Wed, 24 Jun 2020 01:49:51 +0000 (10:49 +0900)
committerJijoong Moon <jijoong.moon@samsung.com>
Thu, 9 Jul 2020 02:54:04 +0000 (11:54 +0900)
Rework the bn layer forward & backward passes and fix a few bugs.

This patch only covers the training pass; the inference path is not yet implemented.
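
For reference, the training-time forward pass reworked here follows the standard
batch-normalization formulation (a sketch of the intended math; epsilon comes from the
layer property and momentum is currently hard-coded to 0.9 in this patch):

```math
\mu_B = \frac{1}{N}\sum_{i=1}^{N} x_i,\qquad
\sigma_B^2 = \frac{1}{N}\sum_{i=1}^{N}(x_i-\mu_B)^2,\qquad
\hat{x}_i = \frac{x_i-\mu_B}{\sqrt{\sigma_B^2+\epsilon}},\qquad
y_i = \gamma\,\hat{x}_i + \beta
```

The moving statistics kept for the inference path are updated as
mu = momentum * mu + (1 - momentum) * cmu, and likewise for var.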

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Jihoon Lee <jhoon.it.lee@samsung.com>
nntrainer/include/bn_layer.h
nntrainer/src/bn_layer.cpp
packaging/unittest_layers.tar.gz
test/input_gen/genInput.py
test/unittest/unittest_nntrainer_layers.cpp

index e3f3e5e1a98d8cdc77b5f90a299a8aae9a4a408d..83626fc3be3b35bfba718812e07bb43dcdc240eb 100644 (file)
@@ -121,10 +121,19 @@ public:
 private:
   Tensor weight;
   Tensor bias;
-  Tensor mu;
-  Tensor var;
+
+  Tensor mu;  /**< moving mean used for inference.
+                   momentum * mu + (1 - momentum) * mu
+                   of the current batch is used */
+  Tensor var; /**< moving variance used for inference.
+                   momentum * var + (1 - momentum) * var
+                   of the current batch is used */
+
+  Tensor cvar; /**< training variance saved in bn_layer::forwarding and used in
+                    bn_layer::backwarding */
   Tensor gamma;
   Tensor beta;
+  Tensor x_normalized;
   float epsilon;
 };
 } // namespace nntrainer
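
The new members split into per-batch statistics cached between passes (cvar,
x_normalized) and moving statistics kept for inference (mu, var). A minimal NumPy
sketch of how the forwarding below uses them (illustration only, not the layer API):

```python
import numpy as np

def bn_forward_sketch(x, gamma, beta, mu, var, epsilon=0.001, momentum=0.9):
    """Mirror of the training-time forwarding; x has shape (batch, features)."""
    cmu = x.mean(axis=0)                             # mean of the current batch
    deviation = x - cmu
    cvar = (deviation ** 2).mean(axis=0) + epsilon   # cached for backwarding
    x_normalized = deviation / np.sqrt(cvar)         # cached for backwarding

    # exponential moving average kept for the inference path
    mu = momentum * mu + (1 - momentum) * cmu
    var = momentum * var + (1 - momentum) * cvar

    return gamma * x_normalized + beta, (x_normalized, cvar), (mu, var)
```
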
index afa6b1f4cbad60918a8301d150a1c43802879284..a076b3d63fd7474a46d4b272d3490cb3dbd93a56 100644 (file)
@@ -24,6 +24,7 @@
 #include <assert.h>
 #include <bn_layer.h>
 #include <layer.h>
+#include <lazy_tensor.h>
 #include <nntrainer_error.h>
 #include <nntrainer_log.h>
 #include <parse_util.h>
 
 namespace nntrainer {
 
+/// @todo add channel-wise bn for convolutional layers.
 int BatchNormalizationLayer::initialize(bool last) {
   int status = ML_ERROR_NONE;
 
   dim = input_dim;
-  output_dim = dim;
+  dim = input_dim;
+  dim.batch(1);
+  output_dim = input_dim;
 
-  this->gamma = Tensor(dim.channel(), dim.batch(), dim.width());
-  this->beta = Tensor(dim.channel(), dim.batch(), dim.width());
-  beta.setZero();
+  this->mu = Tensor(dim);
+  this->var = Tensor(dim);
+  this->gamma = Tensor(dim);
+  this->beta = Tensor(dim);
+
+  mu.setZero();
+  var.setValue(1);
   gamma.setZero();
+  beta.setZero();
+
+  weights.clear();
+  weights.push_back(gamma);
+  weights.push_back(beta);
 
   return status;
 }
@@ -48,8 +61,7 @@ int BatchNormalizationLayer::initialize(bool last) {
 int BatchNormalizationLayer::setOptimizer(Optimizer &opt) {
   this->opt.setType(opt.getType());
   this->opt.setOptParam(opt.getOptParam());
-
-  this->epsilon = 0.0;
+  this->epsilon = epsilon;
   return this->opt.initialize(dim, false);
 }
 
@@ -63,7 +75,6 @@ int BatchNormalizationLayer::setProperty(std::vector<std::string> values) {
     NN_RETURN_STATUS();
 
     unsigned int type = parseLayerProperty(key);
-
     switch (static_cast<PropertyType>(type)) {
     case PropertyType::epsilon:
       status = setFloat(epsilon, value);
@@ -79,70 +90,86 @@ int BatchNormalizationLayer::setProperty(std::vector<std::string> values) {
 }
 
 Tensor BatchNormalizationLayer::forwarding(Tensor in, int &status) {
-  Tensor temp;
-  assert(dim.batch() > 0);
-  hidden = in;
 
-  mu = in.sum(0).multiply(1.0 / dim.batch());
+  if (trainable) {
+    Tensor deviation;
+    this->input = in;
+
+    /// mu / var of the current batch
+    Tensor cmu;
 
-  temp = in.subtract(mu);
+    cmu = in.average(0);
 
-  var = temp.multiply(temp).sum(0).multiply(1.0 / dim.batch());
+    deviation = in.subtract(cmu);
 
-  Tensor hath = temp.divide(var.add(0.001).apply(sqrtFloat));
+    this->cvar = deviation.chain()
+                   .multiply_i(deviation)
+                   .sum(0)
+                   .multiply_i(1.0 / input_dim.batch())
+                   .add_i(epsilon)
+                   .run();
 
-  hidden = hath;
+    /// @todo replace the hard-coded momentum with a layer parameter
+    float momentum = 0.9;
+    this->mu.multiply_i(momentum);
+    this->mu.add_i(cmu, 1 - momentum);
+    this->var.multiply_i(momentum);
+    this->var.add_i(cvar, 1 - momentum);
 
-  Tensor ret = hath.multiply(gamma).add(beta);
+    this->x_normalized = deviation.divide(cvar.apply(sqrtFloat));
 
-  status = ML_ERROR_NONE;
-  return ret;
+    this->hidden = x_normalized.chain().multiply_i(gamma).add_i(beta).run();
+
+    status = ML_ERROR_NONE;
+  } else {
+    /// NYI
+    status = ML_ERROR_NOT_SUPPORTED;
+    throw std::runtime_error("not_yet_implemented");
+  }
+  return hidden;
 }
 
-Tensor BatchNormalizationLayer::backwarding(Tensor derivative, int iteration) {
+Tensor BatchNormalizationLayer::backwarding(Tensor dy, int iteration) {
   Tensor dbeta;
   Tensor dgamma;
-  assert(dim.batch() > 0);
+  Tensor dx_normalized;
 
-  Tensor hath = hidden;
-  Tensor dy = derivative.multiply(hath.multiply(gamma).add(beta));
+  Tensor dx;
 
+  int batch = dy.batch();
+
+  dgamma = x_normalized.multiply(dy).sum(0);
   dbeta = dy.sum(0);
-  dgamma = (input.subtract(mu)
-              .divide(var.add(0.001).apply(sqrtFloat))
-              .multiply(dy)
-              .sum(0));
-
-  Tensor Temp =
-    (dy.multiply(dim.batch()).subtract(dy.sum(0)))
-      .subtract(input.subtract(mu)
-                  .divide(var.add(0.001))
-                  .multiply(dy.multiply(input.subtract(mu)).sum(0)));
-  Tensor dh = Temp.multiply(1.0 / dim.batch())
-                .multiply(var.add(0.001).apply(sqrtFloat))
-                .multiply(gamma);
-
-  float ll = opt.getLearningRate();
-  if (opt.getDecaySteps() != -1) {
-    ll = ll * pow(opt.getDecayRate(), (iteration / opt.getDecaySteps()));
-  }
 
-  gamma = gamma.subtract(dgamma.multiply(ll));
-  beta = beta.subtract(dbeta.multiply(ll));
+  dx_normalized = dy.multiply(gamma);
+
+  dx = dx_normalized.chain()
+         .multiply_i(batch)
+         .subtract_i(dx_normalized.sum(0))
+         .subtract_i(
+           x_normalized.multiply(dx_normalized.multiply(x_normalized).sum(0)))
+         .divide_i(cvar.multiply(batch))
+         .run();
+
+  gradients.clear();
+  gradients.push_back(dgamma);
+  gradients.push_back(dbeta);
+
+  opt.apply_gradients(weights, gradients, iteration);
 
-  return dh;
+  return dx;
 }
 
 void BatchNormalizationLayer::read(std::ifstream &file) {
-  file.read((char *)&mu, sizeof(float));
-  file.read((char *)&var, sizeof(float));
+  mu.read(file);
+  var.read(file);
   gamma.read(file);
   beta.read(file);
 }
 
 void BatchNormalizationLayer::save(std::ofstream &file) {
-  file.write((char *)&mu, sizeof(float));
-  file.write((char *)&var, sizeof(float));
+  mu.save(file);
+  var.save(file);
   gamma.save(file);
   beta.save(file);
 }
@@ -159,8 +186,9 @@ void BatchNormalizationLayer::copy(std::shared_ptr<Layer> l) {
   this->hidden.copy(from->hidden);
   this->weight.copy(from->weight);
   this->bias.copy(from->bias);
-  this->mu = from->mu;
-  this->var = from->var;
+  this->mu.copy(from->mu);
+  this->var.copy(from->var);
+  this->cvar.copy(from->cvar);
   this->gamma.copy(from->gamma);
   this->beta.copy(from->beta);
 }
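
Written out, the chained tensor operations in the new backwarding compute, with N the
batch size, x̂ the cached x_normalized, and c the cached cvar (batch variance plus
epsilon):

```math
d\gamma = \sum_{i=1}^{N} dy_i\,\hat{x}_i,\qquad
d\beta = \sum_{i=1}^{N} dy_i,\qquad
d\hat{x}_i = \gamma\,dy_i
```

```math
dx_i = \frac{1}{N\,c}\Bigl(N\,d\hat{x}_i - \sum_{j=1}^{N} d\hat{x}_j
       - \hat{x}_i \sum_{j=1}^{N} d\hat{x}_j\,\hat{x}_j\Bigr)
```
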
index 40acd15a10ad36ffa179fc45bbfc0eebd88e8464..1610013a1ebf9a1e15f42062f73bc6a279e6934d 100644 (file)
Binary files a/packaging/unittest_layers.tar.gz and b/packaging/unittest_layers.tar.gz differ
index 36f9e2ddbe1d780a49483c43204dd7f329eb5655..070362e8711569472fc082daa448d9fba09c156a 100755 (executable)
@@ -33,18 +33,22 @@ np.random.seed(SEED)
 ##
 # @brief save data into file with filename
 # @param[in] data The data to be saved
-def save(filename, data):
+def save(filename, *data):
     if os.path.isfile(filename):
         os.remove(filename)
 
     with open(filename, 'ab') as outfile:
-        np.array(data, dtype=np.float32).tofile(outfile)
-    print(data.shape, " data is generated")
+        for item in data:
+          np.array(item, dtype=np.float32).tofile(outfile)
+          try:
+            print(item.shape, " data is generated")
+          except:
+            pass
 
 ##
 # @brief generate random tensor
-def gen_tensor(shape, dtype=None):
-  return np.random.random_sample(input_shape)
+def gen_tensor(shape, dtype=dtypes.float32):
+  return np.random.random_sample(shape)
 
 ##
 # @brief generate random data and save
@@ -198,41 +202,52 @@ def fc_tf(x, kernel, label, bias, activation, train=False, loss='mse', opt='sgd'
 # tested with tf 1.14.0
 # @param[in] x input
 # @param[in] trainable
-# @return bn output, [updated_gamma, updated_beta], grad_result (0. dx / 1. gamma / 2. beta / 3. mean / 4. variance)
+# @return input_variables, output_variables (0. bn output / 1. updated gamma / 2. updated beta), grad_result (0. dx / 1. gamma / 2. beta / 3. mean / 4. variance)
 # for updated_gamma, updated_beta, x <- x - grad is used for easier calculation
-def bn_tf(x, trainable=False):
+def bn_tf(x, *, trainable=True, init_beta=gen_tensor, init_gamma=gen_tensor, axis=[1, 2, 3]):
     tf.compat.v1.reset_default_graph()
     tf_input = tf.compat.v1.placeholder(
         dtype=dtypes.float32, shape=x.shape, name='input')
 
     bnlayer = tf.keras.layers.BatchNormalization(
-        axis=0,
+        axis=axis,
         trainable=trainable,
+        momentum=1.0,
         gamma_initializer=gen_tensor,
         beta_initializer=gen_tensor)(tf_input)
 
-    bn_variables = tf.compat.v1.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
+    bn_variables = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES,
                                                scope='batch_normalization')
+
     input_variables = [tf_input] + bn_variables
 
     grad = tf.gradients(bnlayer, input_variables)
 
+    f_dict = {tf_input: x, tf.keras.backend.learning_phase(): trainable}
+
     with tf.compat.v1.Session() as sess:
+      with tf.compat.v1.variable_scope('bn'):
         sess.run(tf.compat.v1.global_variables_initializer())
-        bn_result = sess.run(bnlayer, feed_dict={tf_input: x})
-        grad_result = sess.run(grad, feed_dict={tf_input: x})
+
+        old_var = sess.run(input_variables, feed_dict=f_dict)
+        bn_result = sess.run(bnlayer, feed_dict=f_dict)
+        grad_result = sess.run(grad, feed_dict=f_dict)
+
         updated_gamma = sess.run(input_variables[1] - grad_result[1])
-        updated_beta = sess.run(input_variables[1] - grad_result[2])
+        updated_beta = sess.run(input_variables[2] - grad_result[2])
+
+        output_variables = [bn_result, updated_gamma, updated_beta]
 
     if DEBUG:
-        print(x[0], bn_result[0])
-        print("updated_gamma: %s" % updated_gamma)
-        print("updated_beta: %s" % updated_beta)
-        for item, input_variable in zip(grad_result, input_variables):
-            print(input_variable.name)
-            print(item[0])
+        print("======================================")
+        print("Input:\n %s\n Output:\n %s"  % (x[0], bn_result[0]))
+        print("dx: %s" % grad_result[0][0][0])
+        print("gradient of gamma: %s" % grad_result[1][0][0], grad_result[1].shape)
+        print("gradient of beta: %s" % grad_result[2][0][0], grad_result[2].shape)
+        print("======================================")
+
+    return old_var, output_variables, grad_result
 
-    return bn_result, [updated_gamma, updated_beta], grad_result
 
 def gen_test_case_conv(i_b, i_c, i_h, i_w, k_c, k_h, k_w, padding, stride, bias, base_name):
     x=gen_input(base_name+"conv2DLayer.in", [i_b, i_c, i_h, i_w])
@@ -265,8 +280,18 @@ def gen_test_case_fc(input_shape, kernel_shape, base_name):
     golden_fc = fc_tf(input_data, kernel, None, bias, activation=tf.nn.softmax)
     save(base_name + "goldenFCResultSoftmax.out", golden_fc[0])
 
-def get_test_case_bn(input_shape, training=False):
-    pass
+def gen_test_case_bn(input_shape, base_name, training=True):
+    input_data = gen_input(base_name + "BNLayerInput.in", input_shape)
+
+    input_variables, output_variables, grad = bn_tf(input_data)
+
+    # mu / var / gamma / beta
+    save(base_name + "BNLayerWeights.in", input_variables[3], input_variables[4], input_variables[1], input_variables[2])
+    save(base_name + "goldenBNResultForward.out", output_variables[0])
+    # todo: change the 0 placeholders to the initial moving avg / std in case of training
+    save(base_name + "goldenBNLayerAfterUpdate.out", 0, 0, output_variables[1], output_variables[2])
+    save(base_name + "goldenBNLayerBackwardDx.out", grad[0])
+
 
 if __name__ == "__main__":
     target = int(sys.argv[1])
@@ -304,3 +329,7 @@ if __name__ == "__main__":
         gen_test_case_fc(input_shape = [3, 1, 1, 12],
                 kernel_shape = [12, 15],
                 base_name = "test_1_")
+
+# Bn layer unit test case:
+    if target == 5:
+        gen_test_case_bn(input_shape = [3, 1, 4, 5], base_name = "test_5_")
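
As a usage note, the variadic save() above simply concatenates raw float32 buffers, so
the golden files can be read back field by field in the order they were written. A small
sketch for the weight file of the target-5 case (1 * 4 * 5 = 20 values per tensor for the
3:1:4:5 input, assuming the generated file is in the working directory):

```python
import numpy as np

# test_5_BNLayerWeights.in was written as mu / var / gamma / beta, in that order.
with open("test_5_BNLayerWeights.in", "rb") as f:
    mu = np.fromfile(f, dtype=np.float32, count=20)
    var = np.fromfile(f, dtype=np.float32, count=20)
    gamma = np.fromfile(f, dtype=np.float32, count=20)
    beta = np.fromfile(f, dtype=np.float32, count=20)
```
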
index a0b5dae222e4606e949f554fd2ec4d33aa3dae8c..c09f7d0484b54e6203a1ded2aed2e5edfce5cdf7 100644 (file)
@@ -429,6 +429,84 @@ TEST(nntrainer_BatchNormalizationLayer, checkValidation_01_p) {
   EXPECT_EQ(status, ML_ERROR_NONE);
 }
 
+class nntrainer_batchNormalizationLayer_TFmatch : public ::testing::Test {
+protected:
+  nntrainer_batchNormalizationLayer_TFmatch() {}
+
+  virtual void SetUp() {
+    std::vector<std::string> input_str;
+    input_str.push_back("input_shape=3:1:4:5");
+    input_str.push_back("epsilon=0.001");
+
+    nntrainer::Optimizer opt;
+    nntrainer::OptParam p;
+    p.learning_rate = 1;
+
+    status = opt.setType(nntrainer::OptType::sgd);
+    ASSERT_EQ(status, ML_ERROR_NONE);
+
+    status = opt.setOptParam(p);
+    ASSERT_EQ(status, ML_ERROR_NONE);
+
+    status = layer.setOptimizer(opt);
+    ASSERT_EQ(status, ML_ERROR_NONE);
+
+    status = layer.setProperty(input_str);
+    ASSERT_EQ(status, ML_ERROR_NONE);
+
+    status = layer.initialize(false);
+    ASSERT_EQ(status, ML_ERROR_NONE);
+
+    in = nntrainer::Tensor(3, 1, 4, 5);
+    expected = nntrainer::Tensor(3, 1, 4, 5);
+
+    loadFile("test_5_BNLayerInput.in", in);
+    loadFile("test_5_BNLayerWeights.in", layer);
+  }
+
+  void matchOutput(const nntrainer::Tensor &result, const char *path) {
+    loadFile(path, expected);
+    const float *out_ptr, *golden;
+
+    golden = expected.getData();
+    out_ptr = result.getData();
+
+    for (size_t i = 0; i < result.length(); ++i) {
+      EXPECT_NEAR(out_ptr[i], golden[i], tolerance);
+    }
+  }
+
+  int status;
+  nntrainer::BatchNormalizationLayer layer;
+  nntrainer::Tensor expected;
+  nntrainer::Tensor in;
+
+private:
+  template <typename T> void loadFile(const char *filename, T &t) {
+    std::ifstream file(filename);
+    if (!file.good()) {
+      throw std::runtime_error("filename is wrong");
+    }
+    t.read(file);
+    file.close();
+  }
+};
+
+TEST_F(nntrainer_batchNormalizationLayer_TFmatch,
+       forward_backward_training_01_p) {
+  int status = ML_ERROR_NONE;
+  layer.setTrainable(true);
+  nntrainer::Tensor forward_result = layer.forwarding(in, status);
+  EXPECT_EQ(status, ML_ERROR_NONE);
+
+  matchOutput(forward_result, "test_5_goldenBNResultForward.out");
+
+  nntrainer::Tensor backward_result =
+    layer.backwarding(constant(1.0, 3, 1, 4, 5), 1);
+
+  matchOutput(backward_result, "test_5_goldenBNLayerBackwardDx.out");
+}
+
 /**
  * @brief Convolution 2D Layer
  */