[dataset] Add file producer
authorJihoon Lee <jhoon.it.lee@samsung.com>
Mon, 12 Jul 2021 12:28:12 +0000 (21:28 +0900)
committerJijoong Moon <jijoong.moon@samsung.com>
Tue, 3 Aug 2021 01:35:39 +0000 (10:35 +0900)
This patch adds a file producer which abstracts reading a raw file. Also,
this component makes sure that the file accepts various kinds of input
shapes.

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Jihoon Lee <jhoon.it.lee@samsung.com>
16 files changed:
jni/Android.mk
nntrainer/dataset/data_producers.h
nntrainer/dataset/databuffer.h
nntrainer/dataset/func_data_producer.cpp
nntrainer/dataset/func_data_producer.h
nntrainer/dataset/meson.build
nntrainer/dataset/random_data_producers.cpp
nntrainer/dataset/random_data_producers.h
nntrainer/dataset/raw_file_data_producer.cpp [new file with mode: 0644]
nntrainer/dataset/raw_file_data_producer.h [new file with mode: 0644]
nntrainer/layers/common_properties.cpp
nntrainer/layers/common_properties.h
test/unittest/datasets/data_producer_common_tests.cpp
test/unittest/datasets/meson.build
test/unittest/datasets/unittest_func_data_producer.cpp [moved from test/unittest/datasets/unittest_func_data_producers.cpp with 98% similarity]
test/unittest/datasets/unittest_raw_file_data_producer.cpp [new file with mode: 0644]

index 4862f35..d9dc480 100644 (file)
@@ -131,6 +131,7 @@ NNTRAINER_SRCS := $(NNTRAINER_ROOT)/nntrainer/models/neuralnet.cpp \
                   $(NNTRAINER_ROOT)/nntrainer/dataset/databuffer_file.cpp \
                   $(NNTRAINER_ROOT)/nntrainer/dataset/func_data_producer.cpp \
                   $(NNTRAINER_ROOT)/nntrainer/dataset/random_data_producers.cpp \
+                  $(NNTRAINER_ROOT)/nntrainer/dataset/raw_file_data_producer.cpp \
                   $(NNTRAINER_ROOT)/nntrainer/tensor/tensor.cpp \
                   $(NNTRAINER_ROOT)/nntrainer/tensor/lazy_tensor.cpp \
                   $(NNTRAINER_ROOT)/nntrainer/tensor/manager.cpp \
index 9e578f5..5345f2a 100644 (file)
@@ -47,7 +47,7 @@ public:
    * iteration, at the end of the iteration, it's responsibility of @a this to
    * shuffle.
    */
-  using Gernerator = std::function<Iteration(void)>;
+  using Generator = std::function<Iteration(void)>;
 
   constexpr inline static unsigned long long SIZE_UNDEFINED =
     std::numeric_limits<unsigned long long>::max();
@@ -81,8 +81,8 @@ public:
    * call
    *
    */
-  virtual Gernerator finalize(const std::vector<TensorDim> &input_dims,
-                              const std::vector<TensorDim> &label_dims) = 0;
+  virtual Generator finalize(const std::vector<TensorDim> &input_dims,
+                             const std::vector<TensorDim> &label_dims) = 0;
 
   /**
    * @brief get size of total dataset given input_dims, label_dims, if size
index 64c2d1a..7e29e28 100644 (file)
 #ifdef __cplusplus
 
 #include <condition_variable>
+#include <future>
 #include <memory>
 #include <mutex>
 #include <random>
 #include <thread>
 #include <vector>
 
+#include <data_producers.h>
 #include <dataset.h>
 #include <tensor_dim.h>
 
index 0b723e7..31b4ab4 100644 (file)
@@ -32,7 +32,7 @@ void FuncDataProducer::setProperty(const std::vector<std::string> &properties) {
     << "properties is not empty, size: " << properties.size();
 }
 
-DataProducer::Gernerator
+DataProducer::Generator
 FuncDataProducer::finalize(const std::vector<TensorDim> &input_dims,
                            const std::vector<TensorDim> &label_dims) {
   NNTR_THROW_IF(!this->cb, std::invalid_argument)
index b2b22c8..10fc2dc 100644 (file)
@@ -62,7 +62,7 @@ public:
    * @copydoc DataProducer::finalize(const std::vector<TensorDim>, const
    * std::vector<TensorDim>)
    */
-  virtual DataProducer::Gernerator
+  virtual DataProducer::Generator
   finalize(const std::vector<TensorDim> &input_dims,
            const std::vector<TensorDim> &label_dims) override;
 
index 9d58867..e3cebe7 100644 (file)
@@ -5,6 +5,7 @@ dataset_sources = [
   'databuffer_func.cpp',
   'random_data_producers.cpp',
   'func_data_producer.cpp',
+  'raw_file_data_producer.cpp'
 ]
 
 dataset_headers = [
index 94b8900..b8d0b05 100644 (file)
@@ -89,7 +89,7 @@ void RandomDataOneHotProducer::setProperty(
     << "There are unparsed properties, size: " << left.size();
 }
 
-DataProducer::Gernerator
+DataProducer::Generator
 RandomDataOneHotProducer::finalize(const std::vector<TensorDim> &input_dims,
                                    const std::vector<TensorDim> &label_dims) {
   /** check if the given producer is ready to finalize */
@@ -129,7 +129,7 @@ RandomDataOneHotProducer::finalize(const std::vector<TensorDim> &input_dims,
   return [rng, sz, input_dims, label_dims, min_ = min_.get(), max_ = max_.get(),
           current_iteration = 0ULL,
           label_chooser = std::move(label_chooser_)]() mutable {
-    if (current_iteration++ == sz) {
+    if (current_iteration++ == sz / input_dims[0].batch()) {
       current_iteration = 0;
       return DataProducer::Iteration(true, {}, {});
     }
index 0faaa39..872107c 100644 (file)
@@ -68,7 +68,7 @@ public:
    * @copydoc DataProducer::finalize(const std::vector<TensorDim>, const
    * std::vector<TensorDim>)
    */
-  virtual DataProducer::Gernerator
+  virtual DataProducer::Generator
   finalize(const std::vector<TensorDim> &input_dims,
            const std::vector<TensorDim> &label_dims) override;
 
diff --git a/nntrainer/dataset/raw_file_data_producer.cpp b/nntrainer/dataset/raw_file_data_producer.cpp
new file mode 100644 (file)
index 0000000..be396ad
--- /dev/null
@@ -0,0 +1,155 @@
+
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2021 Jihoon Lee <jhoon.it.lee@samsung.com>
+ *
+ * @file   raw_file_data_producer.cpp
+ * @date   12 July 2021
+ * @brief  This file contains raw file data producers, reading from a file
+ * @see    https://github.com/nnstreamer/nntrainer
+ * @author Jihoon Lee <jhoon.it.lee@samsung.com>
+ * @bug    No known bugs except for NYI items
+ *
+ */
+
+#include <raw_file_data_producer.h>
+
+#include <memory>
+#include <numeric>
+#include <random>
+#include <vector>
+
+#include <common_properties.h>
+#include <nntrainer_error.h>
+#include <node_exporter.h>
+#include <util_func.h>
+
+namespace nntrainer {
+
+/**
+ * @brief Construct the producer with a value-initialized property tuple.
+ */
+RawFileDataProducer::RawFileDataProducer() :
+  raw_file_props(std::make_unique<PropTypes>()) {}
+/**
+ * @brief Destroy the producer; defined in this translation unit rather than
+ * in the header, which only forward declares props::FilePath.
+ */
+RawFileDataProducer::~RawFileDataProducer() = default;
+
+/**
+ * @brief Report the type key of this producer.
+ *
+ * @return const std::string copy of RawFileDataProducer::type
+ */
+const std::string RawFileDataProducer::getType() const { return type; }
+
+unsigned long long
+RawFileDataProducer::size(const std::vector<TensorDim> &input_dims,
+                          const std::vector<TensorDim> &label_dims) const {
+  auto size_accumulator = [](const unsigned int &a, const TensorDim &b) {
+    return a + b.getFeatureLen();
+  };
+
+  auto sample_size =
+    std::accumulate(input_dims.begin(), input_dims.end(), 0u, size_accumulator);
+  sample_size = std::accumulate(label_dims.begin(), label_dims.end(),
+                                sample_size, size_accumulator);
+
+  auto path_prop = std::get<props::FilePath>(*raw_file_props);
+  auto file_size = path_prop.file_size();
+
+  /// checking alignment is a good way to make check if a file is valid,
+  /// unfortunately, our test dataset does not have this property
+  /// (trainingSet.dat, valSet.dat, testSet.dat) after checking, we can
+  /// uncomment below line.
+  // NNTR_THROW_IF((file_size % sample_size * RawFileDataProducer::pixel_size !=
+  // 0),
+  //               std::invalid_argument)
+  //   << " Given file does not align with the given sample size, sample size: "
+  //   << sample_size << " file_size: " << file_size;
+
+  return path_prop.file_size() /
+         (sample_size * RawFileDataProducer::pixel_size);
+}
+
+/**
+ * @brief Load the given properties into the property tuple of this producer.
+ *
+ * @param properties properties to load, anything loadProperties() hands back
+ * as not consumed is reported as std::invalid_argument
+ */
+void RawFileDataProducer::setProperty(
+  const std::vector<std::string> &properties) {
+  const auto unparsed = loadProperties(properties, *raw_file_props);
+  const bool has_leftover = !unparsed.empty();
+  NNTR_THROW_IF(has_leftover, std::invalid_argument)
+    << "There is unparsed properties, size: " << unparsed.size();
+}
+
+DataProducer::Generator
+RawFileDataProducer::finalize(const std::vector<TensorDim> &input_dims,
+                              const std::vector<TensorDim> &label_dims) {
+
+  /****************** Validation ****************/
+  auto sz = size(input_dims, label_dims);
+  auto batch = input_dims[0].batch();
+
+  NNTR_THROW_IF(sz < batch, std::invalid_argument)
+    << "calculated sample size is less than a batch";
+
+  auto path_prop = std::get<props::FilePath>(*raw_file_props);
+
+  auto size_accumulator = [](const unsigned int &a, const TensorDim &b) {
+    return a + b.getFeatureLen();
+  };
+
+  auto sample_size =
+    std::accumulate(input_dims.begin(), input_dims.end(), 0u, size_accumulator);
+  sample_size = std::accumulate(label_dims.begin(), label_dims.end(),
+                                sample_size, size_accumulator);
+
+  /// below works when checking alignment is correct
+  // auto sample_size = path_prop.file_size() / (sz *
+  // RawFileDataProducer::pixel_size);
+
+  /****************** Prepare states ****************/
+  std::mt19937 rng_;
+  rng_.seed(getSeed());
+  auto idxes_ = std::vector<unsigned int>();
+  idxes_.reserve(sz);
+  /// idxes point to the file position in bytes where a sample starts
+  std::generate_n(std::back_inserter(idxes_), sz,
+                  [sample_size, current = 0ULL]() mutable {
+                    auto c = current;
+                    current += sample_size * RawFileDataProducer::pixel_size;
+                    return c;
+                  });
+  /// @todo remove shuffle from here as we are migrating this to element wise
+  /// operator
+  std::shuffle(idxes_.begin(), idxes_.end(), rng_);
+
+  auto file =
+    std::make_shared<std::ifstream>(path_prop.get(), std::ios::binary);
+  auto iter = idxes_.begin();
+
+  return [batch, input_dims, label_dims, rng = rng_, idxes = std::move(idxes_),
+          file, iter]() mutable -> DataProducer::Iteration {
+    if (std::distance(iter, idxes.end()) < static_cast<std::ptrdiff_t>(batch)) {
+      std::shuffle(idxes.begin(), idxes.end(), rng);
+      iter = idxes.begin();
+      return DataProducer::Iteration(true, {}, {});
+    }
+
+    std::vector<Tensor> inputs;
+    inputs.reserve(input_dims.size());
+    for (unsigned int i = 0; i < input_dims.size(); ++i) {
+      inputs.emplace_back(input_dims[i]);
+    }
+
+    std::vector<Tensor> labels;
+    labels.reserve(label_dims.size());
+    for (unsigned int i = 0; i < label_dims.size(); ++i) {
+      labels.emplace_back(label_dims[i]);
+    }
+
+    for (unsigned int b = 0; b < batch; ++b) {
+      file->seekg(*iter, std::ios_base::beg);
+      for (auto &input : inputs) {
+        Tensor input_slice = input.getBatchSlice(b, 1);
+        input_slice.read(*file);
+      }
+      for (auto &label : labels) {
+        Tensor label_slice = label.getBatchSlice(b, 1);
+        label_slice.read(*file);
+      }
+
+      iter++;
+    }
+
+    return DataProducer::Iteration(false, inputs, labels);
+  };
+}
+} // namespace nntrainer
diff --git a/nntrainer/dataset/raw_file_data_producer.h b/nntrainer/dataset/raw_file_data_producer.h
new file mode 100644 (file)
index 0000000..0f76d00
--- /dev/null
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2021 Jihoon Lee <jhoon.it.lee@samsung.com>
+ *
+ * @file   raw_file_data_producer.h
+ * @date   12 July 2021
+ * @brief  This file contains raw file data producers, reading from a file
+ * @see    https://github.com/nnstreamer/nntrainer
+ * @author Jihoon Lee <jhoon.it.lee@samsung.com>
+ * @bug    No known bugs except for NYI items
+ *
+ */
+#ifndef __RAW_FILE_DATA_PRODUCER_H__
+#define __RAW_FILE_DATA_PRODUCER_H__
+
+#include <data_producers.h>
+
+#include <dataset.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace nntrainer {
+
+namespace props {
+class FilePath;
+}
+
+using datagen_cb = ml::train::datagen_cb;
+
+/**
+ * @brief RawFileDataProducer which reads samples from the raw binary file
+ * given by the file path property
+ *
+ */
+class RawFileDataProducer final : public DataProducer {
+public:
+  inline static constexpr unsigned int pixel_size =
+    sizeof(float); /**< bytes per stored element, @todo make this a configurable type */
+  /**
+   * @brief Construct a new RawFileDataProducer object
+   *
+   */
+  RawFileDataProducer();
+
+  /**
+   * @brief Destroy the RawFileDataProducer object
+   *
+   */
+  ~RawFileDataProducer();
+
+  inline static const std::string type = "file"; /**< value returned by getType() */
+
+  /**
+   * @copydoc DataProducer::getType()
+   */
+  const std::string getType() const override;
+
+  /**
+   * @copydoc DataProducer::size()
+   */
+  unsigned long long
+  size(const std::vector<TensorDim> &input_dims,
+       const std::vector<TensorDim> &label_dims) const override;
+
+  /**
+   * @copydoc DataProducer::setProperty(const std::vector<std::string>
+   * &properties)
+   */
+  void setProperty(const std::vector<std::string> &properties) override;
+
+  /**
+   * @copydoc DataProducer::finalize(const std::vector<TensorDim>, const
+   * std::vector<TensorDim>)
+   * @remark current implementation drops remainder that are less than the
+   * batchsize, if we don't want the behavior, there needs some refactoring
+   * across data processing places because we are assuming fixed batchsize at
+   * this point
+   */
+  DataProducer::Generator
+  finalize(const std::vector<TensorDim> &input_dims,
+           const std::vector<TensorDim> &label_dims) override;
+
+private:
+  using PropTypes = std::tuple<props::FilePath>;
+  std::unique_ptr<PropTypes> raw_file_props;
+};
+
+} // namespace nntrainer
+
+#endif // __RAW_FILE_DATA_PRODUCER_H__
index 54abb22..ae4c7b8 100644 (file)
@@ -35,6 +35,19 @@ bool DropOutSpec::isValid(const float &v) const {
     return true;
 }
 
+/**
+ * @brief A path is accepted when a binary input stream opened on it is good.
+ */
+bool FilePath::isValid(const std::string &v) {
+  std::ifstream probe(v, std::ios::binary | std::ios::ate);
+  const bool readable = probe.good();
+  return readable;
+}
+
+/**
+ * @brief Store the path, then cache the end position of the file it names.
+ */
+void FilePath::set(const std::string &v) {
+  Property<std::string>::set(v);
+  /// opened at the end (std::ios::ate) so that tellg() reports the size
+  cached_pos_size = std::ifstream(v, std::ios::binary | std::ios::ate).tellg();
+}
+
+/**
+ * @brief Return the position cached by the last call to set().
+ */
+std::ifstream::pos_type FilePath::file_size() {
+  return cached_pos_size;
+}
+
 ConnectionSpec::ConnectionSpec(const std::vector<props::Name> &layer_ids_,
                                const std::string &op_type_) :
   op_type(op_type_),
index fcbc5a6..412693a 100644 (file)
@@ -15,6 +15,7 @@
 
 #include <array>
 #include <base_properties.h>
+#include <fstream>
 
 #ifndef __COMMON_PROPERTIES_H__
 #define __COMMON_PROPERTIES_H__
@@ -251,6 +252,40 @@ public:
   bool isValid(const float &v) const override;
 };
 
+/**
+ * @brief Props containing file path value, the size of the file is cached
+ * whenever the value is set
+ *
+ */
+class FilePath : public Property<std::string> {
+public:
+  static constexpr const char *key = "path"; /**< unique key to access */
+  using prop_tag = str_prop_tag;             /**< property type */
+
+  /**
+   * @brief check if given value is valid, a path is valid when a file can be
+   * opened for reading at the path
+   *
+   * @param v value to check
+   * @return bool true if valid
+   * @note NOTE(review): the isValid() declared right before this class is
+   * `const override` while this one is neither, confirm that this declaration
+   * really replaces the validator of Property<std::string>
+   */
+  bool isValid(const std::string &v);
+
+  /**
+   * @brief setter, also caches the size of the file at the given path
+   *
+   * @param v value to set
+   */
+  void set(const std::string &v);
+
+  /**
+   * @brief return file size cached by the last call of set()
+   *
+   * @return std::ifstream::pos_type size of the file
+   */
+  std::ifstream::pos_type file_size();
+
+private:
+  std::ifstream::pos_type cached_pos_size;
+};
 } // namespace props
 } // namespace nntrainer
 
index cb2419f..4fb8630 100644 (file)
  */
 #include <gtest/gtest.h>
 
+#include <algorithm>
 #include <data_producer_common_tests.h>
 
 void DataProducerSemantics::SetUp() {
   auto [producerFactory, properties, input_dims_, label_dims_, validator_,
         result_] = GetParam();
 
+  /** check if input_dims, label_dims not empty and have the same batch */
+  ASSERT_FALSE(input_dims_.empty());
+  ASSERT_FALSE(label_dims_.empty());
+  auto b = input_dims_[0].batch();
+
+  ASSERT_TRUE(std::all_of(input_dims_.begin(), input_dims_.end(),
+                          [b](const auto &dim) { return b == dim.batch(); }));
+  ASSERT_TRUE(std::all_of(label_dims_.begin(), label_dims_.end(),
+                          [b](const auto &dim) { return b == dim.batch(); }));
+
   producer = producerFactory(properties);
   input_dims = std::move(input_dims_);
   label_dims = std::move(label_dims_);
@@ -65,7 +76,9 @@ TEST_P(DataProducerSemantics, fetch_one_epoch_or_10_iteration_pn) {
     sz = 10;
   }
 
-  for (unsigned i = 0; i < sz; ++i) {
+  auto num_iterations = sz / input_dims[0].batch();
+
+  for (unsigned i = 0; i < num_iterations; ++i) {
     auto [last, ins, labels] = generator();
 
     ASSERT_FALSE(last) << " reached last at iteration: " << i << '\n';
index d2b1762..64de934 100644 (file)
@@ -5,7 +5,8 @@ test_target = []
 producer_targets = [
   'data_producer_common_tests.cpp',
   'unittest_random_data_producers.cpp',
-  'unittest_func_data_producers.cpp'
+  'unittest_func_data_producer.cpp',
+  'unittest_raw_file_data_producer.cpp'
 ]
 
 test_target += producer_targets
@@ -2,7 +2,7 @@
 /**
  * Copyright (C) 2021 Jihoon Lee <jhoon.it.lee@samsung.com>
  *
- * @file unittest_func_data_producers.cpp
+ * @file unittest_func_data_producer.cpp
  * @date 12 July 2021
  * @brief Function data producers (Param Tests)
  * @see        https://github.com/nnstreamer/nntrainer
@@ -17,8 +17,8 @@
 #include <tensor.h>
 
 namespace {
-std::vector<nntrainer::TensorDim> input_shapes = {{3, 2, 4, 5}, {1, 2, 3, 4}};
-std::vector<nntrainer::TensorDim> label_shapes = {{3, 1, 1, 10}, {1, 1, 1, 2}};
+std::vector<nntrainer::TensorDim> input_shapes = {{3, 2, 4, 5}, {3, 2, 3, 4}};
+std::vector<nntrainer::TensorDim> label_shapes = {{3, 1, 1, 10}, {3, 1, 1, 2}};
 int user_data = 0;
 
 int getBatch(float **outVec, float **outLabel, bool *last, void *user_data) {
diff --git a/test/unittest/datasets/unittest_raw_file_data_producer.cpp b/test/unittest/datasets/unittest_raw_file_data_producer.cpp
new file mode 100644 (file)
index 0000000..e24f1e8
--- /dev/null
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2021 Jihoon Lee <jhoon.it.lee@samsung.com>
+ *
+ * @file unittest_raw_file_data_producer.cpp
+ * @date 12 July 2021
+ * @brief raw file data producers (Param Tests)
+ * @see        https://github.com/nnstreamer/nntrainer
+ * @author Jihoon Lee <jhoon.it.lee@samsung.com>
+ * @bug No known bugs except for NYI items
+ */
+
+#include <gtest/gtest.h>
+
+#include <data_producer_common_tests.h>
+#include <raw_file_data_producer.h>
+#include <tensor.h>
+
+#include <nntrainer_test_util.h>
+
+/**
+ * @brief Resolve a resource file name through getResPath() with the "test"
+ * path component.
+ */
+static const std::string getTestResPath(const std::string &file) {
+  const std::string resolved = getResPath(file, {"test"});
+  return resolved;
+}
+
+namespace {
+/// same shapes as the literals of the training_set case below,
+/// NOTE(review): not referenced anywhere in this file at the moment
+std::vector<nntrainer::TensorDim> input_shapes = {{20, 3, 32, 32}};
+std::vector<nntrainer::TensorDim> label_shapes = {{20, 1, 1, 10}};
+
+/**
+ * @brief validator that accepts every produced batch
+ *
+ * @param inputs produced inputs (not inspected)
+ * @param labels produced labels (not inspected)
+ * @return bool always true
+ */
+bool validate(const std::vector<nntrainer::Tensor> &inputs,
+              const std::vector<nntrainer::Tensor> &labels) {
+  return true;
+}
+} // namespace
+
+auto training_set = DataProducerSemanticsParamType( // leading dim 20, expects success
+  createDataProducer<nntrainer::RawFileDataProducer>,
+  {"path=" + getTestResPath("trainingSet.dat")}, {{20, 3, 32, 32}},
+  {{20, 1, 1, 10}}, validate, DataProducerSemanticsExpectedResult::SUCCESS);
+
+auto valSet = DataProducerSemanticsParamType( // three element dims, expects success
+  createDataProducer<nntrainer::RawFileDataProducer>,
+  {"path=" + getTestResPath("valSet.dat")}, {{3, 32, 32}}, {{1, 1, 10}},
+  validate, DataProducerSemanticsExpectedResult::SUCCESS);
+
+auto testSet = DataProducerSemanticsParamType( // same shapes as valSet on testSet.dat
+  createDataProducer<nntrainer::RawFileDataProducer>,
+  {"path=" + getTestResPath("testSet.dat")}, {{3, 32, 32}}, {{1, 1, 10}},
+  validate, DataProducerSemanticsExpectedResult::SUCCESS);
+
+auto batch_too_big = DataProducerSemanticsParamType( // expects finalize() to fail
+  createDataProducer<nntrainer::RawFileDataProducer>,
+  {"path=" + getTestResPath("testSet.dat")}, {{50000, 3, 32, 32}},
+  {{50000, 1, 1, 10}}, nullptr,
+  DataProducerSemanticsExpectedResult::FAIL_AT_FINALIZE);
+
+INSTANTIATE_TEST_CASE_P(RawFile, DataProducerSemantics, // registers the four cases above
+                        ::testing::Values(training_set, valSet, testSet,
+                                          batch_too_big));