$(NNTRAINER_ROOT)/nntrainer/dataset/databuffer_file.cpp \
$(NNTRAINER_ROOT)/nntrainer/dataset/func_data_producer.cpp \
$(NNTRAINER_ROOT)/nntrainer/dataset/random_data_producers.cpp \
+ $(NNTRAINER_ROOT)/nntrainer/dataset/raw_file_data_producer.cpp \
$(NNTRAINER_ROOT)/nntrainer/tensor/tensor.cpp \
$(NNTRAINER_ROOT)/nntrainer/tensor/lazy_tensor.cpp \
$(NNTRAINER_ROOT)/nntrainer/tensor/manager.cpp \
* iteration, at the end of the iteration, it's responsibility of @a this to
* shuffle.
*/
- using Gernerator = std::function<Iteration(void)>;
+ using Generator = std::function<Iteration(void)>;
constexpr inline static unsigned long long SIZE_UNDEFINED =
std::numeric_limits<unsigned long long>::max();
* call
*
*/
- virtual Gernerator finalize(const std::vector<TensorDim> &input_dims,
- const std::vector<TensorDim> &label_dims) = 0;
+ virtual Generator finalize(const std::vector<TensorDim> &input_dims,
+ const std::vector<TensorDim> &label_dims) = 0;
/**
* @brief get size of total dataset given input_dims, label_dims, if size
#ifdef __cplusplus
#include <condition_variable>
+#include <future>
#include <memory>
#include <mutex>
#include <random>
#include <thread>
#include <vector>
+#include <data_producers.h>
#include <dataset.h>
#include <tensor_dim.h>
<< "properties is not empty, size: " << properties.size();
}
-DataProducer::Gernerator
+DataProducer::Generator
FuncDataProducer::finalize(const std::vector<TensorDim> &input_dims,
const std::vector<TensorDim> &label_dims) {
NNTR_THROW_IF(!this->cb, std::invalid_argument)
* @copydoc DataProducer::finalize(const std::vector<TensorDim>, const
* std::vector<TensorDim>)
*/
- virtual DataProducer::Gernerator
+ virtual DataProducer::Generator
finalize(const std::vector<TensorDim> &input_dims,
const std::vector<TensorDim> &label_dims) override;
'databuffer_func.cpp',
'random_data_producers.cpp',
'func_data_producer.cpp',
+ 'raw_file_data_producer.cpp'
]
dataset_headers = [
<< "There are unparsed properties, size: " << left.size();
}
-DataProducer::Gernerator
+DataProducer::Generator
RandomDataOneHotProducer::finalize(const std::vector<TensorDim> &input_dims,
const std::vector<TensorDim> &label_dims) {
/** check if the given producer is ready to finalize */
return [rng, sz, input_dims, label_dims, min_ = min_.get(), max_ = max_.get(),
current_iteration = 0ULL,
label_chooser = std::move(label_chooser_)]() mutable {
- if (current_iteration++ == sz) {
+ if (current_iteration++ == sz / input_dims[0].batch()) {
current_iteration = 0;
return DataProducer::Iteration(true, {}, {});
}
* @copydoc DataProducer::finalize(const std::vector<TensorDim>, const
* std::vector<TensorDim>)
*/
- virtual DataProducer::Gernerator
+ virtual DataProducer::Generator
finalize(const std::vector<TensorDim> &input_dims,
const std::vector<TensorDim> &label_dims) override;
--- /dev/null
+
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2021 Jihoon Lee <jhoon.it.lee@samsung.com>
+ *
+ * @file raw_file_data_producer.cpp
+ * @date 12 July 2021
+ * @brief This file contains raw file data producers, reading from a file
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Jihoon Lee <jhoon.it.lee@samsung.com>
+ * @bug No known bugs except for NYI items
+ *
+ */
+
+#include <raw_file_data_producer.h>
+
+#include <memory>
+#include <numeric>
+#include <random>
+#include <vector>
+
+#include <common_properties.h>
+#include <nntrainer_error.h>
+#include <node_exporter.h>
+#include <util_func.h>
+
+namespace nntrainer {
+
+/// Default constructor; allocates the property tuple (currently only FilePath)
+RawFileDataProducer::RawFileDataProducer() : raw_file_props(new PropTypes()) {}
+RawFileDataProducer::~RawFileDataProducer() {}
+
+/// @copydoc DataProducer::getType() — returns the registered key "file"
+const std::string RawFileDataProducer::getType() const {
+  return RawFileDataProducer::type;
+}
+
+unsigned long long
+RawFileDataProducer::size(const std::vector<TensorDim> &input_dims,
+                          const std::vector<TensorDim> &label_dims) const {
+  /// element count of one sample = sum of all input + label feature lengths
+  /// (the batch dimension is intentionally excluded via getFeatureLen())
+  auto size_accumulator = [](const unsigned int &a, const TensorDim &b) {
+    return a + b.getFeatureLen();
+  };
+
+  auto sample_size =
+    std::accumulate(input_dims.begin(), input_dims.end(), 0u, size_accumulator);
+  sample_size = std::accumulate(label_dims.begin(), label_dims.end(),
+                                sample_size, size_accumulator);
+
+  /// NOTE(review): this copies the FilePath property (and its cached size);
+  /// a const reference would suffice if FilePath::file_size() were const —
+  /// worth confirming and tightening.
+  auto path_prop = std::get<props::FilePath>(*raw_file_props);
+  auto file_size = path_prop.file_size();
+
+  /// checking alignment is a good way to make check if a file is valid,
+  /// unfortunately, our test dataset does not have this property
+  /// (trainingSet.dat, valSet.dat, testSet.dat) after checking, we can
+  /// uncomment below line.
+  /// NOTE(review): when enabling, parenthesize the divisor as
+  /// file_size % (sample_size * RawFileDataProducer::pixel_size) != 0 —
+  /// '%' and '*' have equal precedence and bind left-to-right, so the check
+  /// as written below evaluates (file_size % sample_size) * pixel_size.
+  // NNTR_THROW_IF((file_size % sample_size * RawFileDataProducer::pixel_size !=
+  // 0),
+  //               std::invalid_argument)
+  //   << " Given file does not align with the given sample size, sample size: "
+  //   << sample_size << " file_size: " << file_size;
+
+  /// integer division: any trailing partial sample in the file is dropped
+  return path_prop.file_size() /
+         (sample_size * RawFileDataProducer::pixel_size);
+}
+
+/**
+ * @brief set properties (e.g. "path=trainingSet.dat") on this producer
+ *
+ * @param properties string-formatted properties to be parsed into
+ * raw_file_props
+ * @throws std::invalid_argument if any property is left unparsed
+ */
+void RawFileDataProducer::setProperty(
+  const std::vector<std::string> &properties) {
+  auto left = loadProperties(properties, *raw_file_props);
+  /// message wording matches the other producers in this series
+  /// ("There are unparsed properties, size: ...")
+  NNTR_THROW_IF(!left.empty(), std::invalid_argument)
+    << "There are unparsed properties, size: " << left.size();
+}
+
+DataProducer::Generator
+RawFileDataProducer::finalize(const std::vector<TensorDim> &input_dims,
+                              const std::vector<TensorDim> &label_dims) {
+
+  /****************** Validation ****************/
+  /// sz is the total number of whole samples in the file (see size()),
+  /// not a byte count
+  auto sz = size(input_dims, label_dims);
+  auto batch = input_dims[0].batch();
+
+  /// NOTE(review): the message says "sample size" but sz is the sample
+  /// *count*; consider rewording when this string is next touched
+  NNTR_THROW_IF(sz < batch, std::invalid_argument)
+    << "calculated sample size is less than a batch";
+
+  auto path_prop = std::get<props::FilePath>(*raw_file_props);
+
+  /// same per-sample element-count accumulation as size()
+  auto size_accumulator = [](const unsigned int &a, const TensorDim &b) {
+    return a + b.getFeatureLen();
+  };
+
+  auto sample_size =
+    std::accumulate(input_dims.begin(), input_dims.end(), 0u, size_accumulator);
+  sample_size = std::accumulate(label_dims.begin(), label_dims.end(),
+                                sample_size, size_accumulator);
+
+  /// below works when checking alignment is correct
+  // auto sample_size = path_prop.file_size() / (sz *
+  // RawFileDataProducer::pixel_size);
+
+  /****************** Prepare states ****************/
+  std::mt19937 rng_;
+  rng_.seed(getSeed());
+  auto idxes_ = std::vector<unsigned int>();
+  idxes_.reserve(sz);
+  /// idxes point to the file position in bytes where a sample starts
+  /// NOTE(review): offsets are stored as unsigned int while computed as
+  /// unsigned long long — files of 4 GiB or more would truncate; confirm
+  /// expected dataset sizes or widen the element type.
+  std::generate_n(std::back_inserter(idxes_), sz,
+                  [sample_size, current = 0ULL]() mutable {
+                    auto c = current;
+                    current += sample_size * RawFileDataProducer::pixel_size;
+                    return c;
+                  });
+  /// @todo remove shuffle from here as we are migrating this to element wise
+  /// operator
+  std::shuffle(idxes_.begin(), idxes_.end(), rng_);
+
+  /// shared_ptr keeps the stream open for the lifetime of the returned
+  /// generator (and any copy of it)
+  auto file =
+    std::make_shared<std::ifstream>(path_prop.get(), std::ios::binary);
+  auto iter = idxes_.begin();
+
+  /// NOTE(review): lambda captures initialize in declaration order, so idxes_
+  /// is moved into the closure *before* iter is copied; vector move keeps
+  /// iterators valid, so iter refers into the closure-owned buffer. However,
+  /// if the returned std::function is ever copied, the copy's iter still
+  /// points into the original closure's vector — confirm callers only
+  /// move/invoke the Generator.
+  return [batch, input_dims, label_dims, rng = rng_, idxes = std::move(idxes_),
+          file, iter]() mutable -> DataProducer::Iteration {
+    /// a remainder smaller than one batch is dropped (see header @remark):
+    /// reshuffle for the next epoch and signal end-of-iteration
+    if (std::distance(iter, idxes.end()) < static_cast<std::ptrdiff_t>(batch)) {
+      std::shuffle(idxes.begin(), idxes.end(), rng);
+      iter = idxes.begin();
+      return DataProducer::Iteration(true, {}, {});
+    }
+
+    std::vector<Tensor> inputs;
+    inputs.reserve(input_dims.size());
+    for (unsigned int i = 0; i < input_dims.size(); ++i) {
+      inputs.emplace_back(input_dims[i]);
+    }
+
+    std::vector<Tensor> labels;
+    labels.reserve(label_dims.size());
+    for (unsigned int i = 0; i < label_dims.size(); ++i) {
+      labels.emplace_back(label_dims[i]);
+    }
+
+    /// read one sample per batch slot; on disk each sample lays out all
+    /// inputs followed by all labels, matching the read order here
+    for (unsigned int b = 0; b < batch; ++b) {
+      file->seekg(*iter, std::ios_base::beg);
+      for (auto &input : inputs) {
+        Tensor input_slice = input.getBatchSlice(b, 1);
+        input_slice.read(*file);
+      }
+      for (auto &label : labels) {
+        Tensor label_slice = label.getBatchSlice(b, 1);
+        label_slice.read(*file);
+      }
+
+      iter++;
+    }
+
+    return DataProducer::Iteration(false, inputs, labels);
+  };
+}
+} // namespace nntrainer
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2021 Jihoon Lee <jhoon.it.lee@samsung.com>
+ *
+ * @file raw_file_data_producer.h
+ * @date 12 July 2021
+ * @brief This file contains raw file data producers, reading from a file
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Jihoon Lee <jhoon.it.lee@samsung.com>
+ * @bug No known bugs except for NYI items
+ *
+ */
+#ifndef __RAW_FILE_DATA_PRODUCER_H__
+#define __RAW_FILE_DATA_PRODUCER_H__
+
+#include <data_producers.h>
+
+#include <dataset.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace nntrainer {
+
+namespace props {
+class FilePath;
+}
+
+using datagen_cb = ml::train::datagen_cb;
+
+/**
+ * @brief RawFileDataProducer which contains a callback and returns back
+ *
+ */
+class RawFileDataProducer final : public DataProducer {
+public:
+  inline static constexpr unsigned int pixel_size =
+    sizeof(float); /**< bytes per stored element; @todo make this a
+                      configurable type */
+  /**
+   * @brief Construct a new RawFileDataProducer object
+   *
+   */
+  RawFileDataProducer();
+
+  /**
+   * @brief Destroy the RawFileDataProducer object
+   *
+   */
+  ~RawFileDataProducer();
+
+  inline static const std::string type = "file"; /**< unique type key used by
+                                                    the producer factory */
+
+  /**
+   * @copydoc DataProducer::getType()
+   */
+  const std::string getType() const override;
+
+  /**
+   * @copydoc DataProducer::size()
+   * @note returns the number of whole samples the file holds for the given
+   * dims; a trailing partial sample is not counted
+   */
+  unsigned long long
+  size(const std::vector<TensorDim> &input_dims,
+       const std::vector<TensorDim> &label_dims) const override;
+
+  /**
+   * @copydoc DataProducer::setProeprty(const std::vector<std::string>
+   * &properties)
+   */
+  void setProperty(const std::vector<std::string> &properties) override;
+
+  /**
+   * @copydoc DataProducer::finalize(const std::vector<TensorDim>, const
+   * std::vector<TensorDim>)
+   * @remark current implementation drops remainder that are less than the
+   * batchsize, if we don't want the behavior, there needs some refactoring
+   * across data processing places because we are assuming fixed batchsize at
+   * this point
+   */
+  DataProducer::Generator
+  finalize(const std::vector<TensorDim> &input_dims,
+           const std::vector<TensorDim> &label_dims) override;
+
+private:
+  using PropTypes = std::tuple<props::FilePath>; /**< property set: FilePath */
+  std::unique_ptr<PropTypes> raw_file_props; /**< owned property tuple */
+};
+
+} // namespace nntrainer
+
+#endif // __RAW_FILE_DATA_PRODUCER_H__
return true;
}
+/**
+ * @brief check that the file at @a v exists and can be opened for reading
+ *
+ * NOTE(review): the float-property isValid earlier in this file is declared
+ * `const override`; this one is neither, so it likely *hides* the virtual
+ * Property::isValid instead of overriding it — confirm the base signature
+ * and add `const` (plus `override` in the header) if so.
+ */
+bool FilePath::isValid(const std::string &v) {
+  std::ifstream file(v, std::ios::binary | std::ios::ate);
+  return file.good();
+}
+
+/**
+ * @brief set the path and cache the file size by opening at end (ios::ate)
+ * @note if the file cannot be opened, tellg() yields -1, which is cached
+ * as-is — callers of file_size() see that sentinel
+ */
+void FilePath::set(const std::string &v) {
+  Property<std::string>::set(v);
+  std::ifstream file(v, std::ios::binary | std::ios::ate);
+  cached_pos_size = file.tellg();
+}
+
+/// returns the size cached at set() time; does not re-stat the file
+std::ifstream::pos_type FilePath::file_size() { return cached_pos_size; }
+
ConnectionSpec::ConnectionSpec(const std::vector<props::Name> &layer_ids_,
const std::string &op_type_) :
op_type(op_type_),
#include <array>
#include <base_properties.h>
+#include <fstream>
#ifndef __COMMON_PROPERTIES_H__
#define __COMMON_PROPERTIES_H__
bool isValid(const float &v) const override;
};
+/**
+ * @brief Props containing file path value; caches the file size at set() time
+ *
+ */
+class FilePath : public Property<std::string> {
+public:
+  static constexpr const char *key = "path"; /**< unique key to access */
+  using prop_tag = str_prop_tag;             /**< property type */
+
+  /**
+   * @brief check if given value is valid (file exists and is readable)
+   *
+   * @param v value to check
+   * @return bool true if valid
+   * @note NOTE(review): sibling property overloads declare this
+   * `const override`; as written this likely hides rather than overrides the
+   * virtual in Property — confirm the base signature and align.
+   */
+  bool isValid(const std::string &v);
+
+  /**
+   * @brief setter; also caches the file size (tellg of a stream opened with
+   * ios::ate) for later file_size() calls
+   *
+   * @param v value to set
+   */
+  void set(const std::string &v);
+
+  /**
+   * @brief return file size cached at set() time
+   *
+   * @return std::ifstream::pos_type size of the file (-1 if the file could
+   * not be opened when set)
+   */
+  std::ifstream::pos_type file_size();
+
+private:
+  std::ifstream::pos_type cached_pos_size; /**< size recorded by set() */
+};
} // namespace props
} // namespace nntrainer
*/
#include <gtest/gtest.h>
+#include <algorithm>
#include <data_producer_common_tests.h>
void DataProducerSemantics::SetUp() {
auto [producerFactory, properties, input_dims_, label_dims_, validator_,
result_] = GetParam();
+ /** check if input_dims, label_dims not empty and have the same batch */
+ ASSERT_FALSE(input_dims_.empty());
+ ASSERT_FALSE(label_dims_.empty());
+ auto b = input_dims_[0].batch();
+
+ ASSERT_TRUE(std::all_of(input_dims_.begin(), input_dims_.end(),
+ [b](const auto &dim) { return b == dim.batch(); }));
+ ASSERT_TRUE(std::all_of(label_dims_.begin(), label_dims_.end(),
+ [b](const auto &dim) { return b == dim.batch(); }));
+
producer = producerFactory(properties);
input_dims = std::move(input_dims_);
label_dims = std::move(label_dims_);
sz = 10;
}
- for (unsigned i = 0; i < sz; ++i) {
+ auto num_iterations = sz / input_dims[0].batch();
+
+ for (unsigned i = 0; i < num_iterations; ++i) {
auto [last, ins, labels] = generator();
ASSERT_FALSE(last) << " reached last at iteration: " << i << '\n';
producer_targets = [
'data_producer_common_tests.cpp',
'unittest_random_data_producers.cpp',
- 'unittest_func_data_producers.cpp'
+ 'unittest_func_data_producer.cpp',
+ 'unittest_raw_file_data_producer.cpp'
]
test_target += producer_targets
/**
* Copyright (C) 2021 Jihoon Lee <jhoon.it.lee@samsung.com>
*
- * @file unittest_func_data_producers.cpp
+ * @file unittest_func_data_producer.cpp
* @date 12 July 2021
* @brief Function data producers (Param Tests)
* @see https://github.com/nnstreamer/nntrainer
#include <tensor.h>
namespace {
-std::vector<nntrainer::TensorDim> input_shapes = {{3, 2, 4, 5}, {1, 2, 3, 4}};
-std::vector<nntrainer::TensorDim> label_shapes = {{3, 1, 1, 10}, {1, 1, 1, 2}};
+std::vector<nntrainer::TensorDim> input_shapes = {{3, 2, 4, 5}, {3, 2, 3, 4}};
+std::vector<nntrainer::TensorDim> label_shapes = {{3, 1, 1, 10}, {3, 1, 1, 2}};
int user_data = 0;
int getBatch(float **outVec, float **outLabel, bool *last, void *user_data) {
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2021 Jihoon Lee <jhoon.it.lee@samsung.com>
+ *
+ * @file unittest_raw_file_data_producer.cpp
+ * @date 12 July 2021
+ * @brief raw file data producers (Param Tests)
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Jihoon Lee <jhoon.it.lee@samsung.com>
+ * @bug No known bugs except for NYI items
+ */
+
+#include <gtest/gtest.h>
+
+#include <data_producer_common_tests.h>
+#include <raw_file_data_producer.h>
+#include <tensor.h>
+
+#include <nntrainer_test_util.h>
+
+/// resolve a data file under the test resource directory
+static const std::string getTestResPath(const std::string &file) {
+  return getResPath(file, {"test"});
+}
+
+namespace {
+/// NOTE(review): these two vectors are unused — the param sets below inline
+/// their own dims; either reference them in training_set or drop them.
+std::vector<nntrainer::TensorDim> input_shapes = {{20, 3, 32, 32}};
+std::vector<nntrainer::TensorDim> label_shapes = {{20, 1, 1, 10}};
+
+/// placeholder validator: tensor contents are not checked, only that the
+/// generator produced data without throwing
+bool validate(const std::vector<nntrainer::Tensor> &inputs,
+              const std::vector<nntrainer::Tensor> &labels) {
+  return true;
+}; // NOTE(review): stray ';' after the function body — harmless but removable
+} // namespace
+
+/// batch 20 over the cifar-style training set
+auto training_set = DataProducerSemanticsParamType(
+  createDataProducer<nntrainer::RawFileDataProducer>,
+  {"path=" + getTestResPath("trainingSet.dat")}, {{20, 3, 32, 32}},
+  {{20, 1, 1, 10}}, validate, DataProducerSemanticsExpectedResult::SUCCESS);
+
+/// 3-dim initializer means batch defaults to 1 for valSet/testSet below
+auto valSet = DataProducerSemanticsParamType(
+  createDataProducer<nntrainer::RawFileDataProducer>,
+  {"path=" + getTestResPath("valSet.dat")}, {{3, 32, 32}}, {{1, 1, 10}},
+  validate, DataProducerSemanticsExpectedResult::SUCCESS);
+
+auto testSet = DataProducerSemanticsParamType(
+  createDataProducer<nntrainer::RawFileDataProducer>,
+  {"path=" + getTestResPath("testSet.dat")}, {{3, 32, 32}}, {{1, 1, 10}},
+  validate, DataProducerSemanticsExpectedResult::SUCCESS);
+
+/// batch larger than the number of samples in the file must fail at finalize
+auto batch_too_big = DataProducerSemanticsParamType(
+  createDataProducer<nntrainer::RawFileDataProducer>,
+  {"path=" + getTestResPath("testSet.dat")}, {{50000, 3, 32, 32}},
+  {{50000, 1, 1, 10}}, nullptr,
+  DataProducerSemanticsExpectedResult::FAIL_AT_FINALIZE);
+
+/// NOTE(review): INSTANTIATE_TEST_CASE_P is deprecated in newer gtest in
+/// favor of INSTANTIATE_TEST_SUITE_P; keep consistent with the rest of the
+/// test tree when migrating.
+INSTANTIATE_TEST_CASE_P(RawFile, DataProducerSemantics,
+                        ::testing::Values(training_set, valSet, testSet,
+                                          batch_too_big));