From: Jihoon Lee
Date: Fri, 20 Aug 2021 09:14:54 +0000 (+0900)
Subject: [Dataset] Change and apply callback handler
X-Git-Tag: submit/tizen/20210827.122527~3
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3acefd899fa7372184c9c2283c9fd592fd35c69d;p=platform%2Fcore%2Fml%2Fnntrainer.git

[Dataset] Change and apply callback handler

This patch changes the callback handler from batchwise to samplewise and
applies the new contract across the applications and tests.

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Jihoon Lee

---
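For application authors updating against this change: a generator callback now
fills exactly one sample per call — outVec[0] and outLabel[0] point at
single-sample buffers and batching happens inside the framework — and *last is
set once per epoch. The sketch below only illustrates that contract; MyData,
readNthSample, and the member names are placeholders invented for this note,
not symbols from the patch:

  // A minimal sample-wise generator under the new contract (sketch only).
  // Assumes <algorithm>/<random> are included and a user-defined MyData that
  // keeps a shuffled index list, a sample counter, and an open data file.
  int getSample(float **outVec, float **outLabel, bool *last, void *user_data) {
    auto *data = reinterpret_cast<MyData *>(user_data);

    // Fill exactly ONE sample; the framework assembles batches itself.
    readNthSample(data->file, outVec[0], outLabel[0],
                  data->idxes.at(data->count));

    data->count++;
    if (data->count < data->num_samples) {
      *last = false;
    } else {
      *last = true; // epoch finished: rewind and reshuffle for the next epoch
      data->count = 0;
      std::shuffle(data->idxes.begin(), data->idxes.end(), data->rng);
    }
    return 0; /* ML_ERROR_NONE */
  }

The callback is registered together with its user data, e.g.
ml_train_dataset_add_generator(dataset, ML_TRAIN_DATASET_MODE_TRAIN, getSample,
&train_data) in the C API, or ml::train::createDataset(DatasetType::GENERATOR,
getSample, &train_data) in C++, as the hunks below show.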
diff --git a/Applications/Custom/LayerClient/jni/main.cpp b/Applications/Custom/LayerClient/jni/main.cpp
index 1d65d31e..b655fce4 100644
--- a/Applications/Custom/LayerClient/jni/main.cpp
+++ b/Applications/Custom/LayerClient/jni/main.cpp
@@ -40,7 +40,7 @@ int constant_generator_cb(float **outVec, float **outLabel, bool *last,
                           void *user_data) {
   static int count = 0;
   unsigned int i;
-  unsigned int data_size = BATCH_SIZE * FEATURE_SIZE;
+  unsigned int data_size = FEATURE_SIZE;
 
   for (i = 0; i < data_size; ++i) {
     outVec[0][i] = 2.0f;
@@ -51,12 +51,12 @@ int constant_generator_cb(float **outVec, float **outLabel, bool *last,
   }
   outLabel[0][0] = 1.0f;
 
+  count++;
   if (count == 10) {
     *last = true;
     count = 0;
   } else {
     *last = false;
-    count++;
   }
 
   return ML_ERROR_NONE;
diff --git a/Applications/LogisticRegression/jni/main.cpp b/Applications/LogisticRegression/jni/main.cpp
index d2b304f5..9c22b1e3 100644
--- a/Applications/LogisticRegression/jni/main.cpp
+++ b/Applications/LogisticRegression/jni/main.cpp
@@ -24,12 +24,12 @@
  * (test.txt)
  */
 
+#include
 #include
 #include
 #include
+#include
 #include
-#include
-#include
 
 #include
 #include
@@ -47,6 +47,8 @@
 const unsigned int feature_size = 2;
 
 const unsigned int total_val_data_size = 10;
 
+constexpr unsigned int SEED = 0;
+
 bool training = false;
@@ -69,13 +71,12 @@ float stepFunction(float x) {
 /**
  * @brief     get idth Data
  * @param[in] F file stream
- * @param[out] outVec feature data
- * @param[out] outLabel label data
+ * @param[out] input feature data
+ * @param[out] label label data
  * @param[in] id id th
  * @retval boolean true if there is no error
  */
-bool getData(std::ifstream &F, std::vector<float> &outVec,
-             std::vector<float> &outLabel, unsigned int id) {
+bool getData(std::ifstream &F, float *input, float *label, unsigned int id) {
   std::string temp;
   F.clear();
   F.seekg(0, std::ios_base::beg);
@@ -94,55 +95,41 @@ bool getData(std::ifstream &F, std::vector<float> &outVec,
   float x;
   for (unsigned int j = 0; j < feature_size; ++j) {
     buffer >> x;
-    outVec[j] = x;
+    input[j] = x;
   }
   buffer >> x;
-  outLabel[0] = x;
+  label[0] = x;
 
   return true;
 }
 
+std::mt19937 rng;
+std::vector<unsigned int> train_idxes;
+
 /**
- * @brief     get Data as much as batch size
+ * @brief     get a single data
  * @param[out] outVec feature data
  * @param[out] outLabel label data
  * @param[out] last end of data
  * @param[in] user_data user data
  * @retval int 0 if there is no error
  */
-int getBatch_train(float **outVec, float **outLabel, bool *last,
-                   void *user_data) {
+int getSample_train(float **outVec, float **outLabel, bool *last,
+                    void *user_data) {
   std::ifstream dataFile(data_file);
-  unsigned int data_size = total_train_data_size;
-  unsigned int count = 0;
 
-  if (data_size - train_count < batch_size) {
+  if (!getData(dataFile, *outVec, *outLabel, train_idxes.at(train_count))) {
+    return -1;
+  }
+  train_count++;
+  if (train_count < total_train_data_size) {
+    *last = false;
+  } else {
     *last = true;
    train_count = 0;
-    return 0;
-  }
-
-  for (unsigned int i = train_count; i < train_count + batch_size; ++i) {
-
-    std::vector<float> o;
-    std::vector<float> l;
-    o.resize(feature_size);
-    l.resize(1);
-
-    if (!getData(dataFile, o, l, i)) {
-      return -1;
-    };
-
-    for (unsigned int j = 0; j < feature_size; ++j)
-      outVec[0][count * feature_size + j] = o[j];
-    outLabel[0][count] = l[0];
-
-    count++;
+    std::shuffle(train_idxes.begin(), train_idxes.end(), rng);
   }
 
-  dataFile.close();
-  *last = false;
-  train_count += batch_size;
   return 0;
 }
@@ -161,6 +148,9 @@ int main(int argc, char *argv[]) {
   }
 
   const std::string weight_path = "logistic_model.bin";
+  train_idxes.resize(total_train_data_size);
+  std::iota(train_idxes.begin(), train_idxes.end(), 0);
+  rng.seed(SEED);
 
   const std::vector<std::string> args(argv + 1, argv + argc);
   std::string config = args[1];
@@ -171,8 +161,8 @@
 
   srand(time(NULL));
 
-  auto data_train =
-    ml::train::createDataset(ml::train::DatasetType::GENERATOR, getBatch_train);
+  auto data_train = ml::train::createDataset(ml::train::DatasetType::GENERATOR,
+                                             getSample_train);
 
   /**
    * @brief Create NN
   */
@@ -219,7 +209,7 @@
     o.resize(feature_size);
     l.resize(1);
 
-    getData(dataFile, o, l, j);
+    getData(dataFile, o.data(), l.data(), j);
 
     try {
       float answer =
diff --git a/Applications/MNIST/jni/main.cpp b/Applications/MNIST/jni/main.cpp
index 4445448c..66823f4c 100644
--- a/Applications/MNIST/jni/main.cpp
+++ b/Applications/MNIST/jni/main.cpp
@@ -23,15 +23,16 @@
 #define APP_VALIDATE
 #endif
 
+#include
 #include
 #include
 #include
 #include
 #include
-#include
+#include
+#include
 #include
-#include
-#include
+#include
 
 #if defined(APP_VALIDATE)
 #include
@@ -45,6 +46,8 @@
 
 #define VALIDATION false
 
+constexpr unsigned int SEED = 0;
+
 #if VALIDATION
 /**
  * @brief     Data size for each category
 */
@@ -55,8 +58,6 @@
 const unsigned int total_val_data_size = 32;
 
 const unsigned int total_test_data_size = 32;
 
-const unsigned int buffer_size = 32;
-
 const unsigned int batch_size = 32;
 
 #else
@@ -67,8 +68,6 @@
 const unsigned int total_val_data_size = 100;
 
 const unsigned int total_test_data_size = 100;
 
-const unsigned int buffer_size = 100;
-
 const unsigned int batch_size = 32;
 
 #endif
@@ -112,13 +111,12 @@ float stepFunction(float x) {
 /**
  * @brief     load data at specific position of file
  * @param[in] F ifstream (input file)
- * @param[out] outVec
- * @param[out] outLabel
+ * @param[out] input input
+ * @param[out] label label
  * @param[in] id th data to get
  * @retval true/false false : end of data
  */
-bool getData(std::ifstream &F, std::vector<float> &outVec,
-             std::vector<float> &outLabel, unsigned int id) {
+bool getData(std::ifstream &F, float *input, float *label, unsigned int id) {
   F.clear();
   F.seekg(0, std::ios_base::end);
   uint64_t file_length = F.tellg();
@@ -129,119 +127,75 @@ bool getData(std::ifstream &F, std::vector<float> &outVec,
     return false;
   }
   F.seekg(position, std::ios::beg);
-  for (unsigned int i = 0; i < feature_size; i++)
-    F.read((char *)&outVec[i], sizeof(float));
-  for (unsigned int i = 0; i < total_label_size; i++)
-    F.read((char *)&outLabel[i], sizeof(float));
+  F.read((char *)input, sizeof(float) * feature_size);
+  F.read((char *)label, sizeof(float) * total_label_size);
 
   return true;
 }
 
 /**
- * @brief      get data which size is batch for train
- * @param[out] outVec
- * @param[out] outLabel
- * @param[out] last if the data is finished
- * @param[in]  user_data private data for the callback
- * @retval     status for handling error
+ * @brief UserData which stores information used to feed data from data callback
+ *
 */
-int getBatch_train(float **outVec, float **outLabel, bool *last,
-                   void *user_data) {
-  std::vector<int> memI;
-  std::vector<int> memJ;
-  unsigned int count = 0;
-  int data_size = total_train_data_size;
-
-  std::ifstream F(filename, std::ios::in | std::ios::binary);
-
-#if VALIDATION
-  if (data_size - train_count < batch_size) {
-#else
-  if (data_size * total_label_size - train_count < batch_size) {
-#endif
-    *last = true;
-    train_count = 0;
-    return ML_ERROR_NONE;
-  }
-
-  count = 0;
-  for (unsigned int i = train_count; i < train_count + batch_size; i++) {
-    std::vector<float> o;
-    std::vector<float> l;
-
-    o.resize(feature_size);
-    l.resize(total_label_size);
-
-    getData(F, o, l, i);
-
-    for (unsigned int j = 0; j < feature_size; ++j)
-      outVec[0][count * feature_size + j] = o[j];
-    for (unsigned int j = 0; j < total_label_size; ++j)
-      outLabel[0][count * total_label_size + j] = l[j];
-    count++;
+class DataInformation {
+public:
+  /**
+   * @brief Construct a new Data Information object
+   *
+   * @param num_samples number of data
+   * @param filename file name to read from
+   */
+  DataInformation(unsigned int num_samples, const std::string &filename);
+  unsigned int count;
+  unsigned int num_samples;
+  std::ifstream file;
+  std::vector<unsigned int> idxes;
+  std::mt19937 rng;
+};
+
+DataInformation::DataInformation(unsigned int num_samples,
+                                 const std::string &filename) :
+  count(0),
+  num_samples(num_samples),
+  file(filename, std::ios::in | std::ios::binary),
+  idxes(num_samples) {
+  std::iota(idxes.begin(), idxes.end(), 0);
+  rng.seed(SEED);
+  std::shuffle(idxes.begin(), idxes.end(), rng);
+  if (!file.good()) {
+    throw std::invalid_argument("given file is not good, filename: " +
+                                filename);
   }
-
-  F.close();
-  *last = false;
-  train_count += batch_size;
-  return ML_ERROR_NONE;
 }
 
 /**
- * @brief      get data which size is batch for validation
- * @param[out] outVec
- * @param[out] outLabel
+ * @brief      get data which size is batch for train
+ * @param[out] outInput input vectors
+ * @param[out] outLabel label vectors
  * @param[out] last if the data is finished
  * @param[in]  user_data private data for the callback
  * @retval     status for handling error
  */
-int getBatch_val(float **outVec, float **outLabel, bool *last,
-                 void *user_data) {
-
-  std::vector<int> memI;
-  std::vector<int> memJ;
-  unsigned int count = 0;
-  int data_size = total_val_data_size;
-
-  std::ifstream F(filename, std::ios::in | std::ios::binary);
-
-#if VALIDATION
-  if (data_size - val_count < batch_size) {
-#else
-  if (data_size * total_label_size - val_count < batch_size) {
-#endif
+int getSample(float **outVec, float **outLabel, bool *last, void *user_data) {
+  auto data = reinterpret_cast<DataInformation *>(user_data);
+
+  getData(data->file, *outVec, *outLabel, data->idxes.at(data->count));
+  data->count++;
+  if (data->count < data->num_samples) {
+    *last = false;
+  } else {
     *last = true;
-    val_count = 0;
-    return ML_ERROR_NONE;
+    data->count = 0;
+    std::shuffle(data->idxes.begin(), data->idxes.end(), data->rng);
   }
 
-  count = 0;
-  for (unsigned int i = val_count; i < val_count + batch_size; i++) {
-    std::vector<float> o;
-    std::vector<float> l;
-
-    o.resize(feature_size);
-    l.resize(total_label_size);
-
-    getData(F, o, l, i);
-
-    for (unsigned int j = 0; j < feature_size; ++j)
-      outVec[0][count * feature_size + j] = o[j];
-    for (unsigned int j = 0; j < total_label_size; ++j)
-      outLabel[0][count * total_label_size + j] = l[j];
-    count++;
-  }
-
-  F.close();
-  *last = false;
-  val_count += batch_size;
   return ML_ERROR_NONE;
 }
 
 #if defined(APP_VALIDATE)
 TEST(MNIST_training, verify_accuracy) {
-  EXPECT_FLOAT_EQ(training_loss, 2.3031187);
-  EXPECT_FLOAT_EQ(validation_loss, 2.2951343);
+  EXPECT_FLOAT_EQ(training_loss, 2.5698349);
+  EXPECT_FLOAT_EQ(validation_loss, 2.5551746);
 }
 #endif
@@ -255,8 +209,9 @@ int main(int argc, char *argv[]) {
   int status = 0;
 #ifdef APP_VALIDATE
   status = remove("mnist_model.bin");
-  if (status != 0)
+  if (status != 0) {
     std::cout << "Pre-existing model file doesn't exist.\n";
+  }
 #endif
   if (argc < 3) {
     std::cout << "./nntrainer_mnist mnist.ini dataset.dat\n";
@@ -267,26 +222,28 @@
   std::string config = args[0];
   filename = args[1];
 
-  std::ifstream f(filename);
-  if (!f.good()) {
-    std::cout << "dataset is not good, filename: " << filename << '\n';
-    exit(1);
+  std::unique_ptr<DataInformation> train_user_data;
+  std::unique_ptr<DataInformation> valid_user_data;
+  try {
+    train_user_data =
+      std::make_unique<DataInformation>(total_train_data_size, filename);
+    valid_user_data =
+      std::make_unique<DataInformation>(total_val_data_size, filename);
+  } catch (std::invalid_argument &e) {
+    std::cerr << "Error creating userdata for the data callback " << e.what()
+              << std::endl;
+    return 1;
   }
 
-  srand(time(NULL));
-  std::vector<std::vector<float>> inputVector, outputVector;
-  std::vector<std::vector<float>> inputValVector, outputValVector;
-  std::vector<std::vector<float>> inputTestVector, outputTestVector;
-
   /**
    * @brief     Data buffer Create & Initialization
    */
   std::shared_ptr<ml::train::Dataset> dataset_train, dataset_val;
   try {
-    dataset_train =
-      createDataset(ml::train::DatasetType::GENERATOR, getBatch_train);
-    dataset_val =
-      createDataset(ml::train::DatasetType::GENERATOR, getBatch_val);
+    dataset_train = createDataset(ml::train::DatasetType::GENERATOR, getSample,
+                                  train_user_data.get());
+    dataset_val = createDataset(ml::train::DatasetType::GENERATOR, getSample,
+                                valid_user_data.get());
   } catch (std::exception &e) {
     std::cerr << "Error creating dataset" << e.what() << std::endl;
     return 1;
diff --git a/Applications/ProductRatings/jni/main.cpp b/Applications/ProductRatings/jni/main.cpp
index fd486fc3..bab337be 100644
--- a/Applications/ProductRatings/jni/main.cpp
+++ b/Applications/ProductRatings/jni/main.cpp
@@ -17,9 +17,8 @@
 #include
 #include
 #include
+#include
 #include
-#include
-#include
 
 #include
 #include
@@ -28,6 +27,8 @@
 
 std::string data_file;
 
+constexpr unsigned int SEED = 0;
+
 const unsigned int total_train_data_size = 25;
 
 unsigned int train_count = 0;
@@ -60,13 +61,12 @@ float stepFunction(float x) {
 /**
  * @brief     get idth Data
  * @param[in] F file stream
- * @param[out] outVec feature data
- * @param[out] outLabel label data
+ * @param[out] input feature data
+ * @param[out] label label data
  * @param[in] id id th
  * @retval boolean true if there is no error
  */
-bool getData(std::ifstream &F, std::vector<float> &outVec,
-             std::vector<float> &outLabel, unsigned int id) {
+bool getData(std::ifstream &F, float *input, float *label, unsigned int id) {
   std::string temp;
   F.clear();
   F.seekg(0, std::ios_base::beg);
@@ -78,18 +78,17 @@ bool getData(std::ifstream &F, std::vector<float> &outVec,
 
   F.putback(c);
 
-  if (!std::getline(F, temp)) {
+  if (!std::getline(F, temp))
     return false;
-  }
 
   std::istringstream buffer(temp);
   float x;
   for (unsigned int j = 0; j < feature_size; ++j) {
     buffer >> x;
-    outVec[j] = x;
+    input[j] = x;
   }
   buffer >> x;
-  outLabel[0] = x;
+  label[0] = x;
 
   return true;
 }
@@ -103,50 +102,34 @@ template <typename T> void loadFile(const char *filename, T &t) {
   file.close();
 }
 
+std::mt19937 rng;
+std::vector<unsigned int> train_idxes;
+
 /**
- * @brief     get Data as much as batch size
+ * @brief     get a single data
  * @param[out] outVec feature data
 * @param[out] outLabel label data
  * @param[out] last end of data
  * @param[in] user_data user data
  * @retval int 0 if there is no error
 */
-int getBatch_train(float **outVec, float **outLabel, bool *last,
-                   void *user_data) {
+int getSample_train(float **outVec, float **outLabel, bool *last,
+                    void *user_data) {
   std::ifstream dataFile(data_file);
-  unsigned int data_size = total_train_data_size;
-  unsigned int count = 0;
-
-  if (data_size - train_count < batch_size) {
+  if (!getData(dataFile, *outVec, *outLabel, train_idxes.at(train_count))) {
+    return -1;
+  }
+  train_count++;
+  if (train_count < total_train_data_size) {
+    *last = false;
+  } else {
     *last = true;
     train_count = 0;
-    return 0;
+    std::shuffle(train_idxes.begin(), train_idxes.end(), rng);
   }
 
-  std::vector<float> o;
-  std::vector<float> l;
-  o.resize(feature_size);
-  l.resize(1);
-
-  for (unsigned int i = train_count; i < train_count + batch_size; ++i) {
-    if (!getData(dataFile, o, l, i)) {
-      return -1;
-    }
-
-    for (unsigned int j = 0; j < feature_size; ++j) {
-      outVec[0][count * feature_size + j] = o[j];
-    }
-    outLabel[0][count] = l[0];
-
-    count++;
-  }
-
-  dataFile.close();
-  *last = false;
-  train_count += batch_size;
   return 0;
 }
-
 /**
  * @brief     create NN
  *            back propagation of NN
@@ -172,14 +155,16 @@ int main(int argc, char *argv[]) {
   if (!args[0].compare("train"))
     training = true;
 
-  srand(time(NULL));
+  train_idxes.resize(total_train_data_size);
+  std::iota(train_idxes.begin(), train_idxes.end(), 0);
+  rng.seed(SEED);
 
   std::shared_ptr<ml::train::Dataset> dataset_train, dataset_val;
   try {
     dataset_train =
-      createDataset(ml::train::DatasetType::GENERATOR, getBatch_train);
+      createDataset(ml::train::DatasetType::GENERATOR, getSample_train);
     dataset_val =
-      createDataset(ml::train::DatasetType::GENERATOR, getBatch_train);
+      createDataset(ml::train::DatasetType::GENERATOR, getSample_train);
   } catch (std::exception &e) {
     std::cerr << "Error creating dataset " << e.what() << std::endl;
     return 1;
   }
@@ -188,7 +173,6 @@
   /**
    * @brief     Create NN
    */
-  std::vector<std::vector<float>> inputVector, outputVector;
   nntrainer::NeuralNetwork NN;
   /**
    * @brief     Initialize NN with configuration file path
@@ -258,7 +242,7 @@
     o.resize(feature_size);
     l.resize(1);
 
-    getData(dataFile, o, l, j);
+    getData(dataFile, o.data(), l.data(), j);
 
     try {
       float answer =
diff --git a/Applications/TransferLearning/Draw_Classification/jni/main.cpp b/Applications/TransferLearning/Draw_Classification/jni/main.cpp
index a9281d3e..fda60674 100644
--- a/Applications/TransferLearning/Draw_Classification/jni/main.cpp
+++ b/Applications/TransferLearning/Draw_Classification/jni/main.cpp
@@ -24,8 +24,6 @@
  *              Classifier : One Fully Connected Layer
 *
 */
-///@todo update below
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 
 #if defined(NNSTREAMER_AVAILABLE) && defined(ENABLE_TEST)
 #define APP_VALIDATE
 #endif
@@ -149,12 +147,6 @@ void loadAllData(const std::string &data_path, float input_data[][INPUT_SIZE],
 */
 int getBatch_train(float **input, float **label, bool *last, void *user_data) {
   static unsigned int iteration = 0;
-  if (iteration >= EPOCH_SIZE) {
-    *last = true;
-    iteration = 0;
-    return ML_ERROR_NONE;
-  }
-
   for (int idx = 0; idx < INPUT_SIZE; idx++) {
     input[0][idx] = inputVector[iteration][idx];
   }
@@ -163,8 +155,13 @@ int getBatch_train(float **input, float **label, bool *last, void *user_data) {
     label[0][idx] = labelVector[iteration][idx];
   }
 
-  *last = false;
   iteration += 1;
+  if (iteration < EPOCH_SIZE) {
+    *last = false;
+  } else {
+    *last = true;
+    iteration = 0;
+  }
 
   return ML_ERROR_NONE;
 }
@@ -191,14 +188,22 @@ int trainModel(const char *config) {
   }
 
   /** Set the dataset from generator */
-  status = ml_train_dataset_create_with_generator(&dataset, getBatch_train,
-                                                  NULL, NULL);
+  status = ml_train_dataset_create(&dataset);
   if (status != ML_ERROR_NONE) {
     ml_train_model_destroy(handle);
     return status;
   }
 
-  status = ml_train_dataset_set_property(dataset, "buffer_size=100", NULL);
+  status = ml_train_dataset_add_generator(dataset, ML_TRAIN_DATASET_MODE_TRAIN,
+                                          getBatch_train, nullptr);
+  if (status != ML_ERROR_NONE) {
+    ml_train_dataset_destroy(dataset);
+    ml_train_model_destroy(handle);
+    return status;
+  }
+
+  status = ml_train_dataset_set_property_for_mode(
+    dataset, ML_TRAIN_DATASET_MODE_TRAIN, "buffer_size=100", NULL);
   if (status != ML_ERROR_NONE) {
     ml_train_dataset_destroy(dataset);
     ml_train_model_destroy(handle);
     return status;
diff --git a/nntrainer/models/neuralnet.cpp b/nntrainer/models/neuralnet.cpp
index 13c2acb9..c061044c 100644
--- a/nntrainer/models/neuralnet.cpp
+++ b/nntrainer/models/neuralnet.cpp
@@ -635,14 +635,8 @@ int NeuralNetwork::train_run() {
   for (epoch_idx = epoch_idx + 1; epoch_idx <= epochs; ++epoch_idx) {
     training.loss = 0.0f;
 
-    std::future<std::shared_ptr<BatchQueue>> future_bq;
-    std::future<std::shared_ptr<IterationQueue>> future_iq;
-    if (train_buffer->getType() == "callback") {
-      future_bq = train_buffer->startFetchWorker(in_dims, label_dims);
-    } else {
-      future_iq =
-        train_buffer->startFetchWorker_sample(in_dims, label_dims, true);
-    }
+    std::future<std::shared_ptr<IterationQueue>> future_iq =
+      train_buffer->startFetchWorker_sample(in_dims, label_dims, true);
 
     // /// @todo make this working, test buffer is running but doing nothing
     // if (test_buffer != nullptr && test_buffer->isValid()) {
@@ -656,30 +650,19 @@ int NeuralNetwork::train_run() {
     int count = 0;
 
     while (true) {
-      ScopedView<Iteration> iter_view(nullptr);
-      if (train_buffer->getType() == "callback") {
-
-        auto [last, ins, labels] = *train_buffer->fetch();
-        /// @todo multiple input support
-        if (last) {
-          break;
-        }
-        in = ins[0];
-        label = labels[0];
-      } else {
-        iter_view = train_buffer->fetch_sample();
-        if (iter_view.isEmpty()) {
-          break;
-        }
-        auto &iter = iter_view.get();
-        if (iter.batch() != batch_size) {
-          /// this is partial batch scenario
-          continue;
-        }
-        /// @todo multiple input support
-        in = iter.getInputsRef().front();
-        label = iter.getLabelsRef().front();
+      ScopedView<Iteration> iter_view = train_buffer->fetch_sample();
+      if (iter_view.isEmpty()) {
+        break;
+      }
+      auto &iteration = iter_view.get();
+      if (iteration.batch() != batch_size) {
+        /// this is partial batch scenario
+        continue;
       }
+      /// @todo multiple input support
+      in = iteration.getInputsRef().front();
+      label = iteration.getLabelsRef().front();
+
       forwarding(true);
       backwarding(iter++);
@@ -689,11 +672,7 @@ int NeuralNetwork::train_run() {
       training.loss += loss;
     }
 
-    if (train_buffer->getType() == "callback") {
-      future_bq.get();
-    } else {
-      future_iq.get();
-    }
+    future_iq.get();
 
     if (count == 0)
      throw std::runtime_error("No training data");
@@ -710,40 +689,23 @@ int NeuralNetwork::train_run() {
       int right = 0;
       validation.loss = 0.0f;
       unsigned int tcases = 0;
-      std::future<std::shared_ptr<BatchQueue>> future_bq;
-      std::future<std::shared_ptr<IterationQueue>> future_iq;
-
-      if (valid_buffer->getType() == "callback") {
-        future_bq = valid_buffer->startFetchWorker(in_dims, label_dims);
-      } else {
-        future_iq =
-          valid_buffer->startFetchWorker_sample(in_dims, label_dims, false);
-      }
+
+      std::future<std::shared_ptr<IterationQueue>> future_iq =
+        valid_buffer->startFetchWorker_sample(in_dims, label_dims, false);
 
       while (true) {
-        ScopedView<Iteration> iter_view(nullptr);
-        if (valid_buffer->getType() == "callback") {
-          auto [last, ins, labels] = *valid_buffer->fetch();
-          if (last) {
-            break;
-          }
-          /// @todo multiple input support
-          in = ins[0];
-          label = labels[0];
-        } else {
-          iter_view = valid_buffer->fetch_sample();
-          if (iter_view.isEmpty()) {
-            break;
-          }
-          auto &iter = iter_view.get();
-          if (iter.batch() != batch_size) {
-            /// this is partial batch scenario
-            continue;
-          }
-          /// @todo multiple input support
-          in = iter.getInputsRef().front();
-          label = iter.getLabelsRef().front();
+        ScopedView<Iteration> iter_view = valid_buffer->fetch_sample();
+        if (iter_view.isEmpty()) {
+          break;
         }
+        auto &iter = iter_view.get();
+        if (iter.batch() != batch_size) {
+          /// this is partial batch scenario
+          continue;
+        }
+        /// @todo multiple input support
+        in = iter.getInputsRef().front();
+        label = iter.getLabelsRef().front();
 
         forwarding(false);
         auto model_out = output.argmax();
@@ -756,11 +718,7 @@ int NeuralNetwork::train_run() {
         tcases++;
       }
 
-      if (valid_buffer->getType() == "callback") {
-        future_bq.get();
-      } else {
-        future_iq.get();
-      }
+      future_iq.get();
 
       if (tcases == 0) {
         ml_loge("Error : 0 test cases");
diff --git a/nntrainer/utils/base_properties.h b/nntrainer/utils/base_properties.h
index 7ef4d411..259f712c 100644
--- a/nntrainer/utils/base_properties.h
+++ b/nntrainer/utils/base_properties.h
@@ -350,7 +350,6 @@ template struct str_converter {
   static DataType from_string(const std::string &value) {
     std::stringstream ss(value);
     uintptr_t addr = static_cast<uintptr_t>(std::stoull(value, 0, 16));
-    std::cerr << "value: " << value << " addr: " << addr;
     return reinterpret_cast<DataType>(addr);
   }
 };
diff --git a/test/ccapi/unittest_ccapi.cpp b/test/ccapi/unittest_ccapi.cpp
index 19fdcdbb..7779fe63 100644
--- a/test/ccapi/unittest_ccapi.cpp
+++ b/test/ccapi/unittest_ccapi.cpp
@@ -291,14 +291,16 @@ TEST(nntrainer_ccapi, train_dataset_with_generator_01_p) {
                        "beta1=0.002", "beta2=0.001", "epsilon=1e-7"}));
   EXPECT_NO_THROW(model->setOptimizer(optimizer));
 
+  auto train_data = createTrainData();
+  auto valid_data = createValidData();
   EXPECT_NO_THROW(dataset = ml::train::createDataset(
-                    ml::train::DatasetType::GENERATOR, getBatch_train));
+                    ml::train::DatasetType::GENERATOR, getSample, &train_data));
   EXPECT_NO_THROW(dataset->setProperty({"buffer_size=100"}));
   EXPECT_EQ(model->setDataset(ml::train::DatasetModeType::MODE_TRAIN, dataset),
             ML_ERROR_NONE);
 
   EXPECT_NO_THROW(dataset = ml::train::createDataset(
-                    ml::train::DatasetType::GENERATOR, getBatch_val));
+                    ml::train::DatasetType::GENERATOR, getSample, &valid_data));
   EXPECT_NO_THROW(dataset->setProperty({"buffer_size=100"}));
   EXPECT_EQ(model->setDataset(ml::train::DatasetModeType::MODE_VALID, dataset),
             ML_ERROR_NONE);
@@ -309,8 +311,8 @@ TEST(nntrainer_ccapi, train_dataset_with_generator_01_p) {
   EXPECT_EQ(model->initialize(), ML_ERROR_NONE);
   EXPECT_NO_THROW(model->train());
 
-  EXPECT_NEAR(model->getTrainingLoss(), 2.2109976, tolerance);
-  EXPECT_NEAR(model->getValidationLoss(), 1.995334, tolerance);
+  EXPECT_NEAR(model->getTrainingLoss(), 2.238682, tolerance);
+  EXPECT_NEAR(model->getValidationLoss(), 2.0042247, tolerance);
 }
 
 /**
diff --git a/test/include/nntrainer_test_util.h b/test/include/nntrainer_test_util.h
index 00068c9b..4f4bec6c 100644
--- a/test/include/nntrainer_test_util.h
+++ b/test/include/nntrainer_test_util.h
@@ -137,25 +137,48 @@ void replaceString(const std::string &from, const std::string &to,
                    const std::string n, std::string str);
 
 /**
- * @brief      get data which size is batch for train
- * @param[out] outVec
- * @param[out] outLabel
- * @param[out] last if the data is finished
- * @param[in]  user_data private data for the callback
- * @retval     status for handling error
+ * @brief UserData which stores information used to feed data from data callback
+ *
+ */
+class DataInformation {
+public:
+  /**
+   * @brief Construct a new Data Information object
+   *
+   * @param num_samples number of data
+   * @param filename file name to read from
+   */
+  DataInformation(unsigned int num_samples, const std::string &filename);
+  unsigned int count;
+  unsigned int num_samples;
+  std::ifstream file;
+  std::vector<unsigned int> idxes;
+  std::mt19937 rng;
+};
+
+/**
+ * @brief Create a user data for training
+ *
+ * @return DataInformation
+ */
+DataInformation createTrainData();
+
+/**
+ * @brief Create a user data for validation
+ *
+ * @return DataInformation
 */
-int getBatch_train(float **outVec, float **outLabel, bool *last,
-                   void *user_data);
+DataInformation createValidData();
 
 /**
- * @brief      get data which size is batch for val
+ * @brief      get data which size is batch
 * @param[out] outVec
 * @param[out] outLabel
 * @param[out] last if the data is finished
 * @param[in]  user_data private data for the callback
 * @retval     status for handling error
 */
-int getBatch_val(float **outVec, float **outLabel, bool *last, void *user_data);
+int getSample(float **outVec, float **outLabel, bool *last, void *user_data);
 
 /**
  * @brief Get the Res Path object
diff --git a/test/nntrainer_test_util.cpp b/test/nntrainer_test_util.cpp
index 13479c6b..ae2664bf 100644
--- a/test/nntrainer_test_util.cpp
+++ b/test/nntrainer_test_util.cpp
@@ -33,10 +33,6 @@
 #define batch_size 16
 #define feature_size 62720
 
-static bool *duplicate;
-static bool *valduplicate;
-static bool alloc_train = false;
-static bool alloc_val = false;
 static std::mt19937 rng(0);
 
 /**
@@ -68,17 +64,6 @@ void replaceString(const std::string &from, const std::string &to,
   data_file.close();
 }
 
-/**
- * @brief     Generate Random integer value between min to max
- * @param[in] min : minimum value
- * @param[in] max : maximum value
- * @retval    min < random value < max
- */
-static int rangeRandom(int min, int max) {
-  std::uniform_int_distribution<int> dist(min, max);
-  return dist(rng);
-}
-
 /**
  * @brief     load data at specific position of file
 * @param[in] F ifstream (input file)
 * @param[out] outVec
 * @param[out] outLabel
@@ -87,8 +72,8 @@
 * @param[in] id th data to get
 * @retval true/false false : end of data
 */
-static bool getData(std::ifstream &F, std::vector<float> &outVec,
-                    std::vector<float> &outLabel, unsigned int id) {
+static bool getData(std::ifstream &F, float *outVec, float *outLabel,
+                    unsigned int id) {
   F.clear();
   F.seekg(0, std::ios_base::end);
   uint64_t file_length = F.tellg();
@@ -100,179 +85,69 @@ static bool getData(std::ifstream &F, std::vector<float> &outVec,
     return false;
   }
   F.seekg(position, std::ios::beg);
-  for (unsigned int i = 0; i < feature_size; i++)
-    F.read((char *)&outVec[i], sizeof(float));
-  for (unsigned int i = 0; i < num_class; i++)
-    F.read((char *)&outLabel[i], sizeof(float));
+  F.read((char *)outVec, sizeof(float) * feature_size);
+  F.read((char *)outLabel, sizeof(float) * num_class);
 
   return true;
 }
 
-/**
- * @brief      get data which size is batch for train
- * @param[out] outVec
- * @param[out] outLabel
- * @param[out] last if the data is finished
- * @param[in]  user_data private data for the callback
- * @retval     status for handling error
- */
-int getBatch_train(float **outVec, float **outLabel, bool *last,
-                   void *user_data) {
-  std::vector<int> memI;
-  std::vector<int> memJ;
-  unsigned int count = 0;
-  unsigned int data_size = 0;
-  *last = true;
-
-  std::string filename = getResPath("trainingSet.dat", {"test"});
-  std::ifstream F(filename, std::ios::in | std::ios::binary);
-
-  if (F.good()) {
-    F.seekg(0, std::ios::end);
-    long file_size = F.tellg();
-    data_size = static_cast<unsigned int>(
-      file_size / ((num_class + feature_size) * sizeof(float)));
-  }
-
-  if (!alloc_train) {
-    duplicate = (bool *)malloc(sizeof(bool) * data_size);
-    if (duplicate == nullptr) {
-      ml_loge("[test_util] allocationg memory failed");
-      alloc_train = false;
-      *last = false;
-      F.close();
-      return ML_ERROR_BAD_ADDRESS;
-    }
-
-    for (unsigned int i = 0; i < data_size; ++i) {
-      duplicate[i] = false;
-    }
-    alloc_train = true;
-  }
-
-  for (unsigned int i = 0; i < data_size; i++) {
-    if (!duplicate[i])
-      count++;
-  }
-
-  if (count < batch_size) {
-    free(duplicate);
-    alloc_train = false;
-    *last = true;
-    return ML_ERROR_NONE;
-  }
-
-  count = 0;
-  while (count < batch_size) {
-    int nomI = rangeRandom(0, data_size - 1);
-    if (!duplicate[nomI]) {
-      memI.push_back(nomI);
-      duplicate[nomI] = true;
-      count++;
-    }
+DataInformation::DataInformation(unsigned int num_samples,
+                                 const std::string &filename) :
+  count(0),
+  num_samples(num_samples),
+  file(filename, std::ios::in | std::ios::binary),
+  idxes(num_samples) {
+  std::iota(idxes.begin(), idxes.end(), 0);
+  std::shuffle(idxes.begin(), idxes.end(), rng);
+  rng.seed(0);
+  if (!file.good()) {
+    throw std::invalid_argument("given file is not good, filename: " +
+                                filename);
   }
+}
 
-  for (unsigned int i = 0; i < count; i++) {
-    std::vector<float> o;
-    std::vector<float> l;
-
-    o.resize(feature_size);
-    l.resize(num_class);
-
-    getData(F, o, l, memI[i]);
-
-    for (unsigned int j = 0; j < feature_size; ++j)
-      outVec[0][i * feature_size + j] = o[j];
-    for (unsigned int j = 0; j < num_class; ++j)
-      outLabel[0][i * num_class + j] = l[j];
-  }
+static auto getDataSize = [](const std::string &file_name) {
+  std::ifstream f(file_name, std::ios::in | std::ios::binary);
+  NNTR_THROW_IF(!f.good(), std::invalid_argument)
+    << "cannot find " << file_name;
+  f.seekg(0, std::ios::end);
+  long file_size = f.tellg();
+  return static_cast<unsigned int>(
+    file_size / ((num_class + feature_size) * sizeof(float)));
+};
+
+std::string train_filename = getResPath("trainingSet.dat", {"test"});
+std::string valid_filename = getResPath("trainingSet.dat", {"test"});
+
+DataInformation createTrainData() {
+  return DataInformation(getDataSize(train_filename), train_filename);
+}
 
-  F.close();
-  *last = false;
-  return ML_ERROR_NONE;
+DataInformation createValidData() {
+  return DataInformation(getDataSize(valid_filename), valid_filename);
 }
 
 /**
- * @brief      get data which size is batch for validation
+ * @brief      get data which size is batch for train
 * @param[out] outVec
 * @param[out] outLabel
 * @param[out] last if the data is finished
 * @param[in]  user_data private data for the callback
 * @retval     status for handling error
 */
-int getBatch_val(float **outVec, float **outLabel, bool *last,
-                 void *user_data) {
-
-  std::vector<int> memI;
-  std::vector<int> memJ;
-  unsigned int count = 0;
-  unsigned int data_size = 0;
-  *last = true;
-
-  std::string filename = getResPath("trainingSet.dat", {"test"});
-  std::ifstream F(filename, std::ios::in | std::ios::binary);
-
-  if (F.good()) {
-    F.seekg(0, std::ios::end);
-    long file_size = F.tellg();
-    data_size = static_cast<unsigned int>(
-      file_size / ((num_class + feature_size) * sizeof(float)));
-  }
+int getSample(float **outVec, float **outLabel, bool *last, void *user_data) {
+  auto data = reinterpret_cast<DataInformation *>(user_data);
 
-  if (!alloc_val) {
-    valduplicate = (bool *)malloc(sizeof(bool) * data_size);
-    if (valduplicate == nullptr) {
-      ml_loge("[test_util] allocationg memory failed");
-      alloc_val = false;
-      *last = false;
-      F.close();
-      return ML_ERROR_BAD_ADDRESS;
-    }
-    for (unsigned int i = 0; i < data_size; ++i) {
-      valduplicate[i] = false;
-    }
-    alloc_val = true;
-  }
-
-  for (unsigned int i = 0; i < data_size; i++) {
-    if (!valduplicate[i])
-      count++;
-  }
-
-  if (count < batch_size) {
-    free(valduplicate);
-    alloc_val = false;
+  getData(data->file, *outVec, *outLabel, data->idxes.at(data->count));
+  data->count++;
+  if (data->count < data->num_samples) {
+    *last = false;
+  } else {
     *last = true;
-    return ML_ERROR_NONE;
-  }
-
-  count = 0;
-  while (count < batch_size) {
-    int nomI = rangeRandom(0, data_size - 1);
-    if (!valduplicate[nomI]) {
-      memI.push_back(nomI);
-      valduplicate[nomI] = true;
-      count++;
-    }
-  }
-
-  for (unsigned int i = 0; i < count; i++) {
-    std::vector<float> o;
-    std::vector<float> l;
-
-    o.resize(feature_size);
-    l.resize(num_class);
-
-    getData(F, o, l, memI[i]);
-
-    for (unsigned int j = 0; j < feature_size; ++j)
-      outVec[0][i * feature_size + j] = o[j];
-    for (unsigned int j = 0; j < num_class; ++j)
-      outLabel[0][i * num_class + j] = l[j];
+    data->count = 0;
+    std::shuffle(data->idxes.begin(), data->idxes.end(), data->rng);
   }
 
-  F.close();
-  *last = false;
   return ML_ERROR_NONE;
 }
diff --git a/test/tizen_capi/unittest_tizen_capi.cpp b/test/tizen_capi/unittest_tizen_capi.cpp
index 151ce9da..0c8aa0d3 100644
--- a/test/tizen_capi/unittest_tizen_capi.cpp
+++ b/test/tizen_capi/unittest_tizen_capi.cpp
@@ -795,13 +795,24 @@ TEST(nntrainer_capi_nnmodel, train_with_generator_01_p) {
   status = ml_train_model_set_optimizer(model, optimizer);
   EXPECT_EQ(status, ML_ERROR_NONE);
 
-  status = ml_train_dataset_create_with_generator(&dataset, getBatch_train,
-                                                  getBatch_val, NULL);
+  auto train_data = createTrainData();
+  auto valid_data = createValidData();
+
+  status = ml_train_dataset_create_with_generator(&dataset, getSample,
+                                                  getSample, NULL);
   EXPECT_EQ(status, ML_ERROR_NONE);
 
   status = ml_train_dataset_set_property(dataset, "buffer_size=100", NULL);
   EXPECT_EQ(status, ML_ERROR_NONE);
 
+  status = ml_train_dataset_set_property_for_mode(
+    dataset, ML_TRAIN_DATASET_MODE_TRAIN, "user_data", &train_data, NULL);
+  EXPECT_EQ(status, ML_ERROR_NONE);
+
+  status = ml_train_dataset_set_property_for_mode(
+    dataset, ML_TRAIN_DATASET_MODE_VALID, "user_data", &valid_data, NULL);
+  EXPECT_EQ(status, ML_ERROR_NONE);
+
   status = ml_train_model_set_dataset(model, dataset);
   EXPECT_EQ(status, ML_ERROR_NONE);
@@ -813,7 +824,7 @@
   EXPECT_EQ(status, ML_ERROR_NONE);
 
   /** Compare training statistics */
-  nntrainer_capi_model_comp_metrics(model, 2.17921, 1.96506, 60.4167);
+  nntrainer_capi_model_comp_metrics(model, 2.2063899, 1.983489, 64.583297);
 
   status = ml_train_model_destroy(model);
   EXPECT_EQ(status, ML_ERROR_NONE);
 }
 
 static int constant_generator_cb(float **outVec, float **outLabel, bool *last,
                                  void *user_data) {
   static int count = 0;
-
-  unsigned int batch_size = 9;
   unsigned int feature_size = 100;
   unsigned int num_class = 10;
 
-  unsigned int data_size = batch_size * feature_size;
-  for (unsigned int i = 0; i < data_size; ++i) {
+  for (unsigned int i = 0; i < feature_size; ++i) {
     outVec[0][i] = 0.0f;
   }
 
-  for (unsigned int i = 0; i < batch_size; ++i) {
-    outLabel[0][i * num_class] = 1.0f;
-    for (unsigned int j = 1; j < num_class; ++j) {
-      outLabel[0][i * num_class + j] = 0.0f;
-    }
+  outLabel[0][0] = 1.0f;
+  for (unsigned int j = 1; j < num_class; ++j) {
+    outLabel[0][j] = 0.0f;
   }
 
-  if (count == 10) {
+  count++;
+  if (count == 9) {
     *last = true;
     count = 0;
   } else {
     *last = false;
-    count++;
   }
 
   return ML_ERROR_NONE;
diff --git a/test/tizen_capi/unittest_tizen_capi_dataset.cpp b/test/tizen_capi/unittest_tizen_capi_dataset.cpp
index 8821e026..90f55f1c 100644
--- a/test/tizen_capi/unittest_tizen_capi_dataset.cpp
+++ b/test/tizen_capi/unittest_tizen_capi_dataset.cpp
@@ -125,14 +125,14 @@ TEST(nntrainer_capi_dataset, create_destroy_06_n) {
 TEST(nntrainer_capi_dataset, create_destroy_07_p) {
   ml_train_dataset_h dataset;
   int status;
-  status = ml_train_dataset_create_with_generator(&dataset, getBatch_train,
-                                                  NULL, NULL);
+  status =
+    ml_train_dataset_create_with_generator(&dataset, getSample, NULL, NULL);
   EXPECT_EQ(status, ML_ERROR_NONE);
   status = ml_train_dataset_destroy(dataset);
   EXPECT_EQ(status, ML_ERROR_NONE);
 
-  status = ml_train_dataset_create_with_generator(&dataset, getBatch_train,
-                                                  getBatch_val, getBatch_val);
+  status = ml_train_dataset_create_with_generator(&dataset, getSample,
+                                                  getSample, getSample);
   EXPECT_EQ(status, ML_ERROR_NONE);
   status = ml_train_dataset_destroy(dataset);
   EXPECT_EQ(status, ML_ERROR_NONE);
 }
@@ -147,19 +147,19 @@ TEST(nntrainer_cpi_dataset, add_generator_01_p) {
   EXPECT_EQ(status, ML_ERROR_NONE);
 
   status = ml_train_dataset_add_generator(dataset, ML_TRAIN_DATASET_MODE_TRAIN,
-                                          getBatch_train, NULL);
+                                          getSample, NULL);
   EXPECT_EQ(status, ML_ERROR_NONE);
 
   status = ml_train_dataset_add_generator(dataset, ML_TRAIN_DATASET_MODE_TRAIN,
-                                          getBatch_val, NULL);
+                                          getSample, NULL);
   EXPECT_EQ(status, ML_ERROR_NONE);
 
   status = ml_train_dataset_add_generator(dataset, ML_TRAIN_DATASET_MODE_VALID,
-                                          getBatch_train, NULL);
+                                          getSample, NULL);
   EXPECT_EQ(status, ML_ERROR_NONE);
 
   status = ml_train_dataset_add_generator(dataset, ML_TRAIN_DATASET_MODE_TEST,
-                                          getBatch_train, NULL);
+                                          getSample, NULL);
   EXPECT_EQ(status, ML_ERROR_NONE);
 
   status = ml_train_dataset_destroy(dataset);
@@ -267,8 +267,8 @@ TEST(nntrainer_capi_dataset, set_dataset_property_01_p) {
   ml_train_dataset_h dataset;
   int status;
 
-  status = ml_train_dataset_create_with_generator(&dataset, getBatch_train,
-                                                  NULL, NULL);
+  status =
+    ml_train_dataset_create_with_generator(&dataset, getSample, NULL, NULL);
   EXPECT_EQ(status, ML_ERROR_NONE);
 
   status = ml_train_dataset_set_property(dataset, "buffer_size=10", NULL);
@@ -303,8 +303,8 @@ TEST(nntrainer_capi_dataset, set_dataset_property_03_n) {
   ml_train_dataset_h dataset;
   int status;
 
-  status = ml_train_dataset_create_with_generator(&dataset, getBatch_train,
-                                                  NULL, NULL);
+  status =
+    ml_train_dataset_create_with_generator(&dataset, getSample, NULL, NULL);
   EXPECT_EQ(status, ML_ERROR_NONE);
 
   status = ml_train_dataset_set_property(dataset, "user_data=10", NULL);
@@ -324,8 +324,8 @@ TEST(nntrainer_capi_dataset, set_dataset_property_04_p) {
   ml_train_dataset_h dataset;
   int status = ML_ERROR_NONE;
 
-  status = ml_train_dataset_create_with_generator(&dataset, getBatch_train,
-                                                  NULL, NULL);
+  status =
+    ml_train_dataset_create_with_generator(&dataset, getSample, NULL, NULL);
  EXPECT_EQ(status, ML_ERROR_NONE);
 
   status =
@@ -350,20 +350,22 @@ TEST(nntrainer_capi_dataset, set_dataset_property_for_mode_01_p) {
     dataset, ML_TRAIN_DATASET_MODE_TRAIN, "buffer_size=1", NULL);
   EXPECT_EQ(status, ML_ERROR_INVALID_PARAMETER);
 
+  auto train_data = createTrainData();
+  auto valid_data = createValidData();
   status = ml_train_dataset_add_generator(dataset, ML_TRAIN_DATASET_MODE_TRAIN,
-                                          getBatch_val, nullptr);
+                                          getSample, &train_data);
   status = ml_train_dataset_set_property_for_mode(
     dataset, ML_TRAIN_DATASET_MODE_TRAIN, "buffer_size=1", NULL);
   EXPECT_EQ(status, ML_ERROR_NONE);
 
   status = ml_train_dataset_add_generator(dataset, ML_TRAIN_DATASET_MODE_VALID,
-                                          getBatch_val, nullptr);
+                                          getSample, &valid_data);
   status = ml_train_dataset_set_property_for_mode(
     dataset, ML_TRAIN_DATASET_MODE_VALID, "buffer_size=1", NULL);
   EXPECT_EQ(status, ML_ERROR_NONE);
 
   status = ml_train_dataset_add_generator(dataset, ML_TRAIN_DATASET_MODE_TEST,
-                                          getBatch_val, nullptr);
+                                          getSample, &train_data);
   status = ml_train_dataset_set_property_for_mode(
     dataset, ML_TRAIN_DATASET_MODE_TEST, "buffer_size=1", NULL);
   EXPECT_EQ(status, ML_ERROR_NONE);
@@ -379,9 +381,8 @@ TEST(nntrainer_capi_dataset,
      set_dataset_property_for_mode_does_not_exist_valid_n) {
   ml_train_dataset_h dataset;
   int status = ML_ERROR_NONE;
-
-  status = ml_train_dataset_create_with_generator(&dataset, getBatch_train,
-                                                  nullptr, getBatch_train);
+  status = ml_train_dataset_create_with_generator(&dataset, getSample, nullptr,
+                                                  getSample);
   EXPECT_EQ(status, ML_ERROR_NONE);
 
   status = ml_train_dataset_set_property_for_mode(
@@ -400,8 +401,8 @@ TEST(nntrainer_capi_dataset,
   ml_train_dataset_h dataset;
   int status = ML_ERROR_NONE;
 
-  status = ml_train_dataset_create_with_generator(&dataset, getBatch_train,
-                                                  nullptr, nullptr);
+  status = ml_train_dataset_create_with_generator(&dataset, getSample, nullptr,
+                                                  nullptr);
   EXPECT_EQ(status, ML_ERROR_NONE);
 
   status = ml_train_dataset_set_property_for_mode(