From: Marcin Kaminski Date: Fri, 28 Jan 2022 20:18:41 +0000 (+0100) Subject: [ML][Training] Thread-safe locks X-Git-Tag: submit/tizen/20220223.080409~6 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=57242977c5e8f929bd125e236a781b9bbc52753d;p=platform%2Fcore%2Fapi%2Fwebapi-plugins.git [ML][Training] Thread-safe locks Changes: - mutex added to model object to lock single model when compiling or in training - mutex added for synchronizing models map operations between dispose() and run() Change-Id: I56f80d5beb9f0d93f346d64849bea5aa583e506d --- diff --git a/src/ml/ml_trainer_manager.cc b/src/ml/ml_trainer_manager.cc index 00795d51..2f3fcafa 100644 --- a/src/ml/ml_trainer_manager.cc +++ b/src/ml/ml_trainer_manager.cc @@ -79,6 +79,7 @@ PlatformResult TrainerManager::ModelCompile(int id, } auto& model = models_[id]; + std::lock_guard modelLock(model->instanceLock); std::stringstream ss; for (const auto& opt : options) { @@ -124,12 +125,19 @@ PlatformResult TrainerManager::ModelRun(int id, const picojson::object& options) { ScopeLogger(); + // lock the models_ map operations to avoid conflicting with disposal function + models_map_lock_.lock(); if (models_.find(id) == models_.end()) { LoggerE("Could not find model with id: %d", id); + models_map_lock_.unlock(); return PlatformResult(ErrorCode::ABORT_ERR, "Could not find model"); } auto& model = models_[id]; + std::lock_guard modelLock(model->instanceLock); + // model instance is securely locked for other operations + // so it's safe to unlock map now + models_map_lock_.unlock(); if (!model->isCompiled()) { LoggerE("Trying to train model that is not compiled"); @@ -182,16 +190,25 @@ PlatformResult TrainerManager::ModelAddLayer(int id, int layerId) { return PlatformResult(ErrorCode::ABORT_ERR, "Could not find model"); } + auto& model = models_[id]; + bool available = model->instanceLock.try_lock(); + if (!available) { + LoggerE("Model locked - probaly training in progress"); + return PlatformResult(ErrorCode::NO_MODIFICATION_ALLOWED_ERR, + "Model training in progress - cannot modify"); + } + if (layers_.find(layerId) == layers_.end()) { LoggerE("Could not find layer with id: %d", id); + model->instanceLock.unlock(); return PlatformResult(ErrorCode::ABORT_ERR, "Could not find layer"); } - auto& model = models_[id]; auto& layer = layers_[layerId]; if (model->isCompiled()) { LoggerE("Modification of compiled model"); + model->instanceLock.unlock(); return PlatformResult(ErrorCode::INVALID_STATE_ERR, "Modification of compiled model not allowed"); } @@ -201,12 +218,14 @@ PlatformResult TrainerManager::ModelAddLayer(int id, int layerId) { if (ret_val != ML_ERROR_NONE) { LoggerE("Could not add layer to model: %d (%s)", ret_val, ml_strerror(ret_val)); + model->instanceLock.unlock(); return PlatformResult(ErrorCode::ABORT_ERR, ml_strerror(ret_val)); } model->layerIndices.push_back(layerId); layer->setAttached(true); + model->instanceLock.unlock(); return PlatformResult(); } @@ -218,15 +237,25 @@ PlatformResult TrainerManager::ModelSetOptimizer(int id, int optimizerId) { return PlatformResult(ErrorCode::ABORT_ERR, "Could not find model"); } + auto& model = models_[id]; + bool available = model->instanceLock.try_lock(); + if (!available) { + LoggerE("Model locked - probaly training in progress"); + return PlatformResult(ErrorCode::NO_MODIFICATION_ALLOWED_ERR, + "Model training in progress - cannot modify"); + } + if (optimizers_.find(optimizerId) == optimizers_.end()) { LoggerE("Could not find optimizer with id: %d", id); + model->instanceLock.unlock(); return PlatformResult(ErrorCode::ABORT_ERR, "Could not find optimizer"); } - auto& model = models_[id]; auto& optimizer = optimizers_[optimizerId]; + if (model->isCompiled()) { LoggerE("Modification of compiled model"); + model->instanceLock.unlock(); return PlatformResult(ErrorCode::INVALID_STATE_ERR, "Modification of compiled model not allowed"); } @@ -236,6 +265,7 @@ PlatformResult TrainerManager::ModelSetOptimizer(int id, int optimizerId) { if (ret_val != ML_ERROR_NONE) { LoggerE("Could not set optimizer for model: %d (%s)", ret_val, ml_strerror(ret_val)); + model->instanceLock.unlock(); return PlatformResult(ErrorCode::ABORT_ERR, ml_strerror(ret_val)); } @@ -255,6 +285,7 @@ PlatformResult TrainerManager::ModelSetOptimizer(int id, int optimizerId) { model->optimizerIndex = optimizerId; optimizer->setAttached(true); + model->instanceLock.unlock(); return PlatformResult(); } @@ -266,16 +297,25 @@ PlatformResult TrainerManager::ModelSetDataset(int id, int datasetId) { return PlatformResult(ErrorCode::ABORT_ERR, "Could not find model"); } + auto& model = models_[id]; + bool available = model->instanceLock.try_lock(); + if (!available) { + LoggerE("Model locked - probaly training in progress"); + return PlatformResult(ErrorCode::NO_MODIFICATION_ALLOWED_ERR, + "Model training in progress - cannot modify"); + } + if (datasets_.find(datasetId) == datasets_.end()) { LoggerE("Could not find dataset with id: %d", id); + model->instanceLock.unlock(); return PlatformResult(ErrorCode::ABORT_ERR, "Could not find dataset"); } - auto& model = models_[id]; auto& dataset = datasets_[datasetId]; if (model->isCompiled()) { LoggerE("Modification of compiled model"); + model->instanceLock.unlock(); return PlatformResult(ErrorCode::INVALID_STATE_ERR, "Modification of compiled model not allowed"); } @@ -285,6 +325,7 @@ PlatformResult TrainerManager::ModelSetDataset(int id, int datasetId) { if (ret_val != ML_ERROR_NONE) { LoggerE("Could not set dataset for model: %d (%s)", ret_val, ml_strerror(ret_val)); + model->instanceLock.unlock(); return PlatformResult(ErrorCode::ABORT_ERR, ml_strerror(ret_val)); } @@ -303,6 +344,7 @@ PlatformResult TrainerManager::ModelSetDataset(int id, int datasetId) { model->datasetIndex = datasetId; dataset->setAttached(true); + model->instanceLock.unlock(); return PlatformResult(); } @@ -317,6 +359,8 @@ PlatformResult TrainerManager::ModelSummarize(int id, } auto& model = models_[id]; + std::lock_guard modelLock(model->instanceLock); + char* tmpSummary = NULL; int ret_val = @@ -345,6 +389,12 @@ PlatformResult TrainerManager::ModelSave(int id, } auto& model = models_[id]; + bool available = model->instanceLock.try_lock(); + if (!available) { + LoggerE("Model locked - probaly training in progress"); + return PlatformResult(ErrorCode::NO_MODIFICATION_ALLOWED_ERR, + "Model training in progress - cannot save"); + } auto tmpString = path; if (tmpString.substr(0, FILE_PATH_PREFIX.length()) == FILE_PATH_PREFIX) { @@ -356,6 +406,8 @@ PlatformResult TrainerManager::ModelSave(int id, int ret_val = ml_train_model_save(model->getNative(), tmpString.c_str(), format); + model->instanceLock.unlock(); + if (ret_val != ML_ERROR_NONE) { LoggerE("Could not model to file: %d (%s)", ret_val, ml_strerror(ret_val)); return PlatformResult(ErrorCode::ABORT_ERR, ml_strerror(ret_val)); @@ -367,6 +419,9 @@ PlatformResult TrainerManager::ModelSave(int id, PlatformResult TrainerManager::ModelDispose(int id) { ScopeLogger(); + // lock the models_ map operations to avoid conflicting with model training + std::lock_guard model_lock(models_map_lock_); + if (models_.find(id) == models_.end()) { LoggerE("Could not find model with id: %d", id); return PlatformResult(ErrorCode::NOT_FOUND_ERR, "Could not find model"); @@ -374,12 +429,23 @@ PlatformResult TrainerManager::ModelDispose(int id) { auto model = models_[id]; + bool available = model->instanceLock.try_lock(); + if (!available) { + LoggerE("Could not lock model for disposal - probaly training in progress"); + return PlatformResult(ErrorCode::NO_MODIFICATION_ALLOWED_ERR, + "Model training in progress - disposal not allowed"); + } + int ret_val = ml_train_model_destroy(model->getNative()); if (ret_val != ML_ERROR_NONE) { LoggerE("Could not destroy model: %d (%s)", ret_val, ml_strerror(ret_val)); + model->instanceLock.unlock(); return PlatformResult(ErrorCode::ABORT_ERR, ml_strerror(ret_val)); } + // erase model from map and use a shared pointer for related objects removal + models_.erase(id); + // When model is destroyed by ml_train_model_destroy() then all attached // handles (layers, optimizer, dataset) are also destroyed. This means that // after Model disposal all related objects in JS/C++ layer become invalid. @@ -389,16 +455,18 @@ PlatformResult TrainerManager::ModelDispose(int id) { LoggerD("Deleting attached optimizer: %d", model->optimizerIndex); optimizers_.erase(model->optimizerIndex); } + if (model->datasetIndex) { LoggerD("Deleting attached dataset: %d", model->datasetIndex); datasets_.erase(model->datasetIndex); } + for (auto const& ls : model->layerIndices) { LoggerD("Deleting attached layer: %d", ls); layers_.erase(ls); } - models_.erase(id); + model->instanceLock.unlock(); return PlatformResult(); } @@ -417,10 +485,12 @@ PlatformResult TrainerManager::CreateLayer(int& id, layers_[next_layer_id_] = std::make_shared>(n_layer); id = next_layer_id_++; + return PlatformResult(); } -PlatformResult TrainerManager::LayerSetProperty(int id, const std::string& name, +PlatformResult TrainerManager::LayerSetProperty(int id, + const std::string& name, const std::string& value) { ScopeLogger("id: %d, name: %s, value: %s", id, name.c_str(), value.c_str()); @@ -430,6 +500,7 @@ PlatformResult TrainerManager::LayerSetProperty(int id, const std::string& name, } auto layer = layers_[id]; + std::string opt = name + "=" + value; int ret_val = @@ -487,6 +558,7 @@ PlatformResult TrainerManager::CreateOptimizer(int& id, optimizers_[next_optimizer_id_] = std::make_shared>(n_optimizer); id = next_optimizer_id_++; + return PlatformResult(); } @@ -501,6 +573,7 @@ PlatformResult TrainerManager::OptimizerSetProperty(int id, } auto optimizer = optimizers_[id]; + std::string opt = name + "=" + value; int ret_val = ml_train_optimizer_set_property(optimizer->getNative(), opt.c_str(), NULL); @@ -542,7 +615,8 @@ PlatformResult TrainerManager::OptimizerDispose(int id) { return PlatformResult(); } -PlatformResult TrainerManager::CreateFileDataset(int& id, const std::string train_file, +PlatformResult TrainerManager::CreateFileDataset(int& id, + const std::string train_file, const std::string valid_file, const std::string test_file) { ScopeLogger(); @@ -607,6 +681,7 @@ PlatformResult TrainerManager::CreateFileDataset(int& id, const std::string trai datasets_[next_dataset_id_] = std::make_shared>(n_dataset); id = next_dataset_id_++; + return PlatformResult(); } @@ -625,6 +700,7 @@ PlatformResult TrainerManager::DatasetSetProperty( } auto dataset = datasets_[id]; + std::string opt = name + "=" + value; int ret_val = ml_train_dataset_set_property_for_mode(dataset->getNative(), diff --git a/src/ml/ml_trainer_manager.h b/src/ml/ml_trainer_manager.h index 4e38c5a6..4ed91b84 100644 --- a/src/ml/ml_trainer_manager.h +++ b/src/ml/ml_trainer_manager.h @@ -17,6 +17,8 @@ #ifndef ML_ML_TRAINER_MANAGER_H_ #define ML_ML_TRAINER_MANAGER_H_ +#include + #include #include "common/platform_result.h" @@ -78,6 +80,11 @@ class TrainerManager { std::map>> optimizers_; std::map>> layers_; std::map>> datasets_; + + // mutex for thread synchronization is needed only for model as only + // the model operations can be run in separate threads (model training to be + // precise) + std::mutex models_map_lock_; }; } // namespace ml diff --git a/src/ml/ml_trainer_objects.h b/src/ml/ml_trainer_objects.h index e826369e..a506c9bb 100644 --- a/src/ml/ml_trainer_objects.h +++ b/src/ml/ml_trainer_objects.h @@ -23,6 +23,7 @@ then the C library explodes. **/ +#include #include #include @@ -59,6 +60,9 @@ class Model { int datasetIndex; int optimizerIndex; std::vector layerIndices; + // additional mutex for locking single model in different operations + // (like run(), compile() or dispose()) without locking the full models' map + std::mutex instanceLock; }; template