[ML][Training] Thread-safe locks

author Marcin Kaminski <marcin.ka@partner.samsung.com>

Fri, 28 Jan 2022 20:18:41 +0000 (21:18 +0100)

committer Marcin Kaminski <marcin.ka@partner.samsung.com>

Wed, 16 Feb 2022 19:56:30 +0000 (20:56 +0100)
author Marcin Kaminski <marcin.ka@partner.samsung.com>
Fri, 28 Jan 2022 20:18:41 +0000 (21:18 +0100)
committer Marcin Kaminski <marcin.ka@partner.samsung.com>
Wed, 16 Feb 2022 19:56:30 +0000 (20:56 +0100)
diff --git a/src/ml/ml_trainer_manager.cc b/src/ml/ml_trainer_manager.cc

index 00795d5..2f3fcaf 100644 (file)
--- a/src/ml/ml_trainer_manager.cc
+++ b/src/ml/ml_trainer_manager.cc
@@ -79,6 +79,7 @@ PlatformResult TrainerManager::ModelCompile(int id,
    }
  
    auto& model = models_[id];
+  std::lock_guard<std::mutex> modelLock(model->instanceLock);
  
    std::stringstream ss;
    for (const auto& opt : options) {
@@ -124,12 +125,19 @@ PlatformResult TrainerManager::ModelRun(int id,
                                          const picojson::object& options) {
    ScopeLogger();
  
+  // lock the models_ map operations to avoid conflicting with disposal function
+  models_map_lock_.lock();
    if (models_.find(id) == models_.end()) {
      LoggerE("Could not find model with id: %d", id);
+    models_map_lock_.unlock();
      return PlatformResult(ErrorCode::ABORT_ERR, "Could not find model");
    }
  
    auto& model = models_[id];
+  std::lock_guard<std::mutex> modelLock(model->instanceLock);
+  // model instance is securely locked for other operations
+  // so it's safe to unlock map now
+  models_map_lock_.unlock();
  
    if (!model->isCompiled()) {
      LoggerE("Trying to train model that is not compiled");
@@ -182,16 +190,25 @@ PlatformResult TrainerManager::ModelAddLayer(int id, int layerId) {
      return PlatformResult(ErrorCode::ABORT_ERR, "Could not find model");
    }
  
+  auto& model = models_[id];
+  bool available = model->instanceLock.try_lock();
+  if (!available) {
+    LoggerE("Model locked - probaly training in progress");
+    return PlatformResult(ErrorCode::NO_MODIFICATION_ALLOWED_ERR,
+                          "Model training in progress - cannot modify");
+  }
+
    if (layers_.find(layerId) == layers_.end()) {
      LoggerE("Could not find layer with id: %d", id);
+    model->instanceLock.unlock();
      return PlatformResult(ErrorCode::ABORT_ERR, "Could not find layer");
    }
  
-  auto& model = models_[id];
    auto& layer = layers_[layerId];
  
    if (model->isCompiled()) {
      LoggerE("Modification of compiled model");
+    model->instanceLock.unlock();
      return PlatformResult(ErrorCode::INVALID_STATE_ERR,
                            "Modification of compiled model not allowed");
    }
@@ -201,12 +218,14 @@ PlatformResult TrainerManager::ModelAddLayer(int id, int layerId) {
    if (ret_val != ML_ERROR_NONE) {
      LoggerE("Could not add layer to model: %d (%s)", ret_val,
              ml_strerror(ret_val));
+    model->instanceLock.unlock();
      return PlatformResult(ErrorCode::ABORT_ERR, ml_strerror(ret_val));
    }
  
    model->layerIndices.push_back(layerId);
    layer->setAttached(true);
  
+  model->instanceLock.unlock();
    return PlatformResult();
  }
  
@@ -218,15 +237,25 @@ PlatformResult TrainerManager::ModelSetOptimizer(int id, int optimizerId) {
      return PlatformResult(ErrorCode::ABORT_ERR, "Could not find model");
    }
  
+  auto& model = models_[id];
+  bool available = model->instanceLock.try_lock();
+  if (!available) {
+    LoggerE("Model locked - probaly training in progress");
+    return PlatformResult(ErrorCode::NO_MODIFICATION_ALLOWED_ERR,
+                          "Model training in progress - cannot modify");
+  }
+
    if (optimizers_.find(optimizerId) == optimizers_.end()) {
      LoggerE("Could not find optimizer with id: %d", id);
+    model->instanceLock.unlock();
      return PlatformResult(ErrorCode::ABORT_ERR, "Could not find optimizer");
    }
  
-  auto& model = models_[id];
    auto& optimizer = optimizers_[optimizerId];
+
    if (model->isCompiled()) {
      LoggerE("Modification of compiled model");
+    model->instanceLock.unlock();
      return PlatformResult(ErrorCode::INVALID_STATE_ERR,
                            "Modification of compiled model not allowed");
    }
@@ -236,6 +265,7 @@ PlatformResult TrainerManager::ModelSetOptimizer(int id, int optimizerId) {
    if (ret_val != ML_ERROR_NONE) {
      LoggerE("Could not set optimizer for model: %d (%s)", ret_val,
              ml_strerror(ret_val));
+    model->instanceLock.unlock();
      return PlatformResult(ErrorCode::ABORT_ERR, ml_strerror(ret_val));
    }
  
@@ -255,6 +285,7 @@ PlatformResult TrainerManager::ModelSetOptimizer(int id, int optimizerId) {
    model->optimizerIndex = optimizerId;
    optimizer->setAttached(true);
  
+  model->instanceLock.unlock();
    return PlatformResult();
  }
  
@@ -266,16 +297,25 @@ PlatformResult TrainerManager::ModelSetDataset(int id, int datasetId) {
      return PlatformResult(ErrorCode::ABORT_ERR, "Could not find model");
    }
  
+  auto& model = models_[id];
+  bool available = model->instanceLock.try_lock();
+  if (!available) {
+    LoggerE("Model locked - probaly training in progress");
+    return PlatformResult(ErrorCode::NO_MODIFICATION_ALLOWED_ERR,
+                          "Model training in progress - cannot modify");
+  }
+
    if (datasets_.find(datasetId) == datasets_.end()) {
      LoggerE("Could not find dataset with id: %d", id);
+    model->instanceLock.unlock();
      return PlatformResult(ErrorCode::ABORT_ERR, "Could not find dataset");
    }
  
-  auto& model = models_[id];
    auto& dataset = datasets_[datasetId];
  
    if (model->isCompiled()) {
      LoggerE("Modification of compiled model");
+    model->instanceLock.unlock();
      return PlatformResult(ErrorCode::INVALID_STATE_ERR,
                            "Modification of compiled model not allowed");
    }
@@ -285,6 +325,7 @@ PlatformResult TrainerManager::ModelSetDataset(int id, int datasetId) {
    if (ret_val != ML_ERROR_NONE) {
      LoggerE("Could not set dataset for model: %d (%s)", ret_val,
              ml_strerror(ret_val));
+    model->instanceLock.unlock();
      return PlatformResult(ErrorCode::ABORT_ERR, ml_strerror(ret_val));
    }
  
@@ -303,6 +344,7 @@ PlatformResult TrainerManager::ModelSetDataset(int id, int datasetId) {
    model->datasetIndex = datasetId;
    dataset->setAttached(true);
  
+  model->instanceLock.unlock();
    return PlatformResult();
  }
  
@@ -317,6 +359,8 @@ PlatformResult TrainerManager::ModelSummarize(int id,
    }
  
    auto& model = models_[id];
+  std::lock_guard<std::mutex> modelLock(model->instanceLock);
+
    char* tmpSummary = NULL;
  
    int ret_val =
@@ -345,6 +389,12 @@ PlatformResult TrainerManager::ModelSave(int id,
    }
  
    auto& model = models_[id];
+  bool available = model->instanceLock.try_lock();
+  if (!available) {
+    LoggerE("Model locked - probaly training in progress");
+    return PlatformResult(ErrorCode::NO_MODIFICATION_ALLOWED_ERR,
+                          "Model training in progress - cannot save");
+  }
  
    auto tmpString = path;
    if (tmpString.substr(0, FILE_PATH_PREFIX.length()) == FILE_PATH_PREFIX) {
@@ -356,6 +406,8 @@ PlatformResult TrainerManager::ModelSave(int id,
    int ret_val =
        ml_train_model_save(model->getNative(), tmpString.c_str(), format);
  
+  model->instanceLock.unlock();
+
    if (ret_val != ML_ERROR_NONE) {
      LoggerE("Could not model to file: %d (%s)", ret_val, ml_strerror(ret_val));
      return PlatformResult(ErrorCode::ABORT_ERR, ml_strerror(ret_val));
@@ -367,6 +419,9 @@ PlatformResult TrainerManager::ModelSave(int id,
  PlatformResult TrainerManager::ModelDispose(int id) {
    ScopeLogger();
  
+  // lock the models_ map operations to avoid conflicting with model training
+  std::lock_guard<std::mutex> model_lock(models_map_lock_);
+
    if (models_.find(id) == models_.end()) {
      LoggerE("Could not find model with id: %d", id);
      return PlatformResult(ErrorCode::NOT_FOUND_ERR, "Could not find model");
@@ -374,12 +429,23 @@ PlatformResult TrainerManager::ModelDispose(int id) {
  
    auto model = models_[id];
  
+  bool available = model->instanceLock.try_lock();
+  if (!available) {
+    LoggerE("Could not lock model for disposal - probaly training in progress");
+    return PlatformResult(ErrorCode::NO_MODIFICATION_ALLOWED_ERR,
+                          "Model training in progress - disposal not allowed");
+  }
+
    int ret_val = ml_train_model_destroy(model->getNative());
    if (ret_val != ML_ERROR_NONE) {
      LoggerE("Could not destroy model: %d (%s)", ret_val, ml_strerror(ret_val));
+    model->instanceLock.unlock();
      return PlatformResult(ErrorCode::ABORT_ERR, ml_strerror(ret_val));
    }
  
+  // erase model from map and use a shared pointer for related objects removal
+  models_.erase(id);
+
    // When model is destroyed by ml_train_model_destroy() then all attached
    // handles (layers, optimizer, dataset) are also destroyed. This means that
    // after Model disposal all related objects in JS/C++ layer become invalid.
@@ -389,16 +455,18 @@ PlatformResult TrainerManager::ModelDispose(int id) {
      LoggerD("Deleting attached optimizer: %d", model->optimizerIndex);
      optimizers_.erase(model->optimizerIndex);
    }
+
    if (model->datasetIndex) {
      LoggerD("Deleting attached dataset: %d", model->datasetIndex);
      datasets_.erase(model->datasetIndex);
    }
+
    for (auto const& ls : model->layerIndices) {
      LoggerD("Deleting attached layer: %d", ls);
      layers_.erase(ls);
    }
-  models_.erase(id);
  
+  model->instanceLock.unlock();
    return PlatformResult();
  }
  
@@ -417,10 +485,12 @@ PlatformResult TrainerManager::CreateLayer(int& id,
    layers_[next_layer_id_] =
        std::make_shared<NativeWrapper<ml_train_layer_h>>(n_layer);
    id = next_layer_id_++;
+
    return PlatformResult();
  }
  
-PlatformResult TrainerManager::LayerSetProperty(int id, const std::string& name,
+PlatformResult TrainerManager::LayerSetProperty(int id,
+                                                const std::string& name,
                                                  const std::string& value) {
    ScopeLogger("id: %d, name: %s, value: %s", id, name.c_str(), value.c_str());
  
@@ -430,6 +500,7 @@ PlatformResult TrainerManager::LayerSetProperty(int id, const std::string& name,
    }
  
    auto layer = layers_[id];
+
    std::string opt = name + "=" + value;
  
    int ret_val =
@@ -487,6 +558,7 @@ PlatformResult TrainerManager::CreateOptimizer(int& id,
    optimizers_[next_optimizer_id_] =
        std::make_shared<NativeWrapper<ml_train_optimizer_h>>(n_optimizer);
    id = next_optimizer_id_++;
+
    return PlatformResult();
  }
  
@@ -501,6 +573,7 @@ PlatformResult TrainerManager::OptimizerSetProperty(int id,
    }
  
    auto optimizer = optimizers_[id];
+
    std::string opt = name + "=" + value;
    int ret_val = ml_train_optimizer_set_property(optimizer->getNative(),
                                                  opt.c_str(), NULL);
@@ -542,7 +615,8 @@ PlatformResult TrainerManager::OptimizerDispose(int id) {
    return PlatformResult();
  }
  
-PlatformResult TrainerManager::CreateFileDataset(int& id, const std::string train_file,
+PlatformResult TrainerManager::CreateFileDataset(int& id,
+                                                 const std::string train_file,
                                                   const std::string valid_file,
                                                   const std::string test_file) {
    ScopeLogger();
@@ -607,6 +681,7 @@ PlatformResult TrainerManager::CreateFileDataset(int& id, const std::string trai
    datasets_[next_dataset_id_] =
        std::make_shared<NativeWrapper<ml_train_dataset_h>>(n_dataset);
    id = next_dataset_id_++;
+
    return PlatformResult();
  }
  
@@ -625,6 +700,7 @@ PlatformResult TrainerManager::DatasetSetProperty(
    }
  
    auto dataset = datasets_[id];
+
    std::string opt = name + "=" + value;
  
    int ret_val = ml_train_dataset_set_property_for_mode(dataset->getNative(),
diff --git a/src/ml/ml_trainer_manager.h b/src/ml/ml_trainer_manager.h

index 4e38c5a..4ed91b8 100644 (file)
--- a/src/ml/ml_trainer_manager.h
+++ b/src/ml/ml_trainer_manager.h
@@ -17,6 +17,8 @@
  #ifndef ML_ML_TRAINER_MANAGER_H_
  #define ML_ML_TRAINER_MANAGER_H_
  
+#include <mutex>
+
  #include <nntrainer/nntrainer.h>
  
  #include "common/platform_result.h"
@@ -78,6 +80,11 @@ class TrainerManager {
    std::map<int, std::shared_ptr<NativeWrapper<ml_train_optimizer_h>>> optimizers_;
    std::map<int, std::shared_ptr<NativeWrapper<ml_train_layer_h>>> layers_;
    std::map<int, std::shared_ptr<NativeWrapper<ml_train_dataset_h>>> datasets_;
+
+  // mutex for thread synchronization is needed only for model as only
+  // the model operations can be run in separate threads (model training to be
+  // precise)
+  std::mutex models_map_lock_;
  };
  
  }  // namespace ml
diff --git a/src/ml/ml_trainer_objects.h b/src/ml/ml_trainer_objects.h

index e826369..a506c9b 100644 (file)
--- a/src/ml/ml_trainer_objects.h
+++ b/src/ml/ml_trainer_objects.h
@@ -23,6 +23,7 @@ then the C library explodes.
  
  **/
  
+#include <mutex>
  #include <vector>
  
  #include <nntrainer/nntrainer.h>
@@ -59,6 +60,9 @@ class Model {
    int datasetIndex;
    int optimizerIndex;
    std::vector<int> layerIndices;
+  // additional mutex for locking single model in different operations
+  // (like run(), compile() or dispose()) without locking the full models' map
+  std::mutex instanceLock;
  };
  
  template <class T>
author	Marcin Kaminski <marcin.ka@partner.samsung.com>
	Fri, 28 Jan 2022 20:18:41 +0000 (21:18 +0100)
committer	Marcin Kaminski <marcin.ka@partner.samsung.com>
	Wed, 16 Feb 2022 19:56:30 +0000 (20:56 +0100)
src/ml/ml_trainer_manager.cc		patch \| blob \| history
src/ml/ml_trainer_manager.h		patch \| blob \| history
src/ml/ml_trainer_objects.h		patch \| blob \| history