* This ensures that the operations are executed in the order of their
* listing.
*/
- typedef std::tuple<unsigned int, unsigned int, unsigned int, unsigned int> ExecutionOrder;
+ typedef std::tuple<unsigned int, unsigned int, unsigned int, unsigned int>
+ ExecutionOrder;
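+ // The four entries correspond to the forwarding, calc-gradient,
+ // calc-derivative and apply-gradient orders, e.g. as they are unpacked in
+ // Manager::requestWeights():
+ //   const auto [forwarding_order, calcGradient_order, calcDerivative_order,
+ //               applyGradient_order] = node.getExecutionOrder();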
/**
* @brief Destructor of Layer Class
virtual const std::string getType() const = 0;
/**
+ * @brief Get whether the layer is trainable
+ *
+ * @return bool true if the layer is trainable, false otherwise
+ */
+ virtual bool getTrainable() const = 0;
+
+ /**
* @brief Get the input connections for this node
*
* @return list of name of the nodes which form input connections
#include <cross_entropy_sigmoid_loss_layer.h>
#include <cross_entropy_softmax_loss_layer.h>
#include <flatten_layer.h>
+#include <grucell.h>
#include <identity_layer.h>
#include <input_layer.h>
#include <layer_node.h>
#include <layer_normalization_layer.h>
+#include <lstmcell.h>
#include <multiout_layer.h>
#include <network_graph.h>
#include <nntrainer_error.h>
#include <nntrainer_log.h>
#include <profiler.h>
#include <rnn.h>
+#include <rnncell.h>
#include <split_layer.h>
#include <time_dist.h>
#include <tracer.h>
* usage less than the max_exec_order are allocated.
*/
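+  // std::get<3> is the apply-gradient order, the last entry of the
+  // four-element execution order tuple, so tensors used during gradient
+  // application are also covered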
tensor_manager->allocateTensors(
- std::get<2>(backward_iter_end->getExecutionOrder()));
+ std::get<3>(backward_iter_end->getExecutionOrder()));
}
}
});
}
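+  // Cell-wise recurrent layers appear to reuse their forward outputs while
+  // their gradients are calculated, so keep the output variables alive
+  // through the gradient calculation (FORWARD_GRAD_LIFESPAN) rather than the
+  // shorter default output lifespan.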
+ if (lnode->getType() == RNNCellLayer::type or
+ lnode->getType() == LSTMCellLayer::type or
+ lnode->getType() == GRUCellLayer::type) {
+ std::for_each(
+ out_specs.begin(), out_specs.end(), [this](VarGradSpecV2 &spec) {
+ spec.variable_spec.ls = TensorLifespan::FORWARD_GRAD_LIFESPAN;
+ });
+ }
+
const std::vector<Var_Grad *> &outputs = tensor_manager->requestTensors(
out_specs, Manager::TensorGroupType::OUTPUT, lnode->getExecutionOrder(),
lnode->getName());
*/
void allocateWeights() {
tensor_manager->allocateWeights(
- std::get<2>(backward_iter_end->getExecutionOrder()));
+ std::get<3>(backward_iter_end->getExecutionOrder()));
}
/**
*/
static VarGradSpecV2
outSpec(const TensorDim &dim, const std::string &name = "out",
- TensorLifespan ls = TensorLifespan::FORWARD_GRAD_LIFESPAN,
- TensorLifespan grad_ls = TensorLifespan::BACKWARD_FUNC_LIFESPAN);
+ TensorLifespan ls = TensorLifespan::FORWARD_FUNC_LIFESPAN,
+ TensorLifespan grad_ls = TensorLifespan::CALC_GRAD_DERIV_LIFESPAN);
/**
* @brief request outputs
#include <basic_planner.h>
#include <bn_layer.h>
#include <graph_node.h>
+#include <grucell.h>
#include <layer_node.h>
#include <layer_normalization_layer.h>
+#include <loss/cross_entropy_sigmoid_loss_layer.h>
+#include <loss/cross_entropy_softmax_loss_layer.h>
+#include <loss/mse_loss_layer.h>
#include <manager.h>
#include <multiout_layer.h>
#include <nntrainer_log.h>
#include <optimized_v1_planner.h>
+#include <optimized_v2_planner.h>
+#include <optimized_v3_planner.h>
#include <tensor_pool.h>
#include <tensor_wrap_specs.h>
#include <util_func.h>
static Tensor *requestTensor_(const TensorSpecV2 &spec,
const GraphNode::ExecutionOrder &exec_order,
const std::string &scope, TensorPool &tp,
- bool expose) {
+ bool expose, bool trainable) {
using RT = TensorSpecV2::RequestType;
using LS = TensorLifespan;
NNTR_THROW_IF(spec.request_type == RT::MAYBE_MODIFYING_VIEW,
"requestInputs() requestTensors() instead";
Tensor *var = requestTensor_(spec.variable_spec, exec_order, scope,
- tensor_pool, expose_var);
+ tensor_pool, expose_var, false);
Tensor *grad = spec.gradient_spec
? requestTensor_(*spec.gradient_spec, exec_order, scope,
- tensor_pool, expose_grad)
+ tensor_pool, expose_grad, false)
: nullptr;
/// @note as only supporting identify_as == TensorGroupType::output, only
std::vector<Weight *> Manager::requestWeights(
const GraphNode &node, const std::vector<Weight::Spec> &weights_spec,
bool trainable, const std::vector<std::string> &shared_names) {
- const auto [forwarding_order, calcGradient_order, calcDerivative_order, applyGradient_order] =
- node.getExecutionOrder();
+ const auto [forwarding_order, calcGradient_order, calcDerivative_order,
+ applyGradient_order] = node.getExecutionOrder();
std::vector<unsigned int> default_var_exec_order(
{forwarding_order, calcDerivative_order});
std::vector<Var_Grad *> Manager::requestTensors(
const GraphNode &node, const std::vector<Var_Grad::Spec> &tensors_spec,
bool trainable, const std::vector<std::string> &shared_names) {
- const auto [forwarding_order, calcGradient_order, calcDerivative_order, applyGradient_order] =
- node.getExecutionOrder();
+ const auto [forwarding_order, calcGradient_order, calcDerivative_order,
+ applyGradient_order] = node.getExecutionOrder();
std::vector<Var_Grad *> ret;
size_t current_size = tensors_v2.size();
grad_exec_order.push_back(calcDerivative_order);
}
- if (trainable && enum_class_logical_and(tspan, TensorLifespan::CALC_AGRAD_LIFESPAN)) {
+ if (trainable &&
+ enum_class_logical_and(tspan, TensorLifespan::CALC_AGRAD_LIFESPAN)) {
var_exec_order.push_back(applyGradient_order);
grad_exec_order.push_back(applyGradient_order);
}
if (node.getType() == ActivationLayer::type or
node.getType() == MultiOutLayer::type or
node.getType() == BatchNormalizationLayer::type or
- node.getType() == LayerNormalizationLayer::type)
+ node.getType() == LayerNormalizationLayer::type or !node.getTrainable())
var_common_spec.ls = TensorLifespan::FORWARD_FUNC_LIFESPAN;
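+  // loss layers consume their inputs again while calculating the derivative,
+  // so keep the input variables alive through the derivative calculation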
+ if (node.getType() == MSELossLayer::type or
+ node.getType() == CrossEntropySoftmaxLossLayer::type or
+ node.getType() == CrossEntropySigmoidLossLayer::type)
+ var_common_spec.ls = TensorLifespan::FORWARD_DERIV_LIFESPAN;
+
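+  // GRUCell needs its input gradients across both the gradient and the
+  // derivative calculations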
+ if (node.getType() == GRUCellLayer::type) {
+ grad_common_spec.ls = TensorLifespan::CALC_GRAD_DERIV_LIFESPAN;
+ }
+
std::vector<Var_Grad *> ret;
size_t current_size = inputs_v2.size();
inputs_v2.emplace_back(std::make_unique<Var_Grad>(
requestTensor_(var_spec, node.getExecutionOrder(), node.getName(),
- tensor_pool, false),
+ tensor_pool, false, node.getTrainable()),
requestTensor_(grad_spec, node.getExecutionOrder(), node.getName(),
- tensor_pool, false)));
+ tensor_pool, false, node.getTrainable())));
}
ret.reserve(inputs_dim.size());
if (!condition || condition(w.get()))
conditional_weights.push_back(w.get());
}
-
return conditional_weights;
}
'swap_device.cpp',
'tensor_pool.cpp',
'optimized_v1_planner.cpp',
+ 'optimized_v2_planner.cpp',
+ 'optimized_v3_planner.cpp',
'task_executor.cpp',
]
end(valid.second),
loc(idx),
size(s),
- offset(0),
- size_from_offset(0) {}
+ offset(0) {}
};
/**
memory_offset[req.loc] = offset;
memory_req = std::max(memory_req, req.offset + req.size);
sorted_req.push_back(&req);
-
-#ifdef DEBUG
- if (n_wgrad && memory_is_wgrad[req.loc]) {
- new_grad_cnt++;
- new_grad_size += req.size;
- }
-#endif
-
}
// validateIntervalOverlap(memory_validity, memory_size, memory_offset,
std::vector<WGradMemoryRequest> wgrad_sorted_req;
bool replace_and_fill = false;
+#ifdef DEBUG
unsigned int new_grad_cnt = 0;
unsigned int reused_grad_cnt = 0;
size_t new_grad_size = 0;
size_t reused_grad_size = 0;
+#endif
for (auto &req : wgrad_requests) {
for (unsigned int idx = 0; idx < wgrad_sorted_req.size(); idx++) {
auto const sr = wgrad_sorted_req[idx];
replace_and_fill = true;
wgrad_sorted_req[idx].start_end.push_back(
std::make_pair(req.start, req.end));
+#ifdef DEBUG
reused_grad_size += req.size;
reused_grad_cnt++;
+#endif
break;
} else {
replace_and_fill = false;
wgrad_sorted_req.push_back(WGradMemoryRequest(&req));
wgrad_sorted_req.back().start_end.push_back(
std::make_pair(req.start, req.end));
+#ifdef DEBUG
new_grad_cnt++;
new_grad_size += req.size;
+#endif
}
-
- ml_logd("Total Requested Memory(OPTV2): %lf MiB>>>>>>>> \n - new mem for "
- "gradient = %d, "
- "(%lf MiB) & reused mem for gradient = %d (%lf MiB)\n",
- memory_req / 1024, new_grad_cnt, new_grad_size / 1024,
- reused_grad_cnt, reused_grad_size / 1024);
}
// validateIntervalOverlap(memory_validity, memory_size, memory_offset,
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2023 Jijoong Moon <jijoong.moon@samsung.com>
+ *
+ * @file optimized_v3_planner.cpp
+ * @date 2 January 2023
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Jijoong Moon <jijoong.moon@samsung.com>
+ * @bug No known bugs except for NYI items
+ * @brief This is Optimized V3 Memory Planner
+ *
+ */
+
+#include <algorithm>
+#include <memory>
+#include <nntrainer_error.h>
+#include <stdexcept>
+#include <vector>
+
+#include <optimized_v3_planner.h>
+
+namespace nntrainer {
+
+/**
+ * @brief Memory request data structure holding all the properties of a
+ * single request
+ *
+ */
+struct MemoryRequest {
+ unsigned int start; /**< start of the validity (inclusive) */
+ unsigned int end; /**< end of the validity (exclusive) */
+  unsigned int loc;   /**< index/location of this request */
+ size_t size; /**< size of the request */
+ size_t offset; /**< offset for this request */
+
+ /**
+ * @brief Constructor for the Memory Request
+ *
+ */
+ MemoryRequest(size_t s, const std::pair<unsigned int, unsigned int> &valid,
+ unsigned int idx) :
+ start(valid.first),
+ end(valid.second),
+ loc(idx),
+ size(s),
+ offset(0) {}
+};
+
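+/**
+ * @brief compute the space occupied by the currently active requests
+ *
+ * @param exec_order current execution order (currently unused)
+ * @param sorted_req active requests; re-sorted in ascending order of offset
+ * @param[out] vacant vacant gaps, as [start, end) offset pairs, found between
+ * the active requests
+ * @return the maximum occupied offset (current top of the used memory)
+ */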
+static size_t computeSpace(unsigned int exec_order,
+ std::vector<MemoryRequest *> &sorted_req,
+ std::vector<std::pair<size_t, size_t>> &vacant) {
+ size_t bottom = 0;
+ size_t max_offset = 0;
+
+  std::sort(sorted_req.begin(), sorted_req.end(),
+            [](auto const &v1, auto const &v2) -> bool {
+              return v1->offset < v2->offset;
+            });
+
+ for (unsigned idx = 0; idx < sorted_req.size(); idx++) {
+ auto const &sr = sorted_req[idx];
+ size_t top = sr->offset + sr->size;
+
+ if (max_offset < top)
+ max_offset = top;
+
+ if (sr->offset > bottom) {
+ vacant.push_back(std::make_pair(bottom, sr->offset));
+ }
+ bottom = top;
+ }
+
+ return max_offset;
+}
+
+/**
+ * @brief check whether the validity intervals overlap, in a very naive way.
+ *
+ * @param memory_validity validity
+ * @param memory_size size
+ * @param memory_offset offset
+ * @param memory_req request
+ */
+[[maybe_unused]] static void validateIntervalOverlap(
+ const std::vector<std::pair<unsigned int, unsigned int>> &memory_validity,
+ const std::vector<size_t> &memory_size,
+ const std::vector<size_t> &memory_offset, size_t memory_req) {
+ auto bits = std::make_unique<bool[]>(memory_req);
+
+ for (size_t i = 0; i < memory_req; ++i) {
+ bits[i] = 0;
+ }
+
+ auto exec_start =
+ std::min_element(memory_validity.begin(), memory_validity.end(),
+ [](auto &a, auto &b) { return a.first < b.first; });
+
+ auto exec_end =
+ std::max_element(memory_validity.begin(), memory_validity.end(),
+ [](auto &a, auto &b) { return a.second < b.second; });
+
+ auto set = [&](int offset, size_t size, int idx) {
+    for (unsigned int i = offset; i < offset + size; ++i) {
+ NNTR_THROW_IF(bits[i], std::invalid_argument)
+ << " bits taken at i: " << i << " offset: " << offset
+ << " size: " << size << " idx: " << idx;
+ bits[i] = 1;
+ }
+ };
+
+ auto unset = [&](int offset, size_t size, int idx) {
+    for (unsigned int i = offset; i < offset + size; ++i) {
+ NNTR_THROW_IF(!bits[i], std::invalid_argument)
+ << "double freeing bits at i: " << i << " offset: " << offset
+ << " size: " << size << " idx: " << idx;
+ bits[i] = 0;
+ }
+ };
+
+ for (unsigned int exec = exec_start->first; exec <= exec_end->second;
+ ++exec) {
+
+ for (unsigned int idx = 0; idx < memory_validity.size(); ++idx) {
+ auto &validity = memory_validity.at(idx);
+ auto &sz = memory_size.at(idx);
+ auto &offset = memory_offset.at(idx);
+ if (validity.first == exec) {
+ set(offset, sz, idx);
+ }
+ if (validity.second == exec) {
+ unset(offset, sz, idx);
+ }
+ }
+ }
+ // check if there is any dangling memory
+ set(0, memory_req, memory_validity.size());
+}
+
+/**
+ * @copydoc MemoryPlanner::planLayout(
+ * const std::vector<size_t> &memory_size,
+ * const std::vector<std::pair<unsigned int, unsigned int>> &memory_validity,
+ * std::vector<size_t> &memory_offset,
+ * std::vector<bool> &memory_is_wgrad);
+ *
+ * @details The optimized v3 memory planner assigns memory to the requests
+ * whose validity starts first.
+ * The requested memories are sorted in ascending order of their start
+ * timestamps (ties broken by ascending end timestamps), and the sorted
+ * memories are given increasing offsets based on their sizes.
+ * At the end of each timestamp, expired memories are freed and their offsets
+ * become reusable, so requests with non-overlapping validities can share the
+ * same memory region.
+ *
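+ * A minimal hypothetical sketch of the resulting layout (the sizes and
+ * validity intervals below are made up for illustration):
+ * @code
+ * OptimizedV3Planner planner;
+ * std::vector<size_t> sizes = {100, 50, 80};
+ * // request 0 is valid for orders [0, 3), request 1 for [1, 2),
+ * // request 2 for [2, 4)
+ * std::vector<std::pair<unsigned int, unsigned int>> validity = {
+ *   {0, 3}, {1, 2}, {2, 4}};
+ * std::vector<size_t> offsets;
+ * std::vector<bool> is_wgrad(3, false);
+ * size_t peak = planner.planLayout(sizes, validity, offsets, is_wgrad);
+ * // request 1 overlaps request 0 in time and is stacked above it at offset
+ * // 100; request 2 starts after request 1 expires and reuses offset 100,
+ * // so offsets == {0, 100, 100} and peak == 180
+ * @endcode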
+ */
+size_t OptimizedV3Planner::planLayout(
+ const std::vector<size_t> &memory_size,
+ const std::vector<std::pair<unsigned int, unsigned int>> &memory_validity,
+ std::vector<size_t> &memory_offset, std::vector<bool> &memory_is_wgrad,
+ size_t n_wgrad) const {
+
+ /** create memory requests structure array for easier management */
+ std::vector<MemoryRequest> requests;
+ requests.reserve(memory_size.size());
+ for (unsigned int idx = 0; idx < memory_size.size(); idx++) {
+ requests.emplace_back(memory_size[idx], memory_validity[idx], idx);
+ }
+
+ /**
+ * sort the memory requests with ascending order of start time first, and
+ * then end time
+ */
+ std::sort(requests.begin(), requests.end(),
+            [](auto const &v1, auto const &v2) -> bool {
+ if (v1.start == v2.start)
+ return v1.end < v2.end;
+ return v1.start < v2.start;
+ /** TODO: try this */
+ // if (v1.end == v2.end)
+ // return v1.start < v2.start;
+ // return v1.end > v2.end;
+ });
+
+ /** all the memories in use sorted by their assigned offset and size */
+ std::vector<MemoryRequest *> sorted_req;
+
+ /** iterate over the sorted requests and start allocation of the requests */
+ memory_offset.resize(memory_size.size());
+ size_t memory_req = 0;
+ for (auto &req : requests) {
+ sorted_req.erase(
+ std::remove_if(sorted_req.begin(), sorted_req.end(),
+ [req](auto elem) { return elem->end <= req.start; }),
+ sorted_req.end());
+
+ bool replace_and_fill = false;
+ std::vector<std::pair<size_t, size_t>> vacant;
+
+ size_t max_offset = computeSpace(req.start, sorted_req, vacant);
+
+ for (unsigned int idx = 0; idx < vacant.size(); idx++) {
+ if (vacant[idx].second - vacant[idx].first >= req.size) {
+ req.offset = vacant[idx].first;
+ memory_offset[req.loc] = req.offset;
+ sorted_req.push_back(&req);
+ replace_and_fill = true;
+ break;
+ }
+ }
+ vacant.clear();
+
+ if (replace_and_fill) {
+ continue;
+ }
+
+ req.offset = max_offset;
+ memory_offset[req.loc] = max_offset;
+ memory_req = std::max(memory_req, req.offset + req.size);
+ sorted_req.push_back(&req);
+ }
+
+ return memory_req;
+}
+
+} // namespace nntrainer
--- /dev/null
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2023 Jijoong Moon <jijoong.moon@samsung.com>
+ *
+ * @file optimized_v3_planner.h
+ * @date 2 January 2023
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Jijoong Moon <jijoong.moon@samsung.com>
+ * @bug No known bugs except for NYI items
+ * @brief This is Optimized V3 Memory Planner
+ *
+ *
+ */
+
+#ifndef __OPTIMIZED_V3_PLANNER_H_
+#define __OPTIMIZED_V3_PLANNER_H_
+
+#include <vector>
+
+#include <memory_planner.h>
+
+namespace nntrainer {
+
+/**
+ * @class OptimizedV3Planner
+ * @brief Optimized V3 Memory Planner provides the optimized plan for memory
+ * layout
+ * @details optimized planner performs sharing of overlapping memory up to a
+ * certain extent
+ */
+class OptimizedV3Planner : public MemoryPlanner {
+public:
+ /**
+ * @brief OptimizedV3Planner destructor
+ *
+ */
+ OptimizedV3Planner() = default;
+
+ /**
+ * @copydoc MemoryPlanner::planLayout(
+ * const std::vector<size_t> &memory_size,
+ * const std::vector<std::pair<unsigned int, unsigned int>> &memory_validity,
+ * std::vector<size_t> &memory_offset,
+ * std::vector<bool> &memory_is_wgrad);
+ *
+ */
+ size_t planLayout(
+ const std::vector<size_t> &memory_size,
+ const std::vector<std::pair<unsigned int, unsigned int>> &memory_validity,
+ std::vector<size_t> &memory_offset, std::vector<bool> &memory_is_wgrad,
+ size_t n_wgrad = 0) const;
+
+ /**
+ * @copydoc MemoryPlanner::getType() const
+ *
+ */
+ const std::string &getType() const { return type; }
+
+ inline static const std::string type = "optimized_v3_planner";
+};
+
+} // namespace nntrainer
+
+#endif /** __OPTIMIZED_V3_PLANNER_H_ */