1 // SPDX-License-Identifier: Apache-2.0
3 * Copyright (C) 2022 Jijoong Moon <jijoong.moon@samsung.com>
5 * @file optimized_v2_planner.cpp
6 * @date 29 December 2022
7 * @see https://github.com/nnstreamer/nntrainer
8 * @author Parichay Kapoor <pk.kapoor@samsung.com>
9 * @author Jijoong Moon <jijoong.moon@samsung.com>
10 * @bug No known bugs except for NYI items
11 * @brief This is Optimized V2 Memory Planner
17 #include <nntrainer_error.h>
18 #include <nntrainer_log.h>
22 #include <optimized_v2_planner.h>
27 * @brief Memory Request data structure clubbing all the requests
// A single tensor's memory requirement: its lifetime as an execution-order
// interval [start, end) plus its byte size and the offset the planner
// eventually assigns inside the single planned arena.
30 struct MemoryRequest {
31 unsigned int start; /**< start of the validity (inclusive) */
32 unsigned int end; /**< end of the validity (exclusive) */
33 unsigned int loc; /**< index/location of this request */
34 size_t size; /**< size of the request */
35 size_t offset; /**< offset for this request */
38 * @brief Constructor for the Memory Request
// NOTE(review): constructor body lies in lines elided from this view;
// presumably it copies s/valid/idx into size, start/end and loc — confirm
// against the full file.
41 MemoryRequest(size_t s, const std::pair<unsigned int, unsigned int> &valid,
51 * @brief Memory Request data structure clubbing for the weight gradient
// Wraps a weight-gradient MemoryRequest together with the [start, end)
// intervals of every gradient currently sharing its memory slot, so overlap
// checks can be made before reusing the slot (see planLayout's wgrad pass).
54 struct WGradMemoryRequest {
55 MemoryRequest *mem_req; /**< non-owning; points into the caller's wgrad_requests vector */
56 std::vector<std::pair<unsigned int, unsigned int>> start_end; /**< lifetimes of all requests mapped onto this slot */
// NOTE(review): single-argument constructor is not `explicit`; allows an
// implicit MemoryRequest* -> WGradMemoryRequest conversion — consider marking
// it explicit.
58 WGradMemoryRequest(MemoryRequest *req) : mem_req(req) {}
62 * @brief check if validate interval is overlapping in a very naive way.
// Debug-only sanity check: replays the planned layout over every execution
// step, marking each request's bytes busy while it is live and throwing
// std::invalid_argument on any double-occupancy or double-free. Only the
// commented-out call at the end of this file uses it, hence [[maybe_unused]].
64 * @param memory_validity validity
65 * @param memory_size size
66 * @param memory_offset offset
67 * @param memory_req request
69 [[maybe_unused]] static void validateIntervalOverlap(
70 const std::vector<std::pair<unsigned int, unsigned int>> &memory_validity,
71 const std::vector<size_t> &memory_size,
72 const std::vector<size_t> &memory_offset, size_t memory_req) {
// one flag per byte of the total planned pool
73 auto bits = std::make_unique<bool[]>(memory_req);
75 for (size_t i = 0; i < memory_req; ++i) {
// earliest start and latest end across all requests bound the replayed range
80 std::min_element(memory_validity.begin(), memory_validity.end(),
81 [](auto &a, auto &b) { return a.first < b.first; });
84 std::max_element(memory_validity.begin(), memory_validity.end(),
85 [](auto &a, auto &b) { return a.second < b.second; });
// mark a request's bytes busy, throwing if any byte is already taken.
// NOTE(review): the loop bound `i < size` (with i starting at offset) only
// covers the request's own bytes when offset == 0 — it looks like it should
// be `i < offset + size`. Cannot confirm intent here since the only call
// site is commented out; verify before enabling this check.
87 auto set = [&](int offset, size_t size, int idx) {
88 for (unsigned int i = offset; i < size; ++i) {
89 NNTR_THROW_IF(bits[i], std::invalid_argument)
90 << " bits taken at i: " << i << " offset: " << offset
91 << " size: " << size << " idx: " << idx;
// release a request's bytes, throwing on double free.
// NOTE(review): same suspicious `i < size` bound as in `set` above.
96 auto unset = [&](int offset, size_t size, int idx) {
97 for (unsigned int i = offset; i < size; ++i) {
98 NNTR_THROW_IF(!bits[i], std::invalid_argument)
99 << "double freeing bits at i: " << i << " offset: " << offset
100 << " size: " << size << " idx: " << idx;
// replay every execution step: occupy at validity.first, free at .second
105 for (unsigned int exec = exec_start->first; exec <= exec_end->second;
108 for (unsigned int idx = 0; idx < memory_validity.size(); ++idx) {
109 auto &validity = memory_validity.at(idx);
110 auto &sz = memory_size.at(idx);
111 auto &offset = memory_offset.at(idx);
112 if (validity.first == exec) {
113 set(offset, sz, idx);
115 if (validity.second == exec) {
116 unset(offset, sz, idx);
// check if there is any dangling memory
120 // check if there is any dangling memory
// after the replay every byte must be free, so claiming the whole pool
// must succeed; a throw here means some request was never released.
121 set(0, memory_req, memory_validity.size());
125 * @copydoc MemoryPlanner::planLayout(
126 * const std::vector<size_t> &memory_size,
127 * const std::vector<std::pair<unsigned int, unsigned int>> &memory_validity,
128 * std::vector<size_t> &memory_offset,
129 * std::vector<bool> &memory_is_wgrad);
131 * @details The optimized v2 memory planner assigns memory to the requests whose
132 * validity starts first.
133 * The requested memories are sorted based on the ascending order of the start
134 * timestamps, and descending order using the end timestamps. The
135 * sorted memories are given increasing offset based on the memory size.
136 * At the end of each timestamp, invalid memories are freed, and offset updated
137 * for reuse. This planner allocates overlapping memory for all the required
// Plans a single arena layout for all requested memories. Regular tensors are
// packed greedily by start time (reusing expired same-size slots); weight
// gradients are then packed into a separate region after the regular tensors,
// sharing slots whenever their lifetimes do not overlap. Returns the total
// pool size required; per-request offsets are written to memory_offset.
141 size_t OptimizedV2Planner::planLayout(
142 const std::vector<size_t> &memory_size,
143 const std::vector<std::pair<unsigned int, unsigned int>> &memory_validity,
144 std::vector<size_t> &memory_offset, std::vector<bool> &memory_is_wgrad,
145 size_t n_wgrad) const {
147 std::vector<MemoryRequest> wgrad_requests;
148 wgrad_requests.reserve(n_wgrad);
150 /** create memory requests structure array for easier management */
151 std::vector<MemoryRequest> requests;
152 requests.reserve(memory_size.size() - n_wgrad);
// split requests into regular and weight-gradient pools.
// NOTE(review): a second, unsplit loop follows below — presumably the two
// loops sit in an if (n_wgrad) / else branch in the lines elided from this
// view; confirm against the full file.
154 for (unsigned int idx = 0; idx < memory_size.size(); idx++) {
155 if (!memory_is_wgrad[idx]) {
156 requests.emplace_back(memory_size[idx], memory_validity[idx], idx);
158 wgrad_requests.emplace_back(memory_size[idx], memory_validity[idx],
163 for (unsigned int idx = 0; idx < memory_size.size(); idx++) {
164 requests.emplace_back(memory_size[idx], memory_validity[idx], idx);
169 * sort the memory requests with ascending order of start time first, and
// ties broken by ascending end time.
// NOTE(review): the comparator's declared return type is `int`; std::sort
// expects a bool-convertible strict-weak-order predicate — it works, but the
// return type should be `bool`.
172 std::sort(requests.begin(), requests.end(),
173 [](auto const &v1, auto const &v2) -> int {
174 if (v1.start == v2.start)
175 return v1.end < v2.end;
176 return v1.start < v2.start;
177 /** TODO: try this */
178 // if (v1.end == v2.end)
179 // return v1.start < v2.start;
180 // return v1.end > v2.end;
183 /** all the memories in use sorted by their assigned offset and size */
184 std::vector<MemoryRequest *> sorted_req;
186 /** iterate over the sorted requests and start allocation of the requests */
187 memory_offset.resize(memory_size.size());
188 size_t memory_req = 0; /**< running high-water mark = final pool size */
189 for (auto &req : requests) {
190 /** remove expired memories and update offset */
191 while (!sorted_req.empty() && sorted_req.back()->end <= req.start)
192 sorted_req.pop_back();
194 /** if there exists an expired memory with same size (not at the edge),
// reuse its exact offset instead of growing the pool */
196 bool replace_and_fill = false;
197 for (int idx = sorted_req.size() - 1; idx >= 0; idx--) {
198 auto const &sr = sorted_req[idx];
199 /** TODO: reuse if memory size not exactly match */
200 if (sr->end <= req.start && sr->size == req.size) {
201 req.offset = sr->offset;
202 memory_offset[req.loc] = req.offset;
203 sorted_req[idx] = &req;
204 replace_and_fill = true;
208 if (replace_and_fill) {
// no reusable slot: place the request right after the last live one
213 if (!sorted_req.empty())
214 offset = sorted_req.back()->offset + sorted_req.back()->size;
216 /** assign offset to the new request and push to queue */
218 memory_offset[req.loc] = offset;
219 memory_req = std::max(memory_req, req.offset + req.size);
220 sorted_req.push_back(&req);
// second pass: pack weight-gradient memories after the regular pool
223 if (wgrad_requests.size()) {
224 /** TODO: We do not need to start from memory_req. We might find proper
225 * offset considering execution order */
226 size_t last_offset = memory_req;
228 /* sort the memory request with ascending order of size */
// NOTE(review): comparator returns `int` and actually orders by DESCENDING
// size (v1.size > v2.size), contrary to the comment above — confirm which
// is intended.
230 wgrad_requests.begin(), wgrad_requests.end(),
231 [](auto const &v1, auto const &v2) -> int { return v1.size > v2.size; });
233 std::vector<WGradMemoryRequest> wgrad_sorted_req;
235 bool replace_and_fill = false;
236 unsigned int new_grad_cnt = 0;
237 unsigned int reused_grad_cnt = 0;
238 size_t new_grad_size = 0;
239 size_t reused_grad_size = 0;
240 for (auto &req : wgrad_requests) {
// try to reuse an already-placed slot that is big enough and whose
// occupants' lifetimes do not overlap this request's [start, end)
241 for (unsigned int idx = 0; idx < wgrad_sorted_req.size(); idx++) {
// NOTE(review): `auto const sr = ...` copies the WGradMemoryRequest
// (including its interval vector) each iteration; `auto const &sr`
// would avoid the copy.
242 auto const sr = wgrad_sorted_req[idx];
244 if (sr.mem_req->size >= req.size) {
// overlap test between req and each interval already in the slot
245 for (auto &interval : sr.start_end) {
246 if ((interval.first < req.start && interval.first < req.end &&
247 req.end < interval.second) ||
248 (req.start > interval.first && req.start < interval.second &&
249 req.end > interval.second) ||
250 (req.start == interval.first && req.end == interval.second)) {
// no conflict: share this slot's offset and record our lifetime
258 req.offset = sr.mem_req->offset;
259 memory_offset[req.loc] = req.offset;
260 replace_and_fill = true;
261 wgrad_sorted_req[idx].start_end.push_back(
262 std::make_pair(req.start, req.end));
263 reused_grad_size += req.size;
267 replace_and_fill = false;
270 if (replace_and_fill) {
// no reusable slot: append after the last placed gradient slot
274 size_t offset = last_offset;
275 if (!wgrad_sorted_req.empty())
276 offset = wgrad_sorted_req.back().mem_req->offset +
277 wgrad_sorted_req.back().mem_req->size;
280 memory_offset[req.loc] = offset;
281 memory_req = std::max(memory_req, req.offset + req.size);
282 wgrad_sorted_req.push_back(WGradMemoryRequest(&req));
283 wgrad_sorted_req.back().start_end.push_back(
284 std::make_pair(req.start, req.end));
286 new_grad_size += req.size;
// NOTE(review): %lf with `memory_req / 1024` feeds an integer (size_t)
// where a double is expected — undefined behavior in printf-style
// formatting, and the integer division also drops the fraction; should
// presumably be `memory_req / 1024.0` (likewise for the grad sizes) —
// confirm against the full file / ml_logd signature.
289 ml_logd("Total Requested Memory(OPTV2): %lf MiB>>>>>>>> \n - new mem for "
291 "(%lf MiB) & reused mem for gradient = %d (%lf MiB)\n",
292 memory_req / 1024, new_grad_cnt, new_grad_size / 1024,
293 reused_grad_cnt, reused_grad_size / 1024);
296 // validateIntervalOverlap(memory_validity, memory_size, memory_offset,
302 } // namespace nntrainer