1 /*******************************************************************************
2 * Copyright 2017-2018 Intel Corporation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
19 #include "mkldnn_thread.hpp"
20 #include "mkldnn_types.h"
24 #include "cpu_reducer.hpp"
30 using namespace memory_tracking::names;
/* Chooses how the threads are grouped for a reduction.
 * Starting from a default split, it brute-forces the number of jobs per
 * group, estimates the per-thread work of every candidate and keeps the
 * cheapest one. The winning values are stored in nthr_per_group_ and
 * njobs_per_group_ub_.
 * NOTE(review): several original lines are not visible in this excerpt
 * (closing braces, the statement guarded by the `if` inside the loop and
 * possibly further assignments) -- confirm against the full file. */
32 void reduce_balancer_t::balance() {
34 using namespace utils;
36 assert(nthr_ > 0 && job_size_ > 0 && njobs_ > 0 && reduction_size_ > 0);
/* weight applied to the per-thread reduction length in the estimate below */
38 const int job_complexity = 1;
/* lower bound of the search: an even split of jobs over threads, at least 1 */
40 const int min_njobs_per_group = max(1, njobs_ / nthr_);
/* max_buffer_size_ divided by (nthr_ * job_size_), clamped to at least 1 */
41 const int max_njobs_per_group = max(1,
42 static_cast<int>(max_buffer_size_ / (nthr_ * job_size_)));
/* initial guess, derived from min_njobs_per_group; without syncable_ a group
 * is always limited to a single thread */
45 int ngroups = min(njobs_ / min_njobs_per_group, nthr_);
46 int nthr_per_group = syncable_ ? min(nthr_ / ngroups, reduction_size_) : 1;
47 int njobs_per_group_ub = div_up(njobs_, ngroups);
49 /* rough upper-bound estimation, will be fixed during brute force */
50 size_t thread_complexity_ub = njobs_ * job_size_ * reduction_size_;
52 /* brute force parameters for the best balance... */
53 for (int c_njobs_per_group = min_njobs_per_group;
54 c_njobs_per_group < njobs_; ++c_njobs_per_group) {
55 /* current assumption */
56 int c_ngroups = min(njobs_ / c_njobs_per_group, nthr_);
57 int c_nthr_per_group = syncable_
58 ? min(nthr_ / c_ngroups, reduction_size_) : 1;
59 int c_njobs_per_group_ub = div_up(njobs_, c_ngroups);
/* candidate uses more than one thread per group and exceeds the buffer
 * limit computed above.
 * NOTE(review): the statement guarded by this `if` is not visible here;
 * presumably it skips the candidate -- confirm. */
61 if (c_nthr_per_group > 1 && c_njobs_per_group_ub > max_njobs_per_group)
/* estimate: group size times (weighted reduction length per thread, plus
 * one extra unit when the group has more than one thread) */
64 int c_thread_reduction_ub = div_up(reduction_size_, c_nthr_per_group);
65 size_t c_group_size_ub = job_size_ * c_njobs_per_group_ub;
66 size_t c_thread_complexity_ub = c_group_size_ub * (
67 job_complexity * c_thread_reduction_ub
68 + (c_nthr_per_group != 1));
/* keep the candidate only if it is strictly cheaper than the best so far */
70 if (c_thread_complexity_ub < thread_complexity_ub) {
72 nthr_per_group = c_nthr_per_group;
73 njobs_per_group_ub = c_njobs_per_group_ub;
74 thread_complexity_ub = c_thread_complexity_ub;
/* sanity checks on the selected configuration */
78 assert(njobs_per_group_ub <= max_njobs_per_group || nthr_per_group == 1);
79 assert(ngroups * nthr_per_group <= nthr_);
80 assert((size_t)njobs_per_group_ub * job_size_ * nthr_ <= max_buffer_size_
81 || nthr_per_group == 1); /* no reduction buffer overflow */
82 assert(IMPLICATION(!syncable_, nthr_per_group == 1));
/* publish the result */
85 nthr_per_group_ = nthr_per_group;
86 njobs_per_group_ub_ = njobs_per_group_ub;
89 /* reducer jit-ted driver */
91 using namespace Xbyak;
/* Common base of the jit-ted 2D reduction kernels.
 * It stores the kernel configuration and a function pointer (ker_) that a
 * derived class is expected to fill in; operator() forwards to that pointer.
 * NOTE(review): access specifiers, the declarations of n_src_ and
 * nullify_dst_, and the closing brace are not visible in this excerpt. */
93 template <impl::data_type_t data_type>
94 struct reducer_2d_driver_t: public c_compatible {
95 typedef typename prec_traits<data_type>::type data_t;
/* Only records the parameters; ker_ starts out as nullptr. */
97 reducer_2d_driver_t(int n_src, size_t src_ld,
98 size_t src_step, size_t dst_step, bool nullify_dst)
99 : n_src_(n_src), src_ld_(src_ld), src_step_(src_step)
100 , dst_step_(dst_step), nullify_dst_(nullify_dst), ker_(nullptr) {}
101 virtual ~reducer_2d_driver_t() {}
/* Invokes the generated kernel; asserts that ker_ has been set. */
102 void operator()(data_t *dst, const data_t *srcs, size_t ny, size_t nx)
103 { assert(ker_); ker_(dst, srcs, ny, nx); }
/* The derived generator multiplies these by the element size when it emits
 * address arithmetic, i.e. they are expressed in elements. */
107 size_t src_ld_, src_step_, dst_step_;
/* entry point of the generated code */
109 void (*ker_)(data_t *dst, const data_t *srcs, size_t ny, size_t nx);
/* JIT generator of the reduction kernel for 32-bit data: floating-point adds
 * are emitted for data_type::f32 and integer adds otherwise. The vector
 * register width is selected by the `isa` template argument.
 * NOTE(review): this excerpt is missing many original lines (the second base
 * class, labels, `else` branches, loop setup, closing braces). The comments
 * below describe only what is visible. */
112 template <impl::data_type_t data_type, cpu_isa_t isa>
113 struct reducer_2d_driver_f_s_32_t: public reducer_2d_driver_t<data_type>,
116 DECLARE_CPU_JIT_AUX_FUNCTIONS(reducer_2d_driver_f_s_32_t)
118 /* cpu specific part */
/* Ymm registers / yword addressing for avx2, Zmm / zword for any other isa */
119 using Vmm = typename utils::conditional<isa == avx2, Ymm, Zmm>::type;
120 const AddressFrame &vmmword = (isa == avx2) ? yword : zword;
/* packed add: vaddps for f32, vpaddd otherwise */
121 void uni_vadd(const Xmm& x1, const Xmm& x2, const Operand& op)
122 { if (data_type == data_type::f32) vaddps(x1, x2, op);
123 else vpaddd(x1, x2, op); }
/* add used on the single-element path: addss for f32, paddd otherwise */
124 void uni_add(const Xmm& x1, const Operand& op)
125 { if (data_type == data_type::f32) addss(x1, op); else paddd(x1, op); }
/* vector length; used below as a byte offset/step in address arithmetic */
127 const int vlen = cpu_isa_traits<isa>::vlen;
/* NOTE(review): continuation of a declaration whose first line is not
 * visible; presumably it defines `typesize`, which is used below. */
129 = sizeof(typename mkldnn::impl::prec_traits<data_type>::type);
/* kernel arguments, in the order of ker_(dst, srcs, ny, nx) */
130 Xbyak::Reg64 reg_dst = abi_param1;
131 Xbyak::Reg64 reg_src = abi_param2;
132 Xbyak::Reg64 reg_ny = abi_param3;
133 Xbyak::Reg64 reg_nx = abi_param4;
/* scratch registers: remaining amount along x, and the source counter */
135 Xbyak::Reg64 reg_x = rax;
136 Xbyak::Reg64 reg_src_id = r10;
/* Forwards the configuration to the base class.
 * NOTE(review): the constructor body is not visible in this excerpt. */
138 reducer_2d_driver_f_s_32_t(int n_src, size_t src_ld, size_t src_step,
139 size_t dst_step, bool nullify_dst)
140 : reducer_2d_driver_t<data_type>(n_src, src_ld, src_step,
141 dst_step, nullify_dst)
/* Emits code that clears the first `nloads` vector registers by xor-ing each
 * one with itself. */
144 void nullify_dst(int nloads, int load_len) {
146 for (int i = 0; i < nloads; ++i)
147 uni_vpxor(Vmm(i), Vmm(i), Vmm(i));
148 /* prefetches[dst] ? */
/* Emits loads of `nloads` chunks from reg_dst into registers 0..nloads-1:
 * movd when load_len equals typesize, vmovups when it equals vlen; any other
 * length hits the "unsupported" assert. */
151 void load_dst(int nloads, int load_len) {
152 for (int i = 0; i < nloads; ++i) {
153 if (load_len == typesize)
154 movd(Xmm(i), ptr[reg_dst + i * load_len]);
155 else if (load_len == vlen)
156 vmovups(Vmm(i), ptr[reg_dst + i * load_len]);
158 assert(!"unsupported");
/* Mirror of load_dst: emits stores of registers 0..nloads-1 to reg_dst. */
162 void store_dst(int nloads, int load_len) {
163 for (int i = 0; i < nloads; ++i) {
164 if (load_len == typesize)
165 movd(ptr[reg_dst + i * load_len], Xmm(i));
166 else if (load_len == vlen)
167 vmovups(ptr[reg_dst + i * load_len], Vmm(i));
169 assert(!"unsupported");
/* Emits adds of `nloads` chunks located at reg_src + base_off into registers
 * 0..nloads-1, using the single-element or the packed add depending on
 * load_len. */
173 void accumulate(int nloads, int load_len, size_t base_off) {
174 for (int i = 0; i < nloads; ++i) {
175 size_t off = base_off + i * load_len;
177 if (load_len == typesize)
178 uni_add(Xmm(i), ptr[reg_src + off]);
179 else if (load_len == vlen)
180 uni_vadd(Vmm(i), Vmm(i), vmmword[reg_src + off]);
182 assert(!"unsupported");
/* Three chunk shapes, tried from the largest to the smallest: n_vregs full
 * vectors, one full vector, one single element.
 * NOTE(review): the head of the enclosing member function is not visible. */
187 const int nloads[] = {cpu_isa_traits<isa>::n_vregs, 1, 1};
188 const int nbranches = sizeof(nloads) / sizeof(nloads[0]);
190 const int load_len[nbranches] = {vlen, vlen, typesize};
191 Label loop_x_label[nbranches + 1];
195 for (int id = 0; id < nbranches; ++id) {
/* when less than one chunk of this shape remains, go to the next shape */
198 cmp(reg_x, nloads[id] * load_len[id]);
199 jl(loop_x_label[id + 1], T_NEAR);
/* start from zeros or from the current destination contents.
 * NOTE(review): the `else` between these two calls is not visible. */
201 if (this->nullify_dst_)
202 nullify_dst(nloads[id], load_len[id]);
204 load_dst(nloads[id], load_len[id]);
/* multi-register chunks: a loop over the sources is emitted, advancing
 * reg_src by src_ld_ elements per source and rewinding it afterwards.
 * NOTE(review): the loop label and the counter decrement are not visible. */
206 if (nloads[id] > 1) {
208 mov(reg_src_id, this->n_src_);
211 accumulate(nloads[id], load_len[id], 0);
212 add(reg_src, this->src_ld_ * typesize);
215 jnz(loop_srcs, T_NEAR);
217 sub(reg_src, this->n_src_ * this->src_ld_ * typesize);
/* here the sources are unrolled at code-generation time using constant
 * offsets (presumably the `else` branch of the test above -- confirm) */
219 for (int src_id = 0; src_id < this->n_src_; ++src_id) {
220 const size_t base_off = src_id * this->src_ld_ * typesize;
221 accumulate(nloads[id], load_len[id], base_off);
225 store_dst(nloads[id], load_len[id]);
/* advance both pointers past the processed chunk and retry the same shape */
227 add(reg_src, nloads[id] * load_len[id]);
228 add(reg_dst, nloads[id] * load_len[id]);
230 sub(reg_x, nloads[id] * load_len[id]);
232 jmp(loop_x_label[id], T_NEAR);
235 L(loop_x_label[nbranches]);
237 /* restore address registers */
238 sub(reg_src, reg_nx);
239 sub(reg_dst, reg_nx);
/* only these instruction sets are expected by the generator */
243 assert(isa == avx2 || isa == avx512_common || isa == avx512_mic);
/* step to the next row of the destination and of the sources.
 * NOTE(review): the ny_loop label and the counter update are not visible. */
254 add(reg_dst, this->dst_step_ * typesize);
255 add(reg_src, this->src_step_ * typesize);
258 jnz(ny_loop, T_NEAR);
/* expose the generated code through the base-class function pointer */
261 this->ker_ = reinterpret_cast<decltype(this->ker_)>(
262 const_cast<uint8_t*>(this->getCode()));
/* Factory for the reduction driver: returns a heap-allocated avx512_common
 * driver when mayiuse(avx512_common) holds, otherwise an avx2 driver when
 * mayiuse(avx2) holds; when neither is available the "unimplemented" assert
 * fires. The object is created with `new`; the classes in this file release
 * it with `delete drv_`.
 * NOTE(review): the statements after the assert are not visible here. */
266 template <impl::data_type_t data_type>
267 inline reducer_2d_driver_t<data_type> *create_reduce_2d_drv(int n_src,
268 size_t src_ld, size_t src_step, size_t dst_step, bool nullify_dst) {
269 if (mayiuse(avx512_common))
270 return new reducer_2d_driver_f_s_32_t<data_type, avx512_common>(n_src,
271 src_ld, src_step, dst_step, nullify_dst);
272 else if (mayiuse(avx2))
273 return new reducer_2d_driver_f_s_32_t<data_type, avx2>(n_src, src_ld,
274 src_step, dst_step, nullify_dst);
275 assert(!"unimplemented");
/* Books the scratchpad entries used by cpu_reducer_t. Nothing is booked when
 * every group consists of a single thread. */
281 template <impl::data_type_t data_type>
282 void cpu_reducer_t<data_type>::conf_t::init_scratchpad(
283 memory_tracking::registrar_t &scratchpad) const {
284 if (balancer_.nthr_per_group_ == 1) return;
/* one private buffer per thread except one thread in each group */
286 const size_t space_size = balancer_.ngroups_
287 * (balancer_.nthr_per_group_ - 1)
288 * cpu_reducer_t<data_type>::space_per_thread(balancer_);
/* NOTE(review): PAGE_4K is presumably the requested alignment -- confirm
 * against the registrar_t::book() declaration. */
289 scratchpad.book(key_reducer_space, sizeof(data_t) * space_size, PAGE_4K);
/* one barrier context per group */
290 scratchpad.book(key_reducer_space_bctx,
291 sizeof(simple_barrier::ctx_t) * balancer_.ngroups_);
/* Stores the configuration and, unless groups have a single thread, creates
 * the jit driver. The driver is set up for (nthr_per_group_ - 1) sources
 * placed space_per_thread() apart, with zero row steps and without
 * nullifying the destination. */
294 template <impl::data_type_t data_type>
295 cpu_reducer_t<data_type>::cpu_reducer_t(const conf_t &conf)
296 : conf_(conf), drv_(nullptr)
/* single-thread groups need no driver: drv_ stays nullptr */
298 if (balancer().nthr_per_group_ == 1) return;
300 drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_ - 1,
301 space_per_thread(balancer()), 0, 0, false);
/* Releases the jit driver; drv_ is nullptr when no driver was created. */
304 template <impl::data_type_t data_type>
305 cpu_reducer_t<data_type>::~cpu_reducer_t() { delete drv_; }
/* Returns the address thread `ithr` should accumulate into: either a location
 * inside `dst` or a slot of the scratchpad buffer booked under
 * key_reducer_space.
 * NOTE(review): the condition selecting the first return and the rest of the
 * offset_factor expression are not visible in this excerpt. */
307 template <impl::data_type_t data_type>
308 typename cpu_reducer_t<data_type>::data_t *
309 cpu_reducer_t<data_type>::get_local_ptr(int ithr, data_t *dst,
310 const memory_tracking::grantor_t &scratchpad) const {
311 const int id_in_grp = balancer().id_in_group(ithr);
313 /* thread 0 of each group writes directly to the destination */
315 return dst + balancer().ithr_job_off(ithr) * balancer().job_size_;
/* other threads get a slot of the shared scratchpad buffer */
317 const int grp_id = balancer().group_id(ithr);
318 const int offset_factor = grp_id * (balancer().nthr_per_group_ - 1)
321 auto space = scratchpad.template get<data_t>(key_reducer_space);
322 return space + offset_factor * space_per_thread(balancer());
/* Adds the per-thread partial results of a group into the location returned
 * for the group's thread 0. Does nothing when groups have a single thread or
 * when balancer().idle(ithr) is true.
 * NOTE(review): two variants of the reduction appear back to back below, and
 * the lines that select between them (presumably preprocessor conditionals)
 * as well as some braces and statements are not visible in this excerpt. */
325 template <impl::data_type_t data_type>
326 void cpu_reducer_t<data_type>::reduce_nolock(int ithr, data_t *dst,
327 const memory_tracking::grantor_t &scratchpad) const {
328 bool redundant_reduction = balancer().nthr_per_group_ == 1
329 || balancer().idle(ithr);
330 if (redundant_reduction) return;
/* variant 1: plain loops executed by one thread per group */
333 if (balancer().id_in_group(ithr) != 0)
334 return; /* only threads 0 do the reduction */
336 const int njobs_in_grp = balancer().ithr_njobs(ithr);
337 data_t *d = get_local_ptr(ithr, dst, scratchpad);
/* NOTE(review): this line reads `balancer_` directly while the rest of the
 * function goes through balancer() -- verify that it compiles when this
 * variant is enabled. */
338 for (int id_in_grp = 1; id_in_grp < balancer_.nthr_per_group_; ++id_in_grp)
340 const data_t *space = get_local_ptr(ithr + id_in_grp, dst, scratchpad);
341 for (size_t i = 0; i < (size_t)njobs_in_grp * balancer().job_size_; ++i)
/* variant 2: every thread of the group reduces its own slice via the driver */
345 using namespace utils;
347 const int id_in_grp = balancer().id_in_group(ithr);
348 const int njobs_in_grp = balancer().ithr_njobs(ithr);
/* number of data_t elements in 64 bytes; the work is split in these units */
349 const size_t cl = 64 / sizeof(data_t);
351 const size_t reduction_size = njobs_in_grp * balancer().job_size_;
352 size_t start{0}, end{0};
353 balance211(div_up(reduction_size, cl), balancer().nthr_per_group_,
354 id_in_grp, start, end);
/* no slice was assigned to this thread */
356 if (start == end) return;
/* ithr - id_in_grp is thread 0 of this group; the sources start at the slot
 * of the thread that follows it */
358 data_t *d = get_local_ptr(ithr - id_in_grp, dst, scratchpad) + start * cl;
359 const data_t *space = get_local_ptr(ithr - id_in_grp + 1, dst, scratchpad)
/* the last slice is clipped to the real reduction size */
361 const size_t len = nstl::min(end * cl, reduction_size) - start * cl;
363 (*drv_)(d, space, 1, len);
/* explicit instantiations for the two supported data types */
367 template struct cpu_reducer_t<data_type::f32>;
368 template struct cpu_reducer_t<data_type::s32>;
370 /* cpu_reducer_2d_t */
/* Books the scratchpad entries used by cpu_reducer_2d_t. Nothing is booked
 * when every group consists of a single thread. */
372 template <impl::data_type_t data_type>
373 void cpu_reducer_2d_t<data_type>::conf_t::init_scratchpad(
374 memory_tracking::registrar_t &scratchpad) const {
375 if (balancer_.nthr_per_group_ == 1) return;
/* one private buffer for every thread of every group */
377 const size_t space_size = balancer_.ngroups_ * balancer_.nthr_per_group_
378 * cpu_reducer_2d_t<data_type>::space_per_thread(balancer_);
379 scratchpad.book(key_reducer_space, sizeof(data_t) * space_size);
/* one barrier context per group */
380 scratchpad.book(key_reducer_space_bctx,
381 sizeof(simple_barrier::ctx_t) * balancer_.ngroups_);
/* Stores the configuration and, unless groups have a single thread, creates
 * the jit driver for nthr_per_group_ sources placed space_per_thread() apart,
 * with conf_.job_size_x_ as the source row step and conf_.dst_x_ as the
 * destination row step.
 * NOTE(review): the last argument of the call and the braces of the body are
 * not visible in this excerpt. */
384 template <impl::data_type_t data_type>
385 cpu_reducer_2d_t<data_type>::cpu_reducer_2d_t(const conf_t &conf)
386 : conf_(conf), drv_(nullptr)
/* single-thread groups need no driver: drv_ stays nullptr */
388 if (balancer().nthr_per_group_ == 1) return;
390 drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_,
391 space_per_thread(balancer()), conf_.job_size_x_, conf_.dst_x_,
/* Releases the jit driver; drv_ is nullptr when no driver was created. */
395 template <impl::data_type_t data_type>
396 cpu_reducer_2d_t<data_type>::~cpu_reducer_2d_t() { delete drv_; }
/* Returns the private scratchpad slot of thread `ithr`. Slots are laid out
 * group after group, nthr_per_group_ slots per group, each slot being
 * space_per_thread() elements long. */
398 template <impl::data_type_t data_type>
399 typename cpu_reducer_2d_t<data_type>::data_t *cpu_reducer_2d_t<data_type>::
400 get_local_ptr(int ithr, const memory_tracking::grantor_t &scratchpad) const {
401 const int id_in_grp = balancer().id_in_group(ithr);
402 const int grp_id = balancer().group_id(ithr);
/* linear slot index of this thread */
403 const int offset_factor = grp_id * balancer().nthr_per_group_ + id_in_grp;
404 auto space = scratchpad.template get<data_t>(key_reducer_space);
405 return space + offset_factor * space_per_thread(balancer());
/* Chooses the blocking along x used when one job is split between the
 * threads of a group. Expects nx to be a multiple of, and larger than,
 * conf_.x_block_ (asserted).
 * NOTE(review): the declaration head of min_x_blocking, the bodies of the
 * two divisibility branches and the return statement are not visible in
 * this excerpt. */
408 template <impl::data_type_t data_type>
409 int cpu_reducer_2d_t<data_type>::choose_x_blocking(int nx, int ny,
410 int nthr_per_grp) const {
411 // find x_blocking for better balance reducing work between threads
412 assert(conf_.x_block_ > 0 && nx > conf_.x_block_
413 && nx % conf_.x_block_ == 0);
/* start from the number of x_block_-sized pieces in a row */
414 int x_blocking = nx / conf_.x_block_;
/* continuation of a declaration (presumably of min_x_blocking, used below) */
416 utils::div_up(x_blocking, nstl::max(1, nthr_per_grp / ny));
/* tests for divisibility by 2 and 3 while staying at or above the minimum;
 * the guarded statements are not visible here */
418 if (x_blocking % 2 == 0 && x_blocking >= min_x_blocking * 2)
420 else if (x_blocking % 3 == 0 && x_blocking >= min_x_blocking * 3)
/* still at least 4x the minimum: fall back to a single piece */
425 if (x_blocking >= min_x_blocking * 4) x_blocking = 1;
/* convert from pieces back to elements */
426 x_blocking *= conf_.x_block_;
/* Reduces one rectangular piece (ny_step rows by nx_step columns) of job
 * `job` from the per-thread buffers of a group into `dst`.
 * The destination row pitch is conf_.dst_x_ and the row pitch inside a job
 * buffer is conf_.job_size_x_.
 * NOTE(review): a plain-loop variant and a driver call appear back to back
 * below; the lines selecting between them (presumably preprocessor
 * conditionals) and the closing braces are not visible in this excerpt. */
430 template <impl::data_type_t data_type>
431 void cpu_reducer_2d_t<data_type>::reduce_block(const data_t* space_base,
432 data_t *dst, int job, int start_y, int start_x,
433 int ny_start, int nx_start, int ny_step, int nx_step) const {
/* first destination element of the piece */
434 data_t *d = dst + (start_y + ny_start) * conf_.dst_x_
435 + start_x + nx_start;
/* matching element in the buffer that space_base points to */
436 const data_t *space = space_base + job * balancer().job_size_
437 + ny_start * conf_.job_size_x_ + nx_start;
/* plain-loop variant: the first buffer (idg == 0) overwrites the destination,
 * the following ones are added to it */
439 for (int idg = 0; idg < balancer().nthr_per_group_; ++idg) {
440 const data_t *w = &space[idg * space_per_thread(balancer())];
441 for (int y = 0; y < ny_step; ++y)
442 for (int x = 0; x < nx_step; ++x) {
443 d[y * conf_.dst_x_ + x]
444 = (idg == 0 ? 0 : d[y * conf_.dst_x_ + x])
445 + w[y * conf_.job_size_x_ + x];
/* driver variant */
449 (*drv_)(d, space, ny_step, nx_step);
/* Reduces the per-thread buffers of the caller's group into `dst`, sharing
 * the work between the threads of the group: the group's jobs are first
 * split between sub-groups of threads, then each job is split between the
 * threads of a sub-group. Does nothing when groups have a single thread or
 * when balancer().idle(ithr) is true.
 * NOTE(review): several original lines (the declaration and updates of
 * `nxy`, loop heads, an early return, closing braces) are not visible in
 * this excerpt. */
453 template <impl::data_type_t data_type>
454 void cpu_reducer_2d_t<data_type>::reduce_nolock(int ithr, data_t *dst,
455 const memory_tracking::grantor_t &scratchpad) const {
456 bool redundant_reduction = balancer().nthr_per_group_ == 1
457 || balancer().idle(ithr);
458 if (redundant_reduction) return;
460 const int id_in_grp = balancer().id_in_group(ithr);
461 const int njobs_in_grp = balancer().ithr_njobs(ithr);
/* number of jobs needed to cover one destination row range along x */
462 const int njobs_x = utils::div_up(conf_.dst_x_, conf_.job_size_x_);
463 const int global_job_start = balancer().ithr_job_off(ithr);
/* buffer of the first thread of this group */
465 const data_t *space_base = get_local_ptr(ithr - id_in_grp, scratchpad);
/* split the group into pr_grps sub-groups of pr_nthr_per_grp threads each */
467 const int pr_grps = nstl::min(njobs_in_grp, balancer().nthr_per_group_);
468 const int pr_nthr_per_grp = balancer().nthr_per_group_ / pr_grps;
/* threads left over after the integer division above.
 * NOTE(review): the guarded statement is not visible; presumably these
 * threads return without doing any work -- confirm. */
470 if (id_in_grp >= pr_grps * pr_nthr_per_grp)
473 const int pr_my_grp = id_in_grp / pr_nthr_per_grp;
474 const int pr_my_id = id_in_grp % pr_nthr_per_grp;
/* range of the group's jobs handled by this thread's sub-group */
476 int pr_job_start{0}, pr_job_end{0};
477 balance211(njobs_in_grp, pr_grps, pr_my_grp, pr_job_start, pr_job_end);
479 for (int j = pr_job_start; j < pr_job_end; ++j) {
/* position of the job in the destination and its clipped extent */
480 const int global_job = global_job_start + j;
481 const int j_y = global_job / njobs_x;
482 const int j_x = global_job % njobs_x;
483 const int start_y = j_y * conf_.job_size_y_;
484 const int start_x = j_x * conf_.job_size_x_;
485 const int ny = nstl::min(conf_.dst_y_ - start_y, conf_.job_size_y_);
486 const int nx = nstl::min(conf_.dst_x_ - start_x, conf_.job_size_x_);
487 int x_blocking = choose_x_blocking(nx, ny, pr_nthr_per_grp);
/* split the job, counted in units of x_blocking elements, between the
 * threads of the sub-group, then convert the range back to elements */
489 int nxy_start{0}, nxy_end{0};
490 balance211(ny * nx / x_blocking, pr_nthr_per_grp, pr_my_id,
492 if (nxy_start == nxy_end) continue;
493 nxy_start *= x_blocking;
494 nxy_end *= x_blocking;
/* The linear range is processed in up to three pieces: a partial first row,
 * a run of complete rows and a partial last row. `nxy` is the current linear
 * position; its declaration and updates are not visible here. */
498 int nx_step = nstl::min(nx - nxy % nx, nxy_end - nxy);
499 reduce_block(space_base, dst, j, start_y, start_x,
500 nxy / nx, nxy % nx, 1, nx_step);
503 if ((nxy_end - nxy) > nx) {
504 int ny_step = (nxy_end - nxy) / nx;
505 reduce_block(space_base, dst, j, start_y, start_x,
506 nxy / nx, nxy % nx, ny_step, nx);
509 if ((nxy_end - nxy) > 0) {
510 reduce_block(space_base, dst, j, start_y, start_x,
511 nxy / nx, nxy % nx, 1, nxy_end - nxy);
/* explicit instantiations for the two supported data types */
516 template struct cpu_reducer_2d_t<data_type::f32>;
517 template struct cpu_reducer_2d_t<data_type::s32>;
519 /* accumulator section */
/* Creates a driver configured for a single source with zero leading
 * dimension and zero row steps, and without nullifying the destination,
 * so the kernel adds one source to the current destination contents. */
521 template <impl::data_type_t data_type>
522 cpu_accumulator_1d_t<data_type>::cpu_accumulator_1d_t(): drv_(nullptr) {
523 drv_ = create_reduce_2d_drv<data_type>(1, 0, 0, 0, false);
/* Destructor.
 * NOTE(review): the body is not visible in this excerpt; presumably it
 * releases drv_ like the other destructors in this file -- confirm. */
526 template <impl::data_type_t data_type>
527 cpu_accumulator_1d_t<data_type>::~cpu_accumulator_1d_t() {
/* Runs the driver on a single row of `size` elements, with `src` as the
 * only source and `dst` as the destination. */
531 template <impl::data_type_t data_type>
532 void cpu_accumulator_1d_t<data_type>::accumulate(data_t *dst,
533 const data_t *src, size_t size) {
534 (*drv_)(dst, src, 1, size);
/* explicit instantiations for the two supported data types */
537 template struct cpu_accumulator_1d_t<data_type::f32>;
538 template struct cpu_accumulator_1d_t<data_type::s32>;
544 // vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s