inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reducer.cpp

   1 /*******************************************************************************
   2 * Copyright 2017-2018 Intel Corporation
   3 *
   4 * Licensed under the Apache License, Version 2.0 (the "License");
   5 * you may not use this file except in compliance with the License.
   6 * You may obtain a copy of the License at
   7 *
   8 *     http://www.apache.org/licenses/LICENSE-2.0
   9 *
  10 * Unless required by applicable law or agreed to in writing, software
  11 * distributed under the License is distributed on an "AS IS" BASIS,
  12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 * See the License for the specific language governing permissions and
  14 * limitations under the License.
  15 *******************************************************************************/
  16
  17 #include <assert.h>
  18
  19 #include "mkldnn_thread.hpp"
  20 #include "mkldnn_types.h"
  21 #include "nstl.hpp"
  22 #include "utils.hpp"
  23
  24 #include "cpu_reducer.hpp"
  25
  26 namespace mkldnn {
  27 namespace impl {
  28 namespace cpu {
  29
  30 using namespace memory_tracking::names;
  31
  32 void reduce_balancer_t::balance() {
  33     using namespace nstl;
  34     using namespace utils;
  35
  36     assert(nthr_ > 0 && job_size_ > 0 && njobs_ > 0 && reduction_size_ > 0);
  37
  38     const int job_complexity = 1;
  39
  40     const int min_njobs_per_group = max(1, njobs_ / nthr_);
  41     const int max_njobs_per_group = max(1,
  42             static_cast<int>(max_buffer_size_ / (nthr_ * job_size_)));
  43
  44     /* initial guess */
  45     int ngroups = min(njobs_ / min_njobs_per_group, nthr_);
  46     int nthr_per_group = syncable_ ? min(nthr_ / ngroups, reduction_size_) : 1;
  47     int njobs_per_group_ub = div_up(njobs_, ngroups);
  48
  49     /* rough upper-bound estimation, will be fixed during brute force */
  50     size_t thread_complexity_ub = njobs_ * job_size_ * reduction_size_;
  51
  52     /* brute force parameters for the best balance... */
  53     for (int c_njobs_per_group = min_njobs_per_group;
  54             c_njobs_per_group < njobs_; ++c_njobs_per_group) {
  55         /* current assumption */
  56         int c_ngroups = min(njobs_ / c_njobs_per_group, nthr_);
  57         int c_nthr_per_group = syncable_
  58             ? min(nthr_ / c_ngroups, reduction_size_) : 1;
  59         int c_njobs_per_group_ub = div_up(njobs_, c_ngroups);
  60
  61         if (c_nthr_per_group > 1 && c_njobs_per_group_ub > max_njobs_per_group)
  62             continue;
  63
  64         int c_thread_reduction_ub = div_up(reduction_size_, c_nthr_per_group);
  65         size_t c_group_size_ub = job_size_ * c_njobs_per_group_ub;
  66         size_t c_thread_complexity_ub = c_group_size_ub * (
  67                 job_complexity * c_thread_reduction_ub
  68                 + (c_nthr_per_group != 1));
  69
  70         if (c_thread_complexity_ub < thread_complexity_ub) {
  71             ngroups = c_ngroups;
  72             nthr_per_group = c_nthr_per_group;
  73             njobs_per_group_ub = c_njobs_per_group_ub;
  74             thread_complexity_ub = c_thread_complexity_ub;
  75         }
  76     }
  77
  78     assert(njobs_per_group_ub <= max_njobs_per_group || nthr_per_group == 1);
  79     assert(ngroups * nthr_per_group <= nthr_);
  80     assert((size_t)njobs_per_group_ub * job_size_ * nthr_ <= max_buffer_size_
  81             || nthr_per_group == 1); /* no reduction buffer overflow */
  82     assert(IMPLICATION(!syncable_, nthr_per_group == 1));
  83
  84     ngroups_ = ngroups;
  85     nthr_per_group_ = nthr_per_group;
  86     njobs_per_group_ub_ = njobs_per_group_ub;
  87 }
  88
  89 /* reducer jit-ted driver */
  90
  91 using namespace Xbyak;
  92
  93 template <impl::data_type_t data_type>
  94 struct reducer_2d_driver_t: public c_compatible {
  95     typedef typename prec_traits<data_type>::type data_t;
  96
  97     reducer_2d_driver_t(int n_src, size_t src_ld,
  98             size_t src_step, size_t dst_step, bool nullify_dst)
  99         : n_src_(n_src), src_ld_(src_ld), src_step_(src_step)
 100         , dst_step_(dst_step), nullify_dst_(nullify_dst), ker_(nullptr) {}
 101     virtual ~reducer_2d_driver_t() {}
 102     void operator()(data_t *dst, const data_t *srcs, size_t ny, size_t nx)
 103     { assert(ker_); ker_(dst, srcs, ny, nx); }
 104
 105 protected:
 106     int n_src_;
 107     size_t src_ld_, src_step_, dst_step_;
 108     bool nullify_dst_;
 109     void (*ker_)(data_t *dst, const data_t *srcs, size_t ny, size_t nx);
 110 };
 111
 112 template <impl::data_type_t data_type, cpu_isa_t isa>
 113 struct reducer_2d_driver_f_s_32_t: public reducer_2d_driver_t<data_type>,
 114     public jit_generator
 115 {
 116     DECLARE_CPU_JIT_AUX_FUNCTIONS(reducer_2d_driver_f_s_32_t)
 117
 118     /* cpu specific part */
 119     using Vmm = typename utils::conditional<isa == avx2, Ymm, Zmm>::type;
 120     const AddressFrame &vmmword = (isa == avx2) ? yword : zword;
 121     void uni_vadd(const Xmm& x1, const Xmm& x2, const Operand& op)
 122     { if (data_type == data_type::f32) vaddps(x1, x2, op);
 123       else vpaddd(x1, x2, op); }
 124     void uni_add(const Xmm& x1, const Operand& op)
 125     { if (data_type == data_type::f32) addss(x1, op); else paddd(x1, op); }
 126
 127     const int vlen = cpu_isa_traits<isa>::vlen;
 128     const int typesize
 129         = sizeof(typename mkldnn::impl::prec_traits<data_type>::type);
 130     Xbyak::Reg64 reg_dst = abi_param1;
 131     Xbyak::Reg64 reg_src = abi_param2;
 132     Xbyak::Reg64 reg_ny = abi_param3;
 133     Xbyak::Reg64 reg_nx = abi_param4;
 134
 135     Xbyak::Reg64 reg_x = rax;
 136     Xbyak::Reg64 reg_src_id = r10;
 137
 138     reducer_2d_driver_f_s_32_t(int n_src, size_t src_ld, size_t src_step,
 139             size_t dst_step, bool nullify_dst)
 140         : reducer_2d_driver_t<data_type>(n_src, src_ld, src_step,
 141                 dst_step, nullify_dst)
 142     { generate(); }
 143
 144     void nullify_dst(int nloads, int load_len) {
 145         UNUSED(load_len);
 146         for (int i = 0; i < nloads; ++i)
 147             uni_vpxor(Vmm(i), Vmm(i), Vmm(i));
 148         /* prefetches[dst] ? */
 149     }
 150
 151     void load_dst(int nloads, int load_len) {
 152         for (int i = 0; i < nloads; ++i) {
 153             if (load_len == typesize)
 154                 movd(Xmm(i), ptr[reg_dst + i * load_len]);
 155             else if (load_len == vlen)
 156                 vmovups(Vmm(i), ptr[reg_dst + i * load_len]);
 157             else
 158                 assert(!"unsupported");
 159         }
 160     }
 161
 162     void store_dst(int nloads, int load_len) {
 163         for (int i = 0; i < nloads; ++i) {
 164             if (load_len == typesize)
 165                 movd(ptr[reg_dst + i * load_len], Xmm(i));
 166             else if (load_len == vlen)
 167                 vmovups(ptr[reg_dst + i * load_len], Vmm(i));
 168             else
 169                 assert(!"unsupported");
 170         }
 171     }
 172
 173     void accumulate(int nloads, int load_len, size_t base_off) {
 174         for (int i = 0; i < nloads; ++i) {
 175             size_t off = base_off + i * load_len;
 176
 177             if (load_len == typesize)
 178                 uni_add(Xmm(i), ptr[reg_src + off]);
 179             else if (load_len == vlen)
 180                 uni_vadd(Vmm(i), Vmm(i), vmmword[reg_src + off]);
 181             else
 182                 assert(!"unsupported");
 183         }
 184     }
 185
 186     void loop_x() {
 187         const int nloads[] = {cpu_isa_traits<isa>::n_vregs, 1, 1};
 188         const int nbranches = sizeof(nloads) / sizeof(nloads[0]);
 189
 190         const int load_len[nbranches] = {vlen, vlen, typesize};
 191         Label loop_x_label[nbranches + 1];
 192
 193         mov(reg_x, reg_nx);
 194
 195         for (int id = 0; id < nbranches; ++id) {
 196             L(loop_x_label[id]);
 197
 198             cmp(reg_x, nloads[id] * load_len[id]);
 199             jl(loop_x_label[id + 1], T_NEAR);
 200
 201             if (this->nullify_dst_)
 202                 nullify_dst(nloads[id], load_len[id]);
 203             else
 204                 load_dst(nloads[id], load_len[id]);
 205
 206             if (nloads[id] > 1) {
 207                 Label loop_srcs;
 208                 mov(reg_src_id, this->n_src_);
 209                 L(loop_srcs);
 210
 211                 accumulate(nloads[id], load_len[id], 0);
 212                 add(reg_src, this->src_ld_ * typesize);
 213
 214                 dec(reg_src_id);
 215                 jnz(loop_srcs, T_NEAR);
 216
 217                 sub(reg_src, this->n_src_ * this->src_ld_ * typesize);
 218             } else {
 219                 for (int src_id = 0; src_id < this->n_src_; ++src_id) {
 220                     const size_t base_off = src_id * this->src_ld_ * typesize;
 221                     accumulate(nloads[id], load_len[id], base_off);
 222                 }
 223             }
 224
 225             store_dst(nloads[id], load_len[id]);
 226
 227             add(reg_src, nloads[id] * load_len[id]);
 228             add(reg_dst, nloads[id] * load_len[id]);
 229
 230             sub(reg_x, nloads[id] * load_len[id]);
 231
 232             jmp(loop_x_label[id], T_NEAR);
 233         }
 234
 235         L(loop_x_label[nbranches]);
 236
 237         /* restore address registers */
 238         sub(reg_src, reg_nx);
 239         sub(reg_dst, reg_nx);
 240     }
 241
 242     void generate() {
 243         assert(isa == avx2 || isa == avx512_common || isa == avx512_mic);
 244
 245         preamble();
 246
 247         shl(reg_nx, 2);
 248
 249         Label ny_loop;
 250         L(ny_loop);
 251
 252         loop_x();
 253
 254         add(reg_dst, this->dst_step_ * typesize);
 255         add(reg_src, this->src_step_ * typesize);
 256
 257         dec(reg_ny);
 258         jnz(ny_loop, T_NEAR);
 259
 260         postamble();
 261         this->ker_ = reinterpret_cast<decltype(this->ker_)>(
 262             const_cast<uint8_t*>(this->getCode()));
 263     }
 264 };
 265
 266 template <impl::data_type_t data_type>
 267 inline reducer_2d_driver_t<data_type> *create_reduce_2d_drv(int n_src,
 268         size_t src_ld, size_t src_step, size_t dst_step, bool nullify_dst) {
 269     if (mayiuse(avx512_common))
 270         return new reducer_2d_driver_f_s_32_t<data_type, avx512_common>(n_src,
 271             src_ld, src_step, dst_step, nullify_dst);
 272     else if (mayiuse(avx2))
 273         return new reducer_2d_driver_f_s_32_t<data_type, avx2>(n_src, src_ld,
 274             src_step, dst_step, nullify_dst);
 275     assert(!"unimplemented");
 276     return nullptr;
 277 }
 278
 279 /* cpu_reducer_t */
 280
 281 template <impl::data_type_t data_type>
 282 void cpu_reducer_t<data_type>::conf_t::init_scratchpad(
 283         memory_tracking::registrar_t &scratchpad) const {
 284     if (balancer_.nthr_per_group_ == 1) return;
 285
 286     const size_t space_size = balancer_.ngroups_
 287         * (balancer_.nthr_per_group_ - 1)
 288         * cpu_reducer_t<data_type>::space_per_thread(balancer_);
 289     scratchpad.book(key_reducer_space, sizeof(data_t) * space_size, PAGE_4K);
 290     scratchpad.book(key_reducer_space_bctx,
 291             sizeof(simple_barrier::ctx_t) * balancer_.ngroups_);
 292 }
 293
 294 template <impl::data_type_t data_type>
 295 cpu_reducer_t<data_type>::cpu_reducer_t(const conf_t &conf)
 296     : conf_(conf), drv_(nullptr)
 297 {
 298     if (balancer().nthr_per_group_ == 1) return;
 299
 300     drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_ - 1,
 301             space_per_thread(balancer()), 0, 0, false);
 302 }
 303
 304 template <impl::data_type_t data_type>
 305 cpu_reducer_t<data_type>::~cpu_reducer_t() { delete drv_; }
 306
 307 template <impl::data_type_t data_type>
 308 typename cpu_reducer_t<data_type>::data_t *
 309 cpu_reducer_t<data_type>::get_local_ptr(int ithr, data_t *dst,
 310         const memory_tracking::grantor_t &scratchpad) const {
 311     const int id_in_grp = balancer().id_in_group(ithr);
 312
 313     /* threads 0 from each group writes directly to the destination */
 314     if (id_in_grp == 0)
 315         return dst + balancer().ithr_job_off(ithr) * balancer().job_size_;
 316
 317     const int grp_id = balancer().group_id(ithr);
 318     const int offset_factor = grp_id * (balancer().nthr_per_group_ - 1)
 319         + (id_in_grp - 1);
 320
 321     auto space = scratchpad.template get<data_t>(key_reducer_space);
 322     return space + offset_factor * space_per_thread(balancer());
 323 }
 324
 325 template <impl::data_type_t data_type>
 326 void cpu_reducer_t<data_type>::reduce_nolock(int ithr, data_t *dst,
 327         const memory_tracking::grantor_t &scratchpad) const {
 328     bool redundant_reduction = balancer().nthr_per_group_ == 1
 329         || balancer().idle(ithr);
 330     if (redundant_reduction) return;
 331
 332 #ifdef SIMPLE_IMPL
 333     if (balancer().id_in_group(ithr) != 0)
 334         return; /* only threads 0 do the reduction */
 335
 336     const int njobs_in_grp = balancer().ithr_njobs(ithr);
 337     data_t *d = get_local_ptr(ithr, dst, scratchpad);
 338     for (int id_in_grp = 1; id_in_grp < balancer_.nthr_per_group_; ++id_in_grp)
 339     {
 340         const data_t *space = get_local_ptr(ithr + id_in_grp, dst, scratchpad);
 341         for (size_t i = 0; i < (size_t)njobs_in_grp * balancer().job_size_; ++i)
 342             d[i] += space[i];
 343     }
 344 #else
 345     using namespace utils;
 346
 347     const int id_in_grp = balancer().id_in_group(ithr);
 348     const int njobs_in_grp = balancer().ithr_njobs(ithr);
 349     const size_t cl = 64 / sizeof(data_t);
 350
 351     const size_t reduction_size = njobs_in_grp * balancer().job_size_;
 352     size_t start{0}, end{0};
 353     balance211(div_up(reduction_size, cl), balancer().nthr_per_group_,
 354             id_in_grp, start, end);
 355
 356     if (start == end) return;
 357
 358     data_t *d = get_local_ptr(ithr - id_in_grp, dst, scratchpad) + start * cl;
 359     const data_t *space = get_local_ptr(ithr - id_in_grp + 1, dst, scratchpad)
 360         + start * cl;
 361     const size_t len = nstl::min(end * cl, reduction_size) - start * cl;
 362
 363     (*drv_)(d, space, 1, len);
 364 #endif
 365 }
 366
/* explicit instantiations of cpu_reducer_t for f32 and s32 */
template struct cpu_reducer_t<data_type::f32>;
template struct cpu_reducer_t<data_type::s32>;
 369
 370 /* cpu_reducer_2d_t */
 371
 372 template <impl::data_type_t data_type>
 373 void cpu_reducer_2d_t<data_type>::conf_t::init_scratchpad(
 374         memory_tracking::registrar_t &scratchpad) const {
 375     if (balancer_.nthr_per_group_ == 1) return;
 376
 377     const size_t space_size = balancer_.ngroups_ * balancer_.nthr_per_group_
 378         * cpu_reducer_2d_t<data_type>::space_per_thread(balancer_);
 379     scratchpad.book(key_reducer_space, sizeof(data_t) * space_size);
 380     scratchpad.book(key_reducer_space_bctx,
 381             sizeof(simple_barrier::ctx_t) * balancer_.ngroups_);
 382 }
 383
 384 template <impl::data_type_t data_type>
 385 cpu_reducer_2d_t<data_type>::cpu_reducer_2d_t(const conf_t &conf)
 386     : conf_(conf), drv_(nullptr)
 387 {
 388     if (balancer().nthr_per_group_ == 1) return;
 389
 390     drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_,
 391             space_per_thread(balancer()), conf_.job_size_x_, conf_.dst_x_,
 392             true);
 393 }
 394
 395 template <impl::data_type_t data_type>
 396 cpu_reducer_2d_t<data_type>::~cpu_reducer_2d_t() { delete drv_; }
 397
 398 template <impl::data_type_t data_type>
 399 typename cpu_reducer_2d_t<data_type>::data_t *cpu_reducer_2d_t<data_type>::
 400 get_local_ptr(int ithr, const memory_tracking::grantor_t &scratchpad) const {
 401     const int id_in_grp = balancer().id_in_group(ithr);
 402     const int grp_id = balancer().group_id(ithr);
 403     const int offset_factor = grp_id * balancer().nthr_per_group_ + id_in_grp;
 404     auto space = scratchpad.template get<data_t>(key_reducer_space);
 405     return space + offset_factor * space_per_thread(balancer());
 406 }
 407
 408 template <impl::data_type_t data_type>
 409 int cpu_reducer_2d_t<data_type>::choose_x_blocking(int nx, int ny,
 410         int nthr_per_grp) const {
 411     // find x_blocking for better balance reducing work between threads
 412     assert(conf_.x_block_ > 0 && nx > conf_.x_block_
 413             && nx % conf_.x_block_ == 0);
 414     int x_blocking = nx / conf_.x_block_;
 415     int min_x_blocking =
 416             utils::div_up(x_blocking, nstl::max(1, nthr_per_grp / ny));
 417     while (true) {
 418         if (x_blocking % 2 == 0 && x_blocking >= min_x_blocking * 2)
 419             x_blocking /= 2;
 420         else if (x_blocking % 3 == 0 && x_blocking >= min_x_blocking * 3)
 421             x_blocking /= 3;
 422         else
 423             break;
 424     }
 425     if (x_blocking >= min_x_blocking * 4) x_blocking = 1;
 426     x_blocking *= conf_.x_block_;
 427     return x_blocking;
 428 }
 429
 430 template <impl::data_type_t data_type>
 431 void cpu_reducer_2d_t<data_type>::reduce_block(const data_t* space_base,
 432         data_t *dst, int job, int start_y, int start_x,
 433         int ny_start, int nx_start, int ny_step, int nx_step) const {
 434     data_t *d = dst + (start_y + ny_start) * conf_.dst_x_
 435                     + start_x + nx_start;
 436     const data_t *space = space_base + job * balancer().job_size_
 437                             + ny_start * conf_.job_size_x_ + nx_start;
 438 #ifdef SIMPLE_IMPL
 439     for (int idg = 0; idg < balancer().nthr_per_group_; ++idg) {
 440         const data_t *w = &space[idg * space_per_thread(balancer())];
 441         for (int y = 0; y < ny_step; ++y)
 442             for (int x = 0; x < nx_step; ++x) {
 443                 d[y * conf_.dst_x_ + x]
 444                     = (idg == 0 ? 0 : d[y * conf_.dst_x_ + x])
 445                     + w[y * conf_.job_size_x_ + x];
 446             }
 447     }
 448 #else
 449     (*drv_)(d, space, ny_step, nx_step);
 450 #endif
 451 }
 452
 453 template <impl::data_type_t data_type>
 454 void cpu_reducer_2d_t<data_type>::reduce_nolock(int ithr, data_t *dst,
 455         const memory_tracking::grantor_t &scratchpad) const {
 456     bool redundant_reduction = balancer().nthr_per_group_ == 1
 457         || balancer().idle(ithr);
 458     if (redundant_reduction) return;
 459
 460     const int id_in_grp = balancer().id_in_group(ithr);
 461     const int njobs_in_grp = balancer().ithr_njobs(ithr);
 462     const int njobs_x = utils::div_up(conf_.dst_x_, conf_.job_size_x_);
 463     const int global_job_start = balancer().ithr_job_off(ithr);
 464
 465     const data_t *space_base = get_local_ptr(ithr - id_in_grp, scratchpad);
 466
 467     const int pr_grps = nstl::min(njobs_in_grp, balancer().nthr_per_group_);
 468     const int pr_nthr_per_grp = balancer().nthr_per_group_ / pr_grps;
 469
 470     if (id_in_grp >= pr_grps * pr_nthr_per_grp)
 471         return; /* idle */
 472
 473     const int pr_my_grp = id_in_grp / pr_nthr_per_grp;
 474     const int pr_my_id = id_in_grp % pr_nthr_per_grp;
 475
 476     int pr_job_start{0}, pr_job_end{0};
 477     balance211(njobs_in_grp, pr_grps, pr_my_grp, pr_job_start, pr_job_end);
 478
 479     for (int j = pr_job_start; j < pr_job_end; ++j) {
 480         const int global_job = global_job_start + j;
 481         const int j_y = global_job / njobs_x;
 482         const int j_x = global_job % njobs_x;
 483         const int start_y = j_y * conf_.job_size_y_;
 484         const int start_x = j_x * conf_.job_size_x_;
 485         const int ny = nstl::min(conf_.dst_y_ - start_y, conf_.job_size_y_);
 486         const int nx = nstl::min(conf_.dst_x_ - start_x, conf_.job_size_x_);
 487         int x_blocking = choose_x_blocking(nx, ny, pr_nthr_per_grp);
 488
 489         int nxy_start{0}, nxy_end{0};
 490         balance211(ny * nx / x_blocking, pr_nthr_per_grp, pr_my_id,
 491                     nxy_start, nxy_end);
 492         if (nxy_start == nxy_end) continue;
 493         nxy_start *= x_blocking;
 494         nxy_end *= x_blocking;
 495
 496         int nxy = nxy_start;
 497         if (nxy % nx != 0) {
 498             int nx_step = nstl::min(nx - nxy % nx, nxy_end - nxy);
 499             reduce_block(space_base, dst, j, start_y, start_x,
 500                         nxy / nx, nxy % nx, 1, nx_step);
 501             nxy += nx_step;
 502         }
 503         if ((nxy_end - nxy) > nx) {
 504             int ny_step = (nxy_end - nxy) / nx;
 505             reduce_block(space_base, dst, j, start_y, start_x,
 506                         nxy / nx, nxy % nx, ny_step, nx);
 507             nxy += nx * ny_step;
 508         }
 509         if ((nxy_end - nxy) > 0) {
 510             reduce_block(space_base, dst, j, start_y, start_x,
 511                         nxy / nx, nxy % nx, 1, nxy_end - nxy);
 512         }
 513     }
 514 }
 515
/* explicit instantiations of cpu_reducer_2d_t for f32 and s32 */
template struct cpu_reducer_2d_t<data_type::f32>;
template struct cpu_reducer_2d_t<data_type::s32>;
 518
 519 /* accumulator section */
 520
 521 template <impl::data_type_t data_type>
 522 cpu_accumulator_1d_t<data_type>::cpu_accumulator_1d_t(): drv_(nullptr) {
 523     drv_ = create_reduce_2d_drv<data_type>(1, 0, 0, 0, false);
 524 }
 525
 526 template <impl::data_type_t data_type>
 527 cpu_accumulator_1d_t<data_type>::~cpu_accumulator_1d_t() {
 528     delete drv_;
 529 }
 530
 531 template <impl::data_type_t data_type>
 532 void cpu_accumulator_1d_t<data_type>::accumulate(data_t *dst,
 533         const data_t *src, size_t size) {
 534     (*drv_)(dst, src, 1, size);
 535 }
 536
/* explicit instantiations of cpu_accumulator_1d_t for f32 and s32 */
template struct cpu_accumulator_1d_t<data_type::f32>;
template struct cpu_accumulator_1d_t<data_type::s32>;
 539
 540 }
 541 }
 542 }
 543
 544 // vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s