// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2006 Red Hat GmbH
 *
 * This file is released under the GPL.
 *
 * Kcopyd provides a simple interface for copying an area of one
 * block-device to one or more other block-devices, with an asynchronous
 * completion notification.
 */
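
/*
 * For reference, a minimal caller-side sketch (not part of this file;
 * 'src', 'dst' and 'ctx' are assumed names and error handling is elided):
 *
 *	static void copy_done(int read_err, unsigned long write_err,
 *			      void *context)
 *	{
 *		both error arguments are zero on success
 *	}
 *
 *	struct dm_kcopyd_client *kc = dm_kcopyd_client_create(NULL);
 *	struct dm_io_region from = { .bdev = src, .sector = 0, .count = 128 };
 *	struct dm_io_region to = { .bdev = dst, .sector = 0, .count = 128 };
 *
 *	dm_kcopyd_copy(kc, &from, 1, &to, 0, copy_done, ctx);
 */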

#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/delay.h>
#include <linux/device-mapper.h>
#include <linux/dm-kcopyd.h>

#include "dm-core.h"

#define SPLIT_COUNT	8
#define MIN_JOBS	8

#define DEFAULT_SUB_JOB_SIZE_KB 512
#define MAX_SUB_JOB_SIZE_KB	1024

static unsigned int kcopyd_subjob_size_kb = DEFAULT_SUB_JOB_SIZE_KB;

module_param(kcopyd_subjob_size_kb, uint, 0644);
MODULE_PARM_DESC(kcopyd_subjob_size_kb, "Sub-job size for dm-kcopyd clients");

static unsigned int dm_get_kcopyd_subjob_size(void)
{
	unsigned int sub_job_size_kb;

	sub_job_size_kb = __dm_get_module_param(&kcopyd_subjob_size_kb,
						DEFAULT_SUB_JOB_SIZE_KB,
						MAX_SUB_JOB_SIZE_KB);

	return sub_job_size_kb << 1;
}
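
/*
 * Note: the "<< 1" above converts KiB to 512-byte sectors (1 KiB = 2
 * sectors), so the default of 512 KiB yields a sub_job_size of 1024
 * sectors.
 */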

/*
 *----------------------------------------------------------------
 * Each kcopyd client has its own little pool of preallocated
 * pages for kcopyd io.
 *---------------------------------------------------------------
 */
struct dm_kcopyd_client {
	struct page_list *pages;
	unsigned int nr_reserved_pages;
	unsigned int nr_free_pages;
	unsigned int sub_job_size;

	struct dm_io_client *io_client;

	wait_queue_head_t destroyq;

	mempool_t job_pool;

	struct workqueue_struct *kcopyd_wq;
	struct work_struct kcopyd_work;

	struct dm_kcopyd_throttle *throttle;

	atomic_t nr_jobs;

/*
 * We maintain four lists of jobs:
 *
 * i)   jobs waiting for pages
 * ii)  jobs that have pages, and are waiting for the io to be issued.
 * iii) jobs that don't need to do any IO and just run a callback
 * iv)  jobs that have completed.
 *
 * All four of these are protected by job_lock.
 */
	spinlock_t job_lock;
	struct list_head callback_jobs;
	struct list_head complete_jobs;
	struct list_head io_jobs;
	struct list_head pages_jobs;
};

static struct page_list zero_page_list;

static DEFINE_SPINLOCK(throttle_spinlock);

/*
 * IO/IDLE accounting slowly decays after (1 << ACCOUNT_INTERVAL_SHIFT) period.
 * When total_period >= (1 << ACCOUNT_INTERVAL_SHIFT) the counters are divided
 * by 2.
 */
#define ACCOUNT_INTERVAL_SHIFT	SHIFT_HZ

/*
 * Sleep this number of microseconds.
 *
 * The value was decided experimentally.
 * Smaller values seem to cause an increased copy rate above the limit.
 * The reason for this is unknown but possibly due to jiffies rounding errors
 * or read/write cache inside the disk.
 */
#define SLEEP_USEC	100000

/*
 * Maximum number of sleep events. There is a theoretical livelock if more
 * kcopyd clients do work simultaneously which this limit avoids.
 */
#define MAX_SLEEPS	10
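
/*
 * Rough sketch of the accounting below: with throttle set to e.g. 20,
 * io_job_start() computes
 *
 *	skew = io_period - 20 * total_period / 100
 *
 * and sleeps (up to MAX_SLEEPS times) while skew > 0, i.e. while I/O has
 * been active for more than 20% of the elapsed jiffies in the current
 * accounting window.
 */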

static void io_job_start(struct dm_kcopyd_throttle *t)
{
	unsigned int throttle, now, difference;
	int slept = 0, skew;

	if (unlikely(!t))
		return;

try_again:
	spin_lock_irq(&throttle_spinlock);

	throttle = READ_ONCE(t->throttle);

	if (likely(throttle >= 100))
		goto skip_limit;

	now = jiffies;
	difference = now - t->last_jiffies;
	t->last_jiffies = now;
	if (t->num_io_jobs)
		t->io_period += difference;
	t->total_period += difference;

	/*
	 * Maintain sane values if we got a temporary overflow.
	 */
	if (unlikely(t->io_period > t->total_period))
		t->io_period = t->total_period;

	if (unlikely(t->total_period >= (1 << ACCOUNT_INTERVAL_SHIFT))) {
		int shift = fls(t->total_period >> ACCOUNT_INTERVAL_SHIFT);

		t->total_period >>= shift;
		t->io_period >>= shift;
	}

	skew = t->io_period - throttle * t->total_period / 100;

	if (unlikely(skew > 0) && slept < MAX_SLEEPS) {
		slept++;
		spin_unlock_irq(&throttle_spinlock);
		fsleep(SLEEP_USEC);
		goto try_again;
	}

skip_limit:
	t->num_io_jobs++;

	spin_unlock_irq(&throttle_spinlock);
}

static void io_job_finish(struct dm_kcopyd_throttle *t)
{
	unsigned long flags;

	if (unlikely(!t))
		return;

	spin_lock_irqsave(&throttle_spinlock, flags);

	t->num_io_jobs--;

	if (likely(READ_ONCE(t->throttle) >= 100))
		goto skip_limit;

	if (!t->num_io_jobs) {
		unsigned int now, difference;

		now = jiffies;
		difference = now - t->last_jiffies;
		t->last_jiffies = now;

		t->io_period += difference;
		t->total_period += difference;

		/*
		 * Maintain sane values if we got a temporary overflow.
		 */
		if (unlikely(t->io_period > t->total_period))
			t->io_period = t->total_period;
	}

skip_limit:
	spin_unlock_irqrestore(&throttle_spinlock, flags);
}

static void wake(struct dm_kcopyd_client *kc)
{
	queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
}

/*
 * Obtain one page for the use of kcopyd.
 */
static struct page_list *alloc_pl(gfp_t gfp)
{
	struct page_list *pl;

	pl = kmalloc(sizeof(*pl), gfp);
	if (!pl)
		return NULL;

	pl->page = alloc_page(gfp | __GFP_HIGHMEM);
	if (!pl->page) {
		kfree(pl);
		return NULL;
	}

	return pl;
}

static void free_pl(struct page_list *pl)
{
	__free_page(pl->page);
	kfree(pl);
}

/*
 * Add the provided pages to a client's free page list, releasing
 * back to the system any beyond the reserved_pages limit.
 */
static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl)
{
	struct page_list *next;

	do {
		next = pl->next;

		if (kc->nr_free_pages >= kc->nr_reserved_pages)
			free_pl(pl);
		else {
			pl->next = kc->pages;
			kc->pages = pl;
			kc->nr_free_pages++;
		}

		pl = next;
	} while (pl);
}

static int kcopyd_get_pages(struct dm_kcopyd_client *kc,
			    unsigned int nr, struct page_list **pages)
{
	struct page_list *pl;

	*pages = NULL;

	do {
		pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM);
		if (unlikely(!pl)) {
			/* Use reserved pages */
			pl = kc->pages;
			if (unlikely(!pl))
				goto out_of_memory;
			kc->pages = pl->next;
			kc->nr_free_pages--;
		}
		pl->next = *pages;
		*pages = pl;
	} while (--nr);

	return 0;

out_of_memory:
	if (*pages)
		kcopyd_put_pages(kc, *pages);
	return -ENOMEM;
}

/*
 * These three functions resize the page pool.
 */
static void drop_pages(struct page_list *pl)
{
	struct page_list *next;

	while (pl) {
		next = pl->next;
		free_pl(pl);
		pl = next;
	}
}

/*
 * Allocate and reserve nr_pages for the use of a specific client.
 */
static int client_reserve_pages(struct dm_kcopyd_client *kc, unsigned int nr_pages)
{
	unsigned int i;
	struct page_list *pl = NULL, *next;

	for (i = 0; i < nr_pages; i++) {
		next = alloc_pl(GFP_KERNEL);
		if (!next) {
			if (pl)
				drop_pages(pl);
			return -ENOMEM;
		}
		next->next = pl;
		pl = next;
	}

	kc->nr_reserved_pages += nr_pages;
	kcopyd_put_pages(kc, pl);

	return 0;
}

static void client_free_pages(struct dm_kcopyd_client *kc)
{
	BUG_ON(kc->nr_free_pages != kc->nr_reserved_pages);
	drop_pages(kc->pages);
	kc->pages = NULL;
	kc->nr_free_pages = kc->nr_reserved_pages = 0;
}

/*
 *---------------------------------------------------------------
 * kcopyd_jobs need to be allocated by the *clients* of kcopyd,
 * for this reason we use a mempool to prevent the client from
 * ever having to do io (which could cause a deadlock).
 *---------------------------------------------------------------
 */
struct kcopyd_job {
	struct dm_kcopyd_client *kc;
	struct list_head list;
	unsigned int flags;

	/*
	 * Error state of the job.
	 */
	int read_err;
	unsigned long write_err;

	/*
	 * REQ_OP_READ, REQ_OP_WRITE or REQ_OP_WRITE_ZEROES.
	 */
	enum req_op op;
	struct dm_io_region source;

	/*
	 * The destinations for the transfer.
	 */
	unsigned int num_dests;
	struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS];

	struct page_list *pages;

	/*
	 * Set this to ensure you are notified when the job has
	 * completed.  'context' is for callback to use.
	 */
	dm_kcopyd_notify_fn fn;
	void *context;

	/*
	 * These fields are only used if the job has been split
	 * into more manageable parts.
	 */
	struct mutex lock;
	atomic_t sub_jobs;
	sector_t progress;
	sector_t write_offset;

	struct kcopyd_job *master_job;
};

static struct kmem_cache *_job_cache;

int __init dm_kcopyd_init(void)
{
	/*
	 * Each cache object is an array of one master job followed by
	 * SPLIT_COUNT sub jobs (see split_job() below).
	 */
	_job_cache = kmem_cache_create("kcopyd_job",
				sizeof(struct kcopyd_job) * (SPLIT_COUNT + 1),
				__alignof__(struct kcopyd_job), 0, NULL);
	if (!_job_cache)
		return -ENOMEM;

	/* A single self-linked entry pointing at the kernel's zero page. */
	zero_page_list.next = &zero_page_list;
	zero_page_list.page = ZERO_PAGE(0);

	return 0;
}

void dm_kcopyd_exit(void)
{
	kmem_cache_destroy(_job_cache);
	_job_cache = NULL;
}

/*
 * Functions to push and pop a job onto the head of a given job
 * list.
 */
static struct kcopyd_job *pop_io_job(struct list_head *jobs,
				     struct dm_kcopyd_client *kc)
{
	struct kcopyd_job *job;

	/*
	 * For I/O jobs, pop any read, any write without sequential write
	 * constraint and sequential writes that are at the right position.
	 */
	list_for_each_entry(job, jobs, list) {
		if (job->op == REQ_OP_READ ||
		    !(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) {
			list_del(&job->list);
			return job;
		}

		if (job->write_offset == job->master_job->write_offset) {
			job->master_job->write_offset += job->source.count;
			list_del(&job->list);
			return job;
		}
	}

	return NULL;
}

static struct kcopyd_job *pop(struct list_head *jobs,
			      struct dm_kcopyd_client *kc)
{
	struct kcopyd_job *job = NULL;

	spin_lock_irq(&kc->job_lock);

	if (!list_empty(jobs)) {
		if (jobs == &kc->io_jobs)
			job = pop_io_job(jobs, kc);
		else {
			job = list_entry(jobs->next, struct kcopyd_job, list);
			list_del(&job->list);
		}
	}
	spin_unlock_irq(&kc->job_lock);

	return job;
}

static void push(struct list_head *jobs, struct kcopyd_job *job)
{
	unsigned long flags;
	struct dm_kcopyd_client *kc = job->kc;

	spin_lock_irqsave(&kc->job_lock, flags);
	list_add_tail(&job->list, jobs);
	spin_unlock_irqrestore(&kc->job_lock, flags);
}

static void push_head(struct list_head *jobs, struct kcopyd_job *job)
{
	struct dm_kcopyd_client *kc = job->kc;

	spin_lock_irq(&kc->job_lock);
	list_add(&job->list, jobs);
	spin_unlock_irq(&kc->job_lock);
}

/*
 * These three functions process 1 item from the corresponding
 * job list.
 *
 * They return:
 * < 0: error
 *   0: success
 * > 0: can't process yet.
 */
static int run_complete_job(struct kcopyd_job *job)
{
	void *context = job->context;
	int read_err = job->read_err;
	unsigned long write_err = job->write_err;
	dm_kcopyd_notify_fn fn = job->fn;
	struct dm_kcopyd_client *kc = job->kc;

	if (job->pages && job->pages != &zero_page_list)
		kcopyd_put_pages(kc, job->pages);
	/*
	 * If this is the master job, the sub jobs have already
	 * completed so we can free everything.
	 */
	if (job->master_job == job) {
		mutex_destroy(&job->lock);
		mempool_free(job, &kc->job_pool);
	}
	fn(read_err, write_err, context);

	if (atomic_dec_and_test(&kc->nr_jobs))
		wake_up(&kc->destroyq);

	return 0;
}

static void complete_io(unsigned long error, void *context)
{
	struct kcopyd_job *job = context;
	struct dm_kcopyd_client *kc = job->kc;

	io_job_finish(kc->throttle);

	if (error) {
		if (op_is_write(job->op))
			job->write_err |= error;
		else
			job->read_err = 1;

		if (!(job->flags & BIT(DM_KCOPYD_IGNORE_ERROR))) {
			push(&kc->complete_jobs, job);
			wake(kc);
			return;
		}
	}

	if (op_is_write(job->op))
		push(&kc->complete_jobs, job);
	else {
		/* The read is done; resubmit the job as a write. */
		job->op = REQ_OP_WRITE;
		push(&kc->io_jobs, job);
	}

	wake(kc);
}

/*
 * Request io on as many buffer heads as we can currently get for
 * a particular job.
 */
static int run_io_job(struct kcopyd_job *job)
{
	int r;
	struct dm_io_request io_req = {
		.bi_opf = job->op,
		.mem.type = DM_IO_PAGE_LIST,
		.mem.ptr.pl = job->pages,
		.mem.offset = 0,
		.notify.fn = complete_io,
		.notify.context = job,
		.client = job->kc->io_client,
	};

	/*
	 * If we need to write sequentially and some reads or writes failed,
	 * no point in continuing.
	 */
	if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) &&
	    job->master_job->write_err) {
		job->write_err = job->master_job->write_err;
		return -EIO;
	}

	io_job_start(job->kc->throttle);

	if (job->op == REQ_OP_READ)
		r = dm_io(&io_req, 1, &job->source, NULL);
	else
		r = dm_io(&io_req, job->num_dests, job->dests, NULL);

	return r;
}

static int run_pages_job(struct kcopyd_job *job)
{
	int r;
	/* count is in sectors; PAGE_SIZE >> 9 is the number of sectors per page */
	unsigned int nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9);

	r = kcopyd_get_pages(job->kc, nr_pages, &job->pages);
	if (!r) {
		/* this job is ready for io */
		push(&job->kc->io_jobs, job);
		return 0;
	}

	if (r == -ENOMEM)
		/* can't complete now */
		return 1;

	return r;
}

/*
 * Run through a list for as long as possible.  Returns the count
 * of successful jobs.
 */
static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
			int (*fn)(struct kcopyd_job *))
{
	struct kcopyd_job *job;
	int r, count = 0;

	while ((job = pop(jobs, kc))) {

		r = fn(job);

		if (r < 0) {
			/* error this rogue job */
			if (op_is_write(job->op))
				job->write_err = (unsigned long) -1L;
			else
				job->read_err = 1;
			push(&kc->complete_jobs, job);
			wake(kc);
			break;
		}

		if (r > 0) {
			/*
			 * We couldn't service this job ATM, so
			 * push this job back onto the list.
			 */
			push_head(jobs, job);
			break;
		}

		count++;
	}

	return count;
}

/*
 * kcopyd does this every time it's woken up.
 */
static void do_work(struct work_struct *work)
{
	struct dm_kcopyd_client *kc = container_of(work,
					struct dm_kcopyd_client, kcopyd_work);
	struct blk_plug plug;

	/*
	 * The order that these are called is *very* important.
	 * complete jobs can free some pages for pages jobs.
	 * Pages jobs when successful will jump onto the io jobs
	 * list.  io jobs call wake when they complete and it all
	 * starts again.
	 */
	spin_lock_irq(&kc->job_lock);
	list_splice_tail_init(&kc->callback_jobs, &kc->complete_jobs);
	spin_unlock_irq(&kc->job_lock);

	blk_start_plug(&plug);
	process_jobs(&kc->complete_jobs, kc, run_complete_job);
	process_jobs(&kc->pages_jobs, kc, run_pages_job);
	process_jobs(&kc->io_jobs, kc, run_io_job);
	blk_finish_plug(&plug);
}

/*
 * If we are copying a small region we just dispatch a single job
 * to do the copy, otherwise the io has to be split up into many
 * jobs.
 */
static void dispatch_job(struct kcopyd_job *job)
{
	struct dm_kcopyd_client *kc = job->kc;

	atomic_inc(&kc->nr_jobs);
	if (unlikely(!job->source.count))
		push(&kc->callback_jobs, job);
	else if (job->pages == &zero_page_list)
		push(&kc->io_jobs, job);
	else
		push(&kc->pages_jobs, job);
	wake(kc);
}

static void segment_complete(int read_err, unsigned long write_err,
			     void *context)
{
	/* FIXME: tidy this function */
	sector_t progress = 0;
	sector_t count = 0;
	struct kcopyd_job *sub_job = context;
	struct kcopyd_job *job = sub_job->master_job;
	struct dm_kcopyd_client *kc = job->kc;

	mutex_lock(&job->lock);

	/* update the error */
	if (read_err)
		job->read_err = 1;

	if (write_err)
		job->write_err |= write_err;

	/*
	 * Only dispatch more work if there hasn't been an error.
	 */
	if ((!job->read_err && !job->write_err) ||
	    job->flags & BIT(DM_KCOPYD_IGNORE_ERROR)) {
		/* get the next chunk of work */
		progress = job->progress;
		count = job->source.count - progress;
		if (count) {
			if (count > kc->sub_job_size)
				count = kc->sub_job_size;

			job->progress += count;
		}
	}
	mutex_unlock(&job->lock);

	if (count) {
		int i;

		*sub_job = *job;
		sub_job->write_offset = progress;
		sub_job->source.sector += progress;
		sub_job->source.count = count;

		for (i = 0; i < job->num_dests; i++) {
			sub_job->dests[i].sector += progress;
			sub_job->dests[i].count = count;
		}

		sub_job->fn = segment_complete;
		sub_job->context = sub_job;
		dispatch_job(sub_job);

	} else if (atomic_dec_and_test(&job->sub_jobs)) {

		/*
		 * Queue the completion callback to the kcopyd thread.
		 *
		 * Some callers assume that all the completions are called
		 * from a single thread and don't race with each other.
		 *
		 * We must not call the callback directly here because this
		 * code may not be executing in the thread.
		 */
		push(&kc->complete_jobs, job);
		wake(kc);
	}
}

/*
 * Create some sub jobs to share the work between them.
 */
static void split_job(struct kcopyd_job *master_job)
{
	int i;

	atomic_inc(&master_job->kc->nr_jobs);

	atomic_set(&master_job->sub_jobs, SPLIT_COUNT);
	for (i = 0; i < SPLIT_COUNT; i++) {
		master_job[i + 1].master_job = master_job;
		segment_complete(0, 0u, &master_job[i + 1]);
	}
}
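
/*
 * A sketch of the split: the master job was allocated as an array of
 * 1 + SPLIT_COUNT jobs, so master_job[1..SPLIT_COUNT] are the sub jobs.
 * Each call to segment_complete() above claims the next sub_job_size
 * chunk of the source under job->lock, so up to SPLIT_COUNT chunks stay
 * in flight until job->progress reaches job->source.count.
 */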

void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
		    unsigned int num_dests, struct dm_io_region *dests,
		    unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
{
	struct kcopyd_job *job;
	int i;

	/*
	 * Allocate an array of jobs consisting of one master job
	 * followed by SPLIT_COUNT sub jobs.
	 */
	job = mempool_alloc(&kc->job_pool, GFP_NOIO);
	mutex_init(&job->lock);

	/*
	 * set up for the read.
	 */
	job->kc = kc;
	job->flags = flags;
	job->read_err = 0;
	job->write_err = 0;

	job->num_dests = num_dests;
	memcpy(&job->dests, dests, sizeof(*dests) * num_dests);

	/*
	 * If one of the destinations is a host-managed zoned block device,
	 * we need to write sequentially. If one of the destinations is a
	 * host-aware device, then leave it to the caller to choose what to do.
	 */
	if (!(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) {
		for (i = 0; i < job->num_dests; i++) {
			if (bdev_zoned_model(dests[i].bdev) == BLK_ZONED_HM) {
				job->flags |= BIT(DM_KCOPYD_WRITE_SEQ);
				break;
			}
		}
	}

	/*
	 * If we need to write sequentially, errors cannot be ignored.
	 */
	if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) &&
	    job->flags & BIT(DM_KCOPYD_IGNORE_ERROR))
		job->flags &= ~BIT(DM_KCOPYD_IGNORE_ERROR);

	if (from) {
		job->source = *from;
		job->pages = NULL;
		job->op = REQ_OP_READ;
	} else {
		memset(&job->source, 0, sizeof(job->source));
		job->source.count = job->dests[0].count;
		job->pages = &zero_page_list;

		/*
		 * Use WRITE ZEROES to optimize zeroing if all dests support it.
		 */
		job->op = REQ_OP_WRITE_ZEROES;
		for (i = 0; i < job->num_dests; i++)
			if (!bdev_write_zeroes_sectors(job->dests[i].bdev)) {
				job->op = REQ_OP_WRITE;
				break;
			}
	}

	job->fn = fn;
	job->context = context;
	job->master_job = job;
	job->write_offset = 0;

	if (job->source.count <= kc->sub_job_size)
		dispatch_job(job);
	else {
		job->progress = 0;
		split_job(job);
	}
}
EXPORT_SYMBOL(dm_kcopyd_copy);
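
/*
 * For illustration, a hedged caller-side sketch of a 1:2 copy (mirroring
 * one source region to two destinations; 'src', 'dst0', 'dst1', 'done'
 * and 'ctx' are assumed names, not part of this file):
 *
 *	struct dm_io_region from = { .bdev = src, .sector = 0, .count = 1024 };
 *	struct dm_io_region to[2] = {
 *		{ .bdev = dst0, .sector = 0, .count = 1024 },
 *		{ .bdev = dst1, .sector = 0, .count = 1024 },
 *	};
 *
 *	dm_kcopyd_copy(kc, &from, 2, to, 0, done, ctx);
 */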

void dm_kcopyd_zero(struct dm_kcopyd_client *kc,
		    unsigned int num_dests, struct dm_io_region *dests,
		    unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
{
	dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context);
}
EXPORT_SYMBOL(dm_kcopyd_zero);
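
/*
 * A NULL source is how zeroing is requested: dm_kcopyd_copy() then uses
 * zero_page_list, or REQ_OP_WRITE_ZEROES when every destination supports
 * it. E.g. dm_kcopyd_zero(kc, 1, &to, 0, done, ctx); with 'to', 'done'
 * and 'ctx' as assumed caller-side names.
 */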

void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
				 dm_kcopyd_notify_fn fn, void *context)
{
	struct kcopyd_job *job;

	job = mempool_alloc(&kc->job_pool, GFP_NOIO);

	memset(job, 0, sizeof(struct kcopyd_job));
	job->kc = kc;
	job->fn = fn;
	job->context = context;
	job->master_job = job;

	atomic_inc(&kc->nr_jobs);

	return job;
}
EXPORT_SYMBOL(dm_kcopyd_prepare_callback);

void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err)
{
	struct kcopyd_job *job = j;
	struct dm_kcopyd_client *kc = job->kc;

	job->read_err = read_err;
	job->write_err = write_err;

	push(&kc->callback_jobs, job);
	wake(kc);
}
EXPORT_SYMBOL(dm_kcopyd_do_callback);
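
/*
 * prepare/do are meant to be used as a pair when a caller wants its
 * completion to run from the kcopyd thread, serialized with copy
 * completions. A hedged sketch ('done' and 'ctx' are assumed names):
 *
 *	void *cb = dm_kcopyd_prepare_callback(kc, done, ctx);
 *	...
 *	dm_kcopyd_do_callback(cb, 0, 0);
 */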

/*
 * Cancels a kcopyd job, eg. someone might be deactivating a
 * mirror.
 */
#if 0
int kcopyd_cancel(struct kcopyd_job *job, int block)
{
	/* FIXME: finish */
	return -1;
}
#endif  /*  0  */

/*
 *---------------------------------------------------------------
 * Client setup
 *---------------------------------------------------------------
 */
struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle)
{
	int r;
	unsigned int reserve_pages;
	struct dm_kcopyd_client *kc;

	kc = kzalloc(sizeof(*kc), GFP_KERNEL);
	if (!kc)
		return ERR_PTR(-ENOMEM);

	spin_lock_init(&kc->job_lock);
	INIT_LIST_HEAD(&kc->callback_jobs);
	INIT_LIST_HEAD(&kc->complete_jobs);
	INIT_LIST_HEAD(&kc->io_jobs);
	INIT_LIST_HEAD(&kc->pages_jobs);
	kc->throttle = throttle;

	r = mempool_init_slab_pool(&kc->job_pool, MIN_JOBS, _job_cache);
	if (r)
		goto bad_slab;

	INIT_WORK(&kc->kcopyd_work, do_work);
	kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0);
	if (!kc->kcopyd_wq) {
		r = -ENOMEM;
		goto bad_workqueue;
	}

	kc->sub_job_size = dm_get_kcopyd_subjob_size();
	reserve_pages = DIV_ROUND_UP(kc->sub_job_size << SECTOR_SHIFT, PAGE_SIZE);

	kc->pages = NULL;
	kc->nr_reserved_pages = kc->nr_free_pages = 0;
	r = client_reserve_pages(kc, reserve_pages);
	if (r)
		goto bad_client_pages;

	kc->io_client = dm_io_client_create();
	if (IS_ERR(kc->io_client)) {
		r = PTR_ERR(kc->io_client);
		goto bad_io_client;
	}

	init_waitqueue_head(&kc->destroyq);
	atomic_set(&kc->nr_jobs, 0);

	return kc;

bad_io_client:
	client_free_pages(kc);
bad_client_pages:
	destroy_workqueue(kc->kcopyd_wq);
bad_workqueue:
	mempool_exit(&kc->job_pool);
bad_slab:
	kfree(kc);

	return ERR_PTR(r);
}
EXPORT_SYMBOL(dm_kcopyd_client_create);

void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc)
{
	/* Wait for completion of all jobs submitted by this client. */
	wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));

	BUG_ON(!list_empty(&kc->callback_jobs));
	BUG_ON(!list_empty(&kc->complete_jobs));
	BUG_ON(!list_empty(&kc->io_jobs));
	BUG_ON(!list_empty(&kc->pages_jobs));
	destroy_workqueue(kc->kcopyd_wq);
	dm_io_client_destroy(kc->io_client);
	client_free_pages(kc);
	mempool_exit(&kc->job_pool);
	kfree(kc);
}
EXPORT_SYMBOL(dm_kcopyd_client_destroy);

void dm_kcopyd_client_flush(struct dm_kcopyd_client *kc)
{
	flush_workqueue(kc->kcopyd_wq);
}
EXPORT_SYMBOL(dm_kcopyd_client_flush);