drivers/md/dm-writecache.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2018 Red Hat. All rights reserved.
4  *
5  * This file is released under the GPL.
6  */
7
8 #include <linux/device-mapper.h>
9 #include <linux/module.h>
10 #include <linux/init.h>
11 #include <linux/vmalloc.h>
12 #include <linux/kthread.h>
13 #include <linux/dm-io.h>
14 #include <linux/dm-kcopyd.h>
15 #include <linux/dax.h>
16 #include <linux/pfn_t.h>
17 #include <linux/libnvdimm.h>
18
19 #define DM_MSG_PREFIX "writecache"
20
21 #define HIGH_WATERMARK                  50
22 #define LOW_WATERMARK                   45
23 #define MAX_WRITEBACK_JOBS              0
24 #define ENDIO_LATENCY                   16
25 #define WRITEBACK_LATENCY               64
26 #define AUTOCOMMIT_BLOCKS_SSD           65536
27 #define AUTOCOMMIT_BLOCKS_PMEM          64
28 #define AUTOCOMMIT_MSEC                 1000
29
30 #define BITMAP_GRANULARITY      65536
31 #if BITMAP_GRANULARITY < PAGE_SIZE
32 #undef BITMAP_GRANULARITY
33 #define BITMAP_GRANULARITY      PAGE_SIZE
34 #endif
35
36 #if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
37 #define DM_WRITECACHE_HAS_PMEM
38 #endif
39
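/*
 * pmem_assign() stores a value into the persistent-memory metadata.  With a
 * real pmem mapping the store is pushed out of the CPU cache with
 * memcpy_flushcache(); without pmem support it degenerates to a plain
 * assignment.
 */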
40 #ifdef DM_WRITECACHE_HAS_PMEM
41 #define pmem_assign(dest, src)                                  \
42 do {                                                            \
43         typeof(dest) uniq = (src);                              \
44         memcpy_flushcache(&(dest), &uniq, sizeof(dest));        \
45 } while (0)
46 #else
47 #define pmem_assign(dest, src)  ((dest) = (src))
48 #endif
49
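/*
 * If the architecture provides memcpy_mcsafe(), reads from persistent memory
 * can recover from hardware (media) errors.  In that case the per-entry
 * metadata is also shadowed in DRAM (see struct wc_entry below), so normal
 * lookups never have to touch possibly-bad persistent memory.
 */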
50 #if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM)
51 #define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
52 #endif
53
54 #define MEMORY_SUPERBLOCK_MAGIC         0x23489321
55 #define MEMORY_SUPERBLOCK_VERSION       1
56
57 struct wc_memory_entry {
58         __le64 original_sector;
59         __le64 seq_count;
60 };
61
62 struct wc_memory_superblock {
63         union {
64                 struct {
65                         __le32 magic;
66                         __le32 version;
67                         __le32 block_size;
68                         __le32 pad;
69                         __le64 n_blocks;
70                         __le64 seq_count;
71                 };
72                 __le64 padding[8];
73         };
74         struct wc_memory_entry entries[0];
75 };
76
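/*
 * In-core descriptor of one cache block.  The rb_node links the entry into
 * wc->tree (sorted by original sector) and 'lru' links it into either
 * wc->lru or the free list.  On 64-bit the flag and the index are packed
 * into bit-fields to keep the structure small.
 */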
77 struct wc_entry {
78         struct rb_node rb_node;
79         struct list_head lru;
80         unsigned short wc_list_contiguous;
81         bool write_in_progress
82 #if BITS_PER_LONG == 64
83                 :1
84 #endif
85         ;
86         unsigned long index
87 #if BITS_PER_LONG == 64
88                 :47
89 #endif
90         ;
91 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
92         uint64_t original_sector;
93         uint64_t seq_count;
94 #endif
95 };
96
97 #ifdef DM_WRITECACHE_HAS_PMEM
98 #define WC_MODE_PMEM(wc)                        ((wc)->pmem_mode)
99 #define WC_MODE_FUA(wc)                         ((wc)->writeback_fua)
100 #else
101 #define WC_MODE_PMEM(wc)                        false
102 #define WC_MODE_FUA(wc)                         false
103 #endif
104 #define WC_MODE_SORT_FREELIST(wc)               (!WC_MODE_PMEM(wc))
105
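/*
 * Per-target state.  'tree' indexes cached blocks by original sector and
 * 'lru' orders them from most recently written (head) to oldest (tail).
 * Free entries live on 'freelist' in pmem mode, or in the sorted 'freetree'
 * in SSD mode so that allocations stay roughly sequential on the cache
 * device.
 */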
106 struct dm_writecache {
107         struct mutex lock;
108         struct list_head lru;
109         union {
110                 struct list_head freelist;
111                 struct {
112                         struct rb_root freetree;
113                         struct wc_entry *current_free;
114                 };
115         };
116         struct rb_root tree;
117
118         size_t freelist_size;
119         size_t writeback_size;
120         size_t freelist_high_watermark;
121         size_t freelist_low_watermark;
122
123         unsigned uncommitted_blocks;
124         unsigned autocommit_blocks;
125         unsigned max_writeback_jobs;
126
127         int error;
128
129         unsigned long autocommit_jiffies;
130         struct timer_list autocommit_timer;
131         struct wait_queue_head freelist_wait;
132
133         atomic_t bio_in_progress[2];
134         struct wait_queue_head bio_in_progress_wait[2];
135
136         struct dm_target *ti;
137         struct dm_dev *dev;
138         struct dm_dev *ssd_dev;
139         sector_t start_sector;
140         void *memory_map;
141         uint64_t memory_map_size;
142         size_t metadata_sectors;
143         size_t n_blocks;
144         uint64_t seq_count;
145         void *block_start;
146         struct wc_entry *entries;
147         unsigned block_size;
148         unsigned char block_size_bits;
149
150         bool pmem_mode:1;
151         bool writeback_fua:1;
152
153         bool overwrote_committed:1;
154         bool memory_vmapped:1;
155
156         bool high_wm_percent_set:1;
157         bool low_wm_percent_set:1;
158         bool max_writeback_jobs_set:1;
159         bool autocommit_blocks_set:1;
160         bool autocommit_time_set:1;
161         bool writeback_fua_set:1;
162         bool flush_on_suspend:1;
163
164         unsigned writeback_all;
165         struct workqueue_struct *writeback_wq;
166         struct work_struct writeback_work;
167         struct work_struct flush_work;
168
169         struct dm_io_client *dm_io;
170
171         raw_spinlock_t endio_list_lock;
172         struct list_head endio_list;
173         struct task_struct *endio_thread;
174
175         struct task_struct *flush_thread;
176         struct bio_list flush_list;
177
178         struct dm_kcopyd_client *dm_kcopyd;
179         unsigned long *dirty_bitmap;
180         unsigned dirty_bitmap_size;
181
182         struct bio_set bio_set;
183         mempool_t copy_pool;
184 };
185
186 #define WB_LIST_INLINE          16
187
188 struct writeback_struct {
189         struct list_head endio_entry;
190         struct dm_writecache *wc;
191         struct wc_entry **wc_list;
192         unsigned wc_list_n;
193         unsigned page_offset;
194         struct page *page;
195         struct wc_entry *wc_list_inline[WB_LIST_INLINE];
196         struct bio bio;
197 };
198
199 struct copy_struct {
200         struct list_head endio_entry;
201         struct dm_writecache *wc;
202         struct wc_entry *e;
203         unsigned n_entries;
204         int error;
205 };
206
207 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
208                                             "A percentage of time allocated for data copying");
209
210 static void wc_lock(struct dm_writecache *wc)
211 {
212         mutex_lock(&wc->lock);
213 }
214
215 static void wc_unlock(struct dm_writecache *wc)
216 {
217         mutex_unlock(&wc->lock);
218 }
219
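/*
 * Map the whole cache device through the DAX interface.  If
 * dax_direct_access() cannot hand back one linear range, the individual
 * pages are collected and stitched together with vmap().
 */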
220 #ifdef DM_WRITECACHE_HAS_PMEM
221 static int persistent_memory_claim(struct dm_writecache *wc)
222 {
223         int r;
224         loff_t s;
225         long p, da;
226         pfn_t pfn;
227         int id;
228         struct page **pages;
229
230         wc->memory_vmapped = false;
231
232         if (!wc->ssd_dev->dax_dev) {
233                 r = -EOPNOTSUPP;
234                 goto err1;
235         }
236         s = wc->memory_map_size;
237         p = s >> PAGE_SHIFT;
238         if (!p) {
239                 r = -EINVAL;
240                 goto err1;
241         }
242         if (p != s >> PAGE_SHIFT) {
243                 r = -EOVERFLOW;
244                 goto err1;
245         }
246
247         id = dax_read_lock();
248
249         da = dax_direct_access(wc->ssd_dev->dax_dev, 0, p, &wc->memory_map, &pfn);
250         if (da < 0) {
251                 wc->memory_map = NULL;
252                 r = da;
253                 goto err2;
254         }
255         if (!pfn_t_has_page(pfn)) {
256                 wc->memory_map = NULL;
257                 r = -EOPNOTSUPP;
258                 goto err2;
259         }
260         if (da != p) {
261                 long i;
262                 wc->memory_map = NULL;
263                 pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
264                 if (!pages) {
265                         r = -ENOMEM;
266                         goto err2;
267                 }
268                 i = 0;
269                 do {
270                         long daa;
271                         daa = dax_direct_access(wc->ssd_dev->dax_dev, i, p - i,
272                                                 NULL, &pfn);
273                         if (daa <= 0) {
274                                 r = daa ? daa : -EINVAL;
275                                 goto err3;
276                         }
277                         if (!pfn_t_has_page(pfn)) {
278                                 r = -EOPNOTSUPP;
279                                 goto err3;
280                         }
281                         while (daa-- && i < p) {
282                                 pages[i++] = pfn_t_to_page(pfn);
283                                 pfn.val++;
284                         }
285                 } while (i < p);
286                 wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
287                 if (!wc->memory_map) {
288                         r = -ENOMEM;
289                         goto err3;
290                 }
291                 kvfree(pages);
292                 wc->memory_vmapped = true;
293         }
294
295         dax_read_unlock(id);
296
297         wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
298         wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;
299
300         return 0;
301 err3:
302         kvfree(pages);
303 err2:
304         dax_read_unlock(id);
305 err1:
306         return r;
307 }
308 #else
309 static int persistent_memory_claim(struct dm_writecache *wc)
310 {
311         BUG();
312 }
313 #endif
314
315 static void persistent_memory_release(struct dm_writecache *wc)
316 {
317         if (wc->memory_vmapped)
318                 vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
319 }
320
321 static struct page *persistent_memory_page(void *addr)
322 {
323         if (is_vmalloc_addr(addr))
324                 return vmalloc_to_page(addr);
325         else
326                 return virt_to_page(addr);
327 }
328
329 static unsigned persistent_memory_page_offset(void *addr)
330 {
331         return (unsigned long)addr & (PAGE_SIZE - 1);
332 }
333
334 static void persistent_memory_flush_cache(void *ptr, size_t size)
335 {
336         if (is_vmalloc_addr(ptr))
337                 flush_kernel_vmap_range(ptr, size);
338 }
339
340 static void persistent_memory_invalidate_cache(void *ptr, size_t size)
341 {
342         if (is_vmalloc_addr(ptr))
343                 invalidate_kernel_vmap_range(ptr, size);
344 }
345
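/*
 * Layout helpers: the superblock sits at the start of the mapped metadata,
 * followed by one wc_memory_entry per cache block.  memory_data() and
 * cache_sector() translate an entry's index to its data location in
 * persistent memory or on the cache SSD respectively.
 */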
346 static struct wc_memory_superblock *sb(struct dm_writecache *wc)
347 {
348         return wc->memory_map;
349 }
350
351 static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
352 {
353         if (is_power_of_2(sizeof(struct wc_entry)) && 0)
354                 return &sb(wc)->entries[e - wc->entries];
355         else
356                 return &sb(wc)->entries[e->index];
357 }
358
359 static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
360 {
361         return (char *)wc->block_start + (e->index << wc->block_size_bits);
362 }
363
364 static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
365 {
366         return wc->start_sector + wc->metadata_sectors +
367                 ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
368 }
369
370 static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
371 {
372 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
373         return e->original_sector;
374 #else
375         return le64_to_cpu(memory_entry(wc, e)->original_sector);
376 #endif
377 }
378
379 static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
380 {
381 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
382         return e->seq_count;
383 #else
384         return le64_to_cpu(memory_entry(wc, e)->seq_count);
385 #endif
386 }
387
388 static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
389 {
390 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
391         e->seq_count = -1;
392 #endif
393         pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
394 }
395
396 static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
397                                             uint64_t original_sector, uint64_t seq_count)
398 {
399         struct wc_memory_entry me;
400 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
401         e->original_sector = original_sector;
402         e->seq_count = seq_count;
403 #endif
404         me.original_sector = cpu_to_le64(original_sector);
405         me.seq_count = cpu_to_le64(seq_count);
406         pmem_assign(*memory_entry(wc, e), me);
407 }
408
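/*
 * Record the first error that occurs (cmpxchg keeps only the first) and wake
 * anyone waiting on the free list; later I/O checks writecache_has_error()
 * and is failed instead of touching the cache.
 */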
409 #define writecache_error(wc, err, msg, arg...)                          \
410 do {                                                                    \
411         if (!cmpxchg(&(wc)->error, 0, err))                             \
412                 DMERR(msg, ##arg);                                      \
413         wake_up(&(wc)->freelist_wait);                                  \
414 } while (0)
415
416 #define writecache_has_error(wc)        (unlikely(READ_ONCE((wc)->error)))
417
418 static void writecache_flush_all_metadata(struct dm_writecache *wc)
419 {
420         if (!WC_MODE_PMEM(wc))
421                 memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
422 }
423
424 static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
425 {
426         if (!WC_MODE_PMEM(wc))
427                 __set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
428                           wc->dirty_bitmap);
429 }
430
431 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
432
433 struct io_notify {
434         struct dm_writecache *wc;
435         struct completion c;
436         atomic_t count;
437 };
438
439 static void writecache_notify_io(unsigned long error, void *context)
440 {
441         struct io_notify *endio = context;
442
443         if (unlikely(error != 0))
444                 writecache_error(endio->wc, -EIO, "error writing metadata");
445         BUG_ON(atomic_read(&endio->count) <= 0);
446         if (atomic_dec_and_test(&endio->count))
447                 complete(&endio->c);
448 }
449
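/*
 * SSD mode commit: write out every metadata region whose bit is set in the
 * dirty bitmap (BITMAP_GRANULARITY-sized chunks), wait for those writes to
 * complete and then issue a disk flush, before finally clearing the bitmap.
 */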
450 static void ssd_commit_flushed(struct dm_writecache *wc)
451 {
452         struct dm_io_region region;
453         struct dm_io_request req;
454         struct io_notify endio = {
455                 wc,
456                 COMPLETION_INITIALIZER_ONSTACK(endio.c),
457                 ATOMIC_INIT(1),
458         };
459         unsigned bitmap_bits = wc->dirty_bitmap_size * 8;
460         unsigned i = 0;
461
462         while (1) {
463                 unsigned j;
464                 i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
465                 if (unlikely(i == bitmap_bits))
466                         break;
467                 j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);
468
469                 region.bdev = wc->ssd_dev->bdev;
470                 region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
471                 region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
472
473                 if (unlikely(region.sector >= wc->metadata_sectors))
474                         break;
475                 if (unlikely(region.sector + region.count > wc->metadata_sectors))
476                         region.count = wc->metadata_sectors - region.sector;
477
478                 region.sector += wc->start_sector;
479                 atomic_inc(&endio.count);
480                 req.bi_op = REQ_OP_WRITE;
481                 req.bi_op_flags = REQ_SYNC;
482                 req.mem.type = DM_IO_VMA;
483                 req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
484                 req.client = wc->dm_io;
485                 req.notify.fn = writecache_notify_io;
486                 req.notify.context = &endio;
487
488                 /* writing via async dm-io (implied by notify.fn above) won't return an error */
489                 (void) dm_io(&req, 1, &region, NULL);
490                 i = j;
491         }
492
493         writecache_notify_io(0, &endio);
494         wait_for_completion_io(&endio.c);
495
496         writecache_disk_flush(wc, wc->ssd_dev);
497
498         memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
499 }
500
501 static void writecache_commit_flushed(struct dm_writecache *wc)
502 {
503         if (WC_MODE_PMEM(wc))
504                 wmb();
505         else
506                 ssd_commit_flushed(wc);
507 }
508
509 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
510 {
511         int r;
512         struct dm_io_region region;
513         struct dm_io_request req;
514
515         region.bdev = dev->bdev;
516         region.sector = 0;
517         region.count = 0;
518         req.bi_op = REQ_OP_WRITE;
519         req.bi_op_flags = REQ_PREFLUSH;
520         req.mem.type = DM_IO_KMEM;
521         req.mem.ptr.addr = NULL;
522         req.client = wc->dm_io;
523         req.notify.fn = NULL;
524
525         r = dm_io(&req, 1, &region, NULL);
526         if (unlikely(r))
527                 writecache_error(wc, r, "error flushing metadata: %d", r);
528 }
529
530 static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
531 {
532         wait_event(wc->bio_in_progress_wait[direction],
533                    !atomic_read(&wc->bio_in_progress[direction]));
534 }
535
536 #define WFE_RETURN_FOLLOWING    1
537 #define WFE_LOWEST_SEQ          2
538
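/*
 * Find the entry for 'block' in the rb-tree.  With WFE_RETURN_FOLLOWING the
 * next higher entry is returned when there is no exact match.  When several
 * copies of the same block exist, WFE_LOWEST_SEQ selects the one with the
 * lowest sequence count (the oldest copy); otherwise the newest is returned.
 */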
539 static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
540                                               uint64_t block, int flags)
541 {
542         struct wc_entry *e;
543         struct rb_node *node = wc->tree.rb_node;
544
545         if (unlikely(!node))
546                 return NULL;
547
548         while (1) {
549                 e = container_of(node, struct wc_entry, rb_node);
550                 if (read_original_sector(wc, e) == block)
551                         break;
552                 node = (read_original_sector(wc, e) >= block ?
553                         e->rb_node.rb_left : e->rb_node.rb_right);
554                 if (unlikely(!node)) {
555                         if (!(flags & WFE_RETURN_FOLLOWING)) {
556                                 return NULL;
557                         }
558                         if (read_original_sector(wc, e) >= block) {
559                                 break;
560                         } else {
561                                 node = rb_next(&e->rb_node);
562                                 if (unlikely(!node)) {
563                                         return NULL;
564                                 }
565                                 e = container_of(node, struct wc_entry, rb_node);
566                                 break;
567                         }
568                 }
569         }
570
571         while (1) {
572                 struct wc_entry *e2;
573                 if (flags & WFE_LOWEST_SEQ)
574                         node = rb_prev(&e->rb_node);
575                 else
576                         node = rb_next(&e->rb_node);
577                 if (!node)
578                         return e;
579                 e2 = container_of(node, struct wc_entry, rb_node);
580                 if (read_original_sector(wc, e2) != block)
581                         return e;
582                 e = e2;
583         }
584 }
585
586 static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
587 {
588         struct wc_entry *e;
589         struct rb_node **node = &wc->tree.rb_node, *parent = NULL;
590
591         while (*node) {
592                 e = container_of(*node, struct wc_entry, rb_node);
593                 parent = &e->rb_node;
594                 if (read_original_sector(wc, e) > read_original_sector(wc, ins))
595                         node = &parent->rb_left;
596                 else
597                         node = &parent->rb_right;
598         }
599         rb_link_node(&ins->rb_node, parent, node);
600         rb_insert_color(&ins->rb_node, &wc->tree);
601         list_add(&ins->lru, &wc->lru);
602 }
603
604 static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
605 {
606         list_del(&e->lru);
607         rb_erase(&e->rb_node, &wc->tree);
608 }
609
610 static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
611 {
612         if (WC_MODE_SORT_FREELIST(wc)) {
613                 struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;
614                 if (unlikely(!*node))
615                         wc->current_free = e;
616                 while (*node) {
617                         parent = *node;
618                         if (&e->rb_node < *node)
619                                 node = &parent->rb_left;
620                         else
621                                 node = &parent->rb_right;
622                 }
623                 rb_link_node(&e->rb_node, parent, node);
624                 rb_insert_color(&e->rb_node, &wc->freetree);
625         } else {
626                 list_add_tail(&e->lru, &wc->freelist);
627         }
628         wc->freelist_size++;
629 }
630
631 static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
632 {
633         struct wc_entry *e;
634
635         if (WC_MODE_SORT_FREELIST(wc)) {
636                 struct rb_node *next;
637                 if (unlikely(!wc->current_free))
638                         return NULL;
639                 e = wc->current_free;
640                 next = rb_next(&e->rb_node);
641                 rb_erase(&e->rb_node, &wc->freetree);
642                 if (unlikely(!next))
643                         next = rb_first(&wc->freetree);
644                 wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
645         } else {
646                 if (unlikely(list_empty(&wc->freelist)))
647                         return NULL;
648                 e = container_of(wc->freelist.next, struct wc_entry, lru);
649                 list_del(&e->lru);
650         }
651         wc->freelist_size--;
652         if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
653                 queue_work(wc->writeback_wq, &wc->writeback_work);
654
655         return e;
656 }
657
658 static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
659 {
660         writecache_unlink(wc, e);
661         writecache_add_to_freelist(wc, e);
662         clear_seq_count(wc, e);
663         writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
664         if (unlikely(waitqueue_active(&wc->freelist_wait)))
665                 wake_up(&wc->freelist_wait);
666 }
667
668 static void writecache_wait_on_freelist(struct dm_writecache *wc)
669 {
670         DEFINE_WAIT(wait);
671
672         prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
673         wc_unlock(wc);
674         io_schedule();
675         finish_wait(&wc->freelist_wait, &wait);
676         wc_lock(wc);
677 }
678
679 static void writecache_poison_lists(struct dm_writecache *wc)
680 {
681         /*
682          * Catch incorrect access to these values while the device is suspended.
683          */
684         memset(&wc->tree, -1, sizeof wc->tree);
685         wc->lru.next = LIST_POISON1;
686         wc->lru.prev = LIST_POISON2;
687         wc->freelist.next = LIST_POISON1;
688         wc->freelist.prev = LIST_POISON2;
689 }
690
691 static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
692 {
693         writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
694         if (WC_MODE_PMEM(wc))
695                 writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
696 }
697
698 static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
699 {
700         return read_seq_count(wc, e) < wc->seq_count;
701 }
702
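/*
 * Commit all uncommitted blocks: flush their metadata, bump the superblock
 * sequence count, and then free superseded (older) copies of the same
 * blocks.
 */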
703 static void writecache_flush(struct dm_writecache *wc)
704 {
705         struct wc_entry *e, *e2;
706         bool need_flush_after_free;
707
708         wc->uncommitted_blocks = 0;
709         del_timer(&wc->autocommit_timer);
710
711         if (list_empty(&wc->lru))
712                 return;
713
714         e = container_of(wc->lru.next, struct wc_entry, lru);
715         if (writecache_entry_is_committed(wc, e)) {
716                 if (wc->overwrote_committed) {
717                         writecache_wait_for_ios(wc, WRITE);
718                         writecache_disk_flush(wc, wc->ssd_dev);
719                         wc->overwrote_committed = false;
720                 }
721                 return;
722         }
723         while (1) {
724                 writecache_flush_entry(wc, e);
725                 if (unlikely(e->lru.next == &wc->lru))
726                         break;
727                 e2 = container_of(e->lru.next, struct wc_entry, lru);
728                 if (writecache_entry_is_committed(wc, e2))
729                         break;
730                 e = e2;
731                 cond_resched();
732         }
733         writecache_commit_flushed(wc);
734
735         writecache_wait_for_ios(wc, WRITE);
736
737         wc->seq_count++;
738         pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
739         writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
740         writecache_commit_flushed(wc);
741
742         wc->overwrote_committed = false;
743
744         need_flush_after_free = false;
745         while (1) {
746                 /* Free another committed entry with lower seq-count */
747                 struct rb_node *rb_node = rb_prev(&e->rb_node);
748
749                 if (rb_node) {
750                         e2 = container_of(rb_node, struct wc_entry, rb_node);
751                         if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
752                             likely(!e2->write_in_progress)) {
753                                 writecache_free_entry(wc, e2);
754                                 need_flush_after_free = true;
755                         }
756                 }
757                 if (unlikely(e->lru.prev == &wc->lru))
758                         break;
759                 e = container_of(e->lru.prev, struct wc_entry, lru);
760                 cond_resched();
761         }
762
763         if (need_flush_after_free)
764                 writecache_commit_flushed(wc);
765 }
766
767 static void writecache_flush_work(struct work_struct *work)
768 {
769         struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);
770
771         wc_lock(wc);
772         writecache_flush(wc);
773         wc_unlock(wc);
774 }
775
776 static void writecache_autocommit_timer(struct timer_list *t)
777 {
778         struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
779         if (!writecache_has_error(wc))
780                 queue_work(wc->writeback_wq, &wc->flush_work);
781 }
782
783 static void writecache_schedule_autocommit(struct dm_writecache *wc)
784 {
785         if (!timer_pending(&wc->autocommit_timer))
786                 mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
787 }
788
789 static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
790 {
791         struct wc_entry *e;
792         bool discarded_something = false;
793
794         e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
795         if (unlikely(!e))
796                 return;
797
798         while (read_original_sector(wc, e) < end) {
799                 struct rb_node *node = rb_next(&e->rb_node);
800
801                 if (likely(!e->write_in_progress)) {
802                         if (!discarded_something) {
803                                 writecache_wait_for_ios(wc, READ);
804                                 writecache_wait_for_ios(wc, WRITE);
805                                 discarded_something = true;
806                         }
807                         writecache_free_entry(wc, e);
808                 }
809
810                 if (!node)
811                         break;
812
813                 e = container_of(node, struct wc_entry, rb_node);
814         }
815
816         if (discarded_something)
817                 writecache_commit_flushed(wc);
818 }
819
820 static bool writecache_wait_for_writeback(struct dm_writecache *wc)
821 {
822         if (wc->writeback_size) {
823                 writecache_wait_on_freelist(wc);
824                 return true;
825         }
826         return false;
827 }
828
829 static void writecache_suspend(struct dm_target *ti)
830 {
831         struct dm_writecache *wc = ti->private;
832         bool flush_on_suspend;
833
834         del_timer_sync(&wc->autocommit_timer);
835
836         wc_lock(wc);
837         writecache_flush(wc);
838         flush_on_suspend = wc->flush_on_suspend;
839         if (flush_on_suspend) {
840                 wc->flush_on_suspend = false;
841                 wc->writeback_all++;
842                 queue_work(wc->writeback_wq, &wc->writeback_work);
843         }
844         wc_unlock(wc);
845
846         flush_workqueue(wc->writeback_wq);
847
848         wc_lock(wc);
849         if (flush_on_suspend)
850                 wc->writeback_all--;
851         while (writecache_wait_for_writeback(wc));
852
853         if (WC_MODE_PMEM(wc))
854                 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
855
856         writecache_poison_lists(wc);
857
858         wc_unlock(wc);
859 }
860
861 static int writecache_alloc_entries(struct dm_writecache *wc)
862 {
863         size_t b;
864
865         if (wc->entries)
866                 return 0;
867         wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
868         if (!wc->entries)
869                 return -ENOMEM;
870         for (b = 0; b < wc->n_blocks; b++) {
871                 struct wc_entry *e = &wc->entries[b];
872                 e->index = b;
873                 e->write_in_progress = false;
874         }
875
876         return 0;
877 }
878
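/*
 * Rebuild the in-core rb-tree and free list from the on-media metadata when
 * the target is resumed.  Entries whose sequence count is not committed are
 * returned to the free list; when two committed entries claim the same
 * sector, only the one with the higher sequence count is kept.
 */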
879 static void writecache_resume(struct dm_target *ti)
880 {
881         struct dm_writecache *wc = ti->private;
882         size_t b;
883         bool need_flush = false;
884         __le64 sb_seq_count;
885         int r;
886
887         wc_lock(wc);
888
889         if (WC_MODE_PMEM(wc))
890                 persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
891
892         wc->tree = RB_ROOT;
893         INIT_LIST_HEAD(&wc->lru);
894         if (WC_MODE_SORT_FREELIST(wc)) {
895                 wc->freetree = RB_ROOT;
896                 wc->current_free = NULL;
897         } else {
898                 INIT_LIST_HEAD(&wc->freelist);
899         }
900         wc->freelist_size = 0;
901
902         r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
903         if (r) {
904                 writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
905                 sb_seq_count = cpu_to_le64(0);
906         }
907         wc->seq_count = le64_to_cpu(sb_seq_count);
908
909 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
910         for (b = 0; b < wc->n_blocks; b++) {
911                 struct wc_entry *e = &wc->entries[b];
912                 struct wc_memory_entry wme;
913                 if (writecache_has_error(wc)) {
914                         e->original_sector = -1;
915                         e->seq_count = -1;
916                         continue;
917                 }
918                 r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry));
919                 if (r) {
920                         writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
921                                          (unsigned long)b, r);
922                         e->original_sector = -1;
923                         e->seq_count = -1;
924                 } else {
925                         e->original_sector = le64_to_cpu(wme.original_sector);
926                         e->seq_count = le64_to_cpu(wme.seq_count);
927                 }
928         }
929 #endif
930         for (b = 0; b < wc->n_blocks; b++) {
931                 struct wc_entry *e = &wc->entries[b];
932                 if (!writecache_entry_is_committed(wc, e)) {
933                         if (read_seq_count(wc, e) != -1) {
934 erase_this:
935                                 clear_seq_count(wc, e);
936                                 need_flush = true;
937                         }
938                         writecache_add_to_freelist(wc, e);
939                 } else {
940                         struct wc_entry *old;
941
942                         old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
943                         if (!old) {
944                                 writecache_insert_entry(wc, e);
945                         } else {
946                                 if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
947                                         writecache_error(wc, -EINVAL,
948                                                  "two identical entries, position %llu, sector %llu, sequence %llu",
949                                                  (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
950                                                  (unsigned long long)read_seq_count(wc, e));
951                                 }
952                                 if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
953                                         goto erase_this;
954                                 } else {
955                                         writecache_free_entry(wc, old);
956                                         writecache_insert_entry(wc, e);
957                                         need_flush = true;
958                                 }
959                         }
960                 }
961                 cond_resched();
962         }
963
964         if (need_flush) {
965                 writecache_flush_all_metadata(wc);
966                 writecache_commit_flushed(wc);
967         }
968
969         wc_unlock(wc);
970 }
971
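/*
 * Message handlers.  A sketch of the expected userspace usage, assuming the
 * standard dmsetup tool and a target named "wc":
 *
 *	dmsetup message wc 0 flush
 *	dmsetup message wc 0 flush_on_suspend
 *
 * "flush" writes all cached data back to the origin device before
 * returning; "flush_on_suspend" arranges for the same to happen on the next
 * suspend.
 */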
972 static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
973 {
974         if (argc != 1)
975                 return -EINVAL;
976
977         wc_lock(wc);
978         if (dm_suspended(wc->ti)) {
979                 wc_unlock(wc);
980                 return -EBUSY;
981         }
982         if (writecache_has_error(wc)) {
983                 wc_unlock(wc);
984                 return -EIO;
985         }
986
987         writecache_flush(wc);
988         wc->writeback_all++;
989         queue_work(wc->writeback_wq, &wc->writeback_work);
990         wc_unlock(wc);
991
992         flush_workqueue(wc->writeback_wq);
993
994         wc_lock(wc);
995         wc->writeback_all--;
996         if (writecache_has_error(wc)) {
997                 wc_unlock(wc);
998                 return -EIO;
999         }
1000         wc_unlock(wc);
1001
1002         return 0;
1003 }
1004
1005 static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1006 {
1007         if (argc != 1)
1008                 return -EINVAL;
1009
1010         wc_lock(wc);
1011         wc->flush_on_suspend = true;
1012         wc_unlock(wc);
1013
1014         return 0;
1015 }
1016
1017 static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
1018                               char *result, unsigned maxlen)
1019 {
1020         int r = -EINVAL;
1021         struct dm_writecache *wc = ti->private;
1022
1023         if (!strcasecmp(argv[0], "flush"))
1024                 r = process_flush_mesg(argc, argv, wc);
1025         else if (!strcasecmp(argv[0], "flush_on_suspend"))
1026                 r = process_flush_on_suspend_mesg(argc, argv, wc);
1027         else
1028                 DMERR("unrecognised message received: %s", argv[0]);
1029
1030         return r;
1031 }
1032
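/*
 * Copy one cache block between a bio and persistent memory: writes use
 * memcpy_flushcache(), reads use memcpy_mcsafe() so that hardware memory
 * errors are reported as I/O errors rather than crashing.
 */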
1033 static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
1034 {
1035         void *buf;
1036         unsigned long flags;
1037         unsigned size;
1038         int rw = bio_data_dir(bio);
1039         unsigned remaining_size = wc->block_size;
1040
1041         do {
1042                 struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
1043                 buf = bvec_kmap_irq(&bv, &flags);
1044                 size = bv.bv_len;
1045                 if (unlikely(size > remaining_size))
1046                         size = remaining_size;
1047
1048                 if (rw == READ) {
1049                         int r;
1050                         r = memcpy_mcsafe(buf, data, size);
1051                         flush_dcache_page(bio_page(bio));
1052                         if (unlikely(r)) {
1053                                 writecache_error(wc, r, "hardware memory error when reading data: %d", r);
1054                                 bio->bi_status = BLK_STS_IOERR;
1055                         }
1056                 } else {
1057                         flush_dcache_page(bio_page(bio));
1058                         memcpy_flushcache(data, buf, size);
1059                 }
1060
1061                 bvec_kunmap_irq(buf, &flags);
1062
1063                 data = (char *)data + size;
1064                 remaining_size -= size;
1065                 bio_advance(bio, size);
1066         } while (unlikely(remaining_size));
1067 }
1068
1069 static int writecache_flush_thread(void *data)
1070 {
1071         struct dm_writecache *wc = data;
1072
1073         while (1) {
1074                 struct bio *bio;
1075
1076                 wc_lock(wc);
1077                 bio = bio_list_pop(&wc->flush_list);
1078                 if (!bio) {
1079                         set_current_state(TASK_INTERRUPTIBLE);
1080                         wc_unlock(wc);
1081
1082                         if (unlikely(kthread_should_stop())) {
1083                                 set_current_state(TASK_RUNNING);
1084                                 break;
1085                         }
1086
1087                         schedule();
1088                         continue;
1089                 }
1090
1091                 if (bio_op(bio) == REQ_OP_DISCARD) {
1092                         writecache_discard(wc, bio->bi_iter.bi_sector,
1093                                            bio_end_sector(bio));
1094                         wc_unlock(wc);
1095                         bio_set_dev(bio, wc->dev->bdev);
1096                         generic_make_request(bio);
1097                 } else {
1098                         writecache_flush(wc);
1099                         wc_unlock(wc);
1100                         if (writecache_has_error(wc))
1101                                 bio->bi_status = BLK_STS_IOERR;
1102                         bio_endio(bio);
1103                 }
1104         }
1105
1106         return 0;
1107 }
1108
1109 static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
1110 {
1111         if (bio_list_empty(&wc->flush_list))
1112                 wake_up_process(wc->flush_thread);
1113         bio_list_add(&wc->flush_list, bio);
1114 }
1115
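/*
 * Bio mapping: flushes and discards are handled inline in pmem mode or
 * offloaded to the flush thread in SSD mode; reads are served from the
 * cache when the block is present, otherwise remapped to the origin; writes
 * always land in the cache, allocating a free entry when needed.
 */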
1116 static int writecache_map(struct dm_target *ti, struct bio *bio)
1117 {
1118         struct wc_entry *e;
1119         struct dm_writecache *wc = ti->private;
1120
1121         bio->bi_private = NULL;
1122
1123         wc_lock(wc);
1124
1125         if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1126                 if (writecache_has_error(wc))
1127                         goto unlock_error;
1128                 if (WC_MODE_PMEM(wc)) {
1129                         writecache_flush(wc);
1130                         if (writecache_has_error(wc))
1131                                 goto unlock_error;
1132                         goto unlock_submit;
1133                 } else {
1134                         writecache_offload_bio(wc, bio);
1135                         goto unlock_return;
1136                 }
1137         }
1138
1139         bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1140
1141         if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
1142                                 (wc->block_size / 512 - 1)) != 0)) {
1143                 DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
1144                       (unsigned long long)bio->bi_iter.bi_sector,
1145                       bio->bi_iter.bi_size, wc->block_size);
1146                 goto unlock_error;
1147         }
1148
1149         if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
1150                 if (writecache_has_error(wc))
1151                         goto unlock_error;
1152                 if (WC_MODE_PMEM(wc)) {
1153                         writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
1154                         goto unlock_remap_origin;
1155                 } else {
1156                         writecache_offload_bio(wc, bio);
1157                         goto unlock_return;
1158                 }
1159         }
1160
1161         if (bio_data_dir(bio) == READ) {
1162 read_next_block:
1163                 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
1164                 if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
1165                         if (WC_MODE_PMEM(wc)) {
1166                                 bio_copy_block(wc, bio, memory_data(wc, e));
1167                                 if (bio->bi_iter.bi_size)
1168                                         goto read_next_block;
1169                                 goto unlock_submit;
1170                         } else {
1171                                 dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
1172                                 bio_set_dev(bio, wc->ssd_dev->bdev);
1173                                 bio->bi_iter.bi_sector = cache_sector(wc, e);
1174                                 if (!writecache_entry_is_committed(wc, e))
1175                                         writecache_wait_for_ios(wc, WRITE);
1176                                 goto unlock_remap;
1177                         }
1178                 } else {
1179                         if (e) {
1180                                 sector_t next_boundary =
1181                                         read_original_sector(wc, e) - bio->bi_iter.bi_sector;
1182                                 if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
1183                                         dm_accept_partial_bio(bio, next_boundary);
1184                                 }
1185                         }
1186                         goto unlock_remap_origin;
1187                 }
1188         } else {
1189                 do {
1190                         if (writecache_has_error(wc))
1191                                 goto unlock_error;
1192                         e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
1193                         if (e) {
1194                                 if (!writecache_entry_is_committed(wc, e))
1195                                         goto bio_copy;
1196                                 if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
1197                                         wc->overwrote_committed = true;
1198                                         goto bio_copy;
1199                                 }
1200                         }
1201                         e = writecache_pop_from_freelist(wc);
1202                         if (unlikely(!e)) {
1203                                 writecache_wait_on_freelist(wc);
1204                                 continue;
1205                         }
1206                         write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
1207                         writecache_insert_entry(wc, e);
1208                         wc->uncommitted_blocks++;
1209 bio_copy:
1210                         if (WC_MODE_PMEM(wc)) {
1211                                 bio_copy_block(wc, bio, memory_data(wc, e));
1212                         } else {
1213                                 dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
1214                                 bio_set_dev(bio, wc->ssd_dev->bdev);
1215                                 bio->bi_iter.bi_sector = cache_sector(wc, e);
1216                                 if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
1217                                         wc->uncommitted_blocks = 0;
1218                                         queue_work(wc->writeback_wq, &wc->flush_work);
1219                                 } else {
1220                                         writecache_schedule_autocommit(wc);
1221                                 }
1222                                 goto unlock_remap;
1223                         }
1224                 } while (bio->bi_iter.bi_size);
1225
1226                 if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks))
1227                         writecache_flush(wc);
1228                 else
1229                         writecache_schedule_autocommit(wc);
1230                 goto unlock_submit;
1231         }
1232
1233 unlock_remap_origin:
1234         bio_set_dev(bio, wc->dev->bdev);
1235         wc_unlock(wc);
1236         return DM_MAPIO_REMAPPED;
1237
1238 unlock_remap:
1239         /* make sure that writecache_end_io decrements bio_in_progress: */
1240         bio->bi_private = (void *)1;
1241         atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
1242         wc_unlock(wc);
1243         return DM_MAPIO_REMAPPED;
1244
1245 unlock_submit:
1246         wc_unlock(wc);
1247         bio_endio(bio);
1248         return DM_MAPIO_SUBMITTED;
1249
1250 unlock_return:
1251         wc_unlock(wc);
1252         return DM_MAPIO_SUBMITTED;
1253
1254 unlock_error:
1255         wc_unlock(wc);
1256         bio_io_error(bio);
1257         return DM_MAPIO_SUBMITTED;
1258 }
1259
1260 static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
1261 {
1262         struct dm_writecache *wc = ti->private;
1263
1264         if (bio->bi_private != NULL) {
1265                 int dir = bio_data_dir(bio);
1266                 if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
1267                         if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
1268                                 wake_up(&wc->bio_in_progress_wait[dir]);
1269         }
1270         return 0;
1271 }
1272
1273 static int writecache_iterate_devices(struct dm_target *ti,
1274                                       iterate_devices_callout_fn fn, void *data)
1275 {
1276         struct dm_writecache *wc = ti->private;
1277
1278         return fn(ti, wc->dev, 0, ti->len, data);
1279 }
1280
1281 static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
1282 {
1283         struct dm_writecache *wc = ti->private;
1284
1285         if (limits->logical_block_size < wc->block_size)
1286                 limits->logical_block_size = wc->block_size;
1287
1288         if (limits->physical_block_size < wc->block_size)
1289                 limits->physical_block_size = wc->block_size;
1290
1291         if (limits->io_min < wc->block_size)
1292                 limits->io_min = wc->block_size;
1293 }
1294
1295
1296 static void writecache_writeback_endio(struct bio *bio)
1297 {
1298         struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
1299         struct dm_writecache *wc = wb->wc;
1300         unsigned long flags;
1301
1302         raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
1303         if (unlikely(list_empty(&wc->endio_list)))
1304                 wake_up_process(wc->endio_thread);
1305         list_add_tail(&wb->endio_entry, &wc->endio_list);
1306         raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
1307 }
1308
1309 static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
1310 {
1311         struct copy_struct *c = ptr;
1312         struct dm_writecache *wc = c->wc;
1313
1314         c->error = likely(!(read_err | write_err)) ? 0 : -EIO;
1315
1316         raw_spin_lock_irq(&wc->endio_list_lock);
1317         if (unlikely(list_empty(&wc->endio_list)))
1318                 wake_up_process(wc->endio_thread);
1319         list_add_tail(&c->endio_entry, &wc->endio_list);
1320         raw_spin_unlock_irq(&wc->endio_list_lock);
1321 }
1322
1323 static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
1324 {
1325         unsigned i;
1326         struct writeback_struct *wb;
1327         struct wc_entry *e;
1328         unsigned long n_walked = 0;
1329
1330         do {
1331                 wb = list_entry(list->next, struct writeback_struct, endio_entry);
1332                 list_del(&wb->endio_entry);
1333
1334                 if (unlikely(wb->bio.bi_status != BLK_STS_OK))
1335                         writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
1336                                         "write error %d", wb->bio.bi_status);
1337                 i = 0;
1338                 do {
1339                         e = wb->wc_list[i];
1340                         BUG_ON(!e->write_in_progress);
1341                         e->write_in_progress = false;
1342                         INIT_LIST_HEAD(&e->lru);
1343                         if (!writecache_has_error(wc))
1344                                 writecache_free_entry(wc, e);
1345                         BUG_ON(!wc->writeback_size);
1346                         wc->writeback_size--;
1347                         n_walked++;
1348                         if (unlikely(n_walked >= ENDIO_LATENCY)) {
1349                                 writecache_commit_flushed(wc);
1350                                 wc_unlock(wc);
1351                                 wc_lock(wc);
1352                                 n_walked = 0;
1353                         }
1354                 } while (++i < wb->wc_list_n);
1355
1356                 if (wb->wc_list != wb->wc_list_inline)
1357                         kfree(wb->wc_list);
1358                 bio_put(&wb->bio);
1359         } while (!list_empty(list));
1360 }
1361
1362 static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
1363 {
1364         struct copy_struct *c;
1365         struct wc_entry *e;
1366
1367         do {
1368                 c = list_entry(list->next, struct copy_struct, endio_entry);
1369                 list_del(&c->endio_entry);
1370
1371                 if (unlikely(c->error))
1372                         writecache_error(wc, c->error, "copy error");
1373
1374                 e = c->e;
1375                 do {
1376                         BUG_ON(!e->write_in_progress);
1377                         e->write_in_progress = false;
1378                         INIT_LIST_HEAD(&e->lru);
1379                         if (!writecache_has_error(wc))
1380                                 writecache_free_entry(wc, e);
1381
1382                         BUG_ON(!wc->writeback_size);
1383                         wc->writeback_size--;
1384                         e++;
1385                 } while (--c->n_entries);
1386                 mempool_free(c, &wc->copy_pool);
1387         } while (!list_empty(list));
1388 }
1389
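/*
 * The endio thread collects completed writeback bios (pmem) or kcopyd
 * copies (SSD) from the endio list, flushes the origin device when FUA
 * writes are not used, frees the written-back entries and commits the
 * metadata.
 */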
1390 static int writecache_endio_thread(void *data)
1391 {
1392         struct dm_writecache *wc = data;
1393
1394         while (1) {
1395                 struct list_head list;
1396
1397                 raw_spin_lock_irq(&wc->endio_list_lock);
1398                 if (!list_empty(&wc->endio_list))
1399                         goto pop_from_list;
1400                 set_current_state(TASK_INTERRUPTIBLE);
1401                 raw_spin_unlock_irq(&wc->endio_list_lock);
1402
1403                 if (unlikely(kthread_should_stop())) {
1404                         set_current_state(TASK_RUNNING);
1405                         break;
1406                 }
1407
1408                 schedule();
1409
1410                 continue;
1411
1412 pop_from_list:
1413                 list = wc->endio_list;
1414                 list.next->prev = list.prev->next = &list;
1415                 INIT_LIST_HEAD(&wc->endio_list);
1416                 raw_spin_unlock_irq(&wc->endio_list_lock);
1417
1418                 if (!WC_MODE_FUA(wc))
1419                         writecache_disk_flush(wc, wc->dev);
1420
1421                 wc_lock(wc);
1422
1423                 if (WC_MODE_PMEM(wc)) {
1424                         __writecache_endio_pmem(wc, &list);
1425                 } else {
1426                         __writecache_endio_ssd(wc, &list);
1427                         writecache_wait_for_ios(wc, READ);
1428                 }
1429
1430                 writecache_commit_flushed(wc);
1431
1432                 wc_unlock(wc);
1433         }
1434
1435         return 0;
1436 }
1437
1438 static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp)
1439 {
1440         struct dm_writecache *wc = wb->wc;
1441         unsigned block_size = wc->block_size;
1442         void *address = memory_data(wc, e);
1443
1444         persistent_memory_flush_cache(address, block_size);
1445         return bio_add_page(&wb->bio, persistent_memory_page(address),
1446                             block_size, persistent_memory_page_offset(address)) != 0;
1447 }
1448
1449 struct writeback_list {
1450         struct list_head list;
1451         size_t size;
1452 };
1453
1454 static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
1455 {
1456         if (unlikely(wc->max_writeback_jobs)) {
1457                 if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
1458                         wc_lock(wc);
1459                         while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
1460                                 writecache_wait_on_freelist(wc);
1461                         wc_unlock(wc);
1462                 }
1463         }
1464         cond_resched();
1465 }
1466
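/*
 * Writeback for pmem mode: each batch becomes a single bio whose pages point
 * directly at the persistent-memory data blocks; the bio keeps growing while
 * the entries cover consecutive original sectors.
 */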
1467 static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
1468 {
1469         struct wc_entry *e, *f;
1470         struct bio *bio;
1471         struct writeback_struct *wb;
1472         unsigned max_pages;
1473
1474         while (wbl->size) {
1475                 wbl->size--;
1476                 e = container_of(wbl->list.prev, struct wc_entry, lru);
1477                 list_del(&e->lru);
1478
1479                 max_pages = e->wc_list_contiguous;
1480
1481                 bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
1482                 wb = container_of(bio, struct writeback_struct, bio);
1483                 wb->wc = wc;
1484                 wb->bio.bi_end_io = writecache_writeback_endio;
1485                 bio_set_dev(&wb->bio, wc->dev->bdev);
1486                 wb->bio.bi_iter.bi_sector = read_original_sector(wc, e);
1487                 wb->page_offset = PAGE_SIZE;
1488                 if (max_pages <= WB_LIST_INLINE ||
1489                     unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
1490                                                            GFP_NOIO | __GFP_NORETRY |
1491                                                            __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
1492                         wb->wc_list = wb->wc_list_inline;
1493                         max_pages = WB_LIST_INLINE;
1494                 }
1495
1496                 BUG_ON(!wc_add_block(wb, e, GFP_NOIO));
1497
1498                 wb->wc_list[0] = e;
1499                 wb->wc_list_n = 1;
1500
1501                 while (wbl->size && wb->wc_list_n < max_pages) {
1502                         f = container_of(wbl->list.prev, struct wc_entry, lru);
1503                         if (read_original_sector(wc, f) !=
1504                             read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
1505                                 break;
1506                         if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN))
1507                                 break;
1508                         wbl->size--;
1509                         list_del(&f->lru);
1510                         wb->wc_list[wb->wc_list_n++] = f;
1511                         e = f;
1512                 }
1513                 bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
1514                 if (writecache_has_error(wc)) {
1515                         bio->bi_status = BLK_STS_IOERR;
1516                         bio_endio(&wb->bio);
1517                 } else {
1518                         submit_bio(&wb->bio);
1519                 }
1520
1521                 __writeback_throttle(wc, wbl);
1522         }
1523 }
1524
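/*
 * SSD-mode writeback: hand each contiguous run of cache blocks to dm-kcopyd,
 * which copies it from the cache device to the origin device and calls
 * writecache_copy_endio() when the copy finishes.
 */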
1525 static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
1526 {
1527         struct wc_entry *e, *f;
1528         struct dm_io_region from, to;
1529         struct copy_struct *c;
1530
1531         while (wbl->size) {
1532                 unsigned n_sectors;
1533
1534                 wbl->size--;
1535                 e = container_of(wbl->list.prev, struct wc_entry, lru);
1536                 list_del(&e->lru);
1537
1538                 n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);
1539
1540                 from.bdev = wc->ssd_dev->bdev;
1541                 from.sector = cache_sector(wc, e);
1542                 from.count = n_sectors;
1543                 to.bdev = wc->dev->bdev;
1544                 to.sector = read_original_sector(wc, e);
1545                 to.count = n_sectors;
1546
1547                 c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
1548                 c->wc = wc;
1549                 c->e = e;
1550                 c->n_entries = e->wc_list_contiguous;
1551
1552                 while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
1553                         wbl->size--;
1554                         f = container_of(wbl->list.prev, struct wc_entry, lru);
1555                         BUG_ON(f != e + 1);
1556                         list_del(&f->lru);
1557                         e = f;
1558                 }
1559
1560                 dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
1561
1562                 __writeback_throttle(wc, wbl);
1563         }
1564 }
1565
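/*
 * Writeback work item.  Scans wc->lru from the oldest end, collects committed
 * entries (flushing first if the oldest entry is not yet committed), coalesces
 * blocks that are contiguous on the origin device, and dispatches the batch
 * through the pmem or SSD writeback path outside the lock.
 */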
1566 static void writecache_writeback(struct work_struct *work)
1567 {
1568         struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
1569         struct blk_plug plug;
1570         struct wc_entry *e, *f, *g;
1571         struct rb_node *node, *next_node;
1572         struct list_head skipped;
1573         struct writeback_list wbl;
1574         unsigned long n_walked;
1575
1576         wc_lock(wc);
1577 restart:
1578         if (writecache_has_error(wc)) {
1579                 wc_unlock(wc);
1580                 return;
1581         }
1582
1583         if (unlikely(wc->writeback_all)) {
1584                 if (writecache_wait_for_writeback(wc))
1585                         goto restart;
1586         }
1587
1588         if (wc->overwrote_committed) {
1589                 writecache_wait_for_ios(wc, WRITE);
1590         }
1591
1592         n_walked = 0;
1593         INIT_LIST_HEAD(&skipped);
1594         INIT_LIST_HEAD(&wbl.list);
1595         wbl.size = 0;
1596         while (!list_empty(&wc->lru) &&
1597                (wc->writeback_all ||
1598                 wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark)) {
1599
1600                 n_walked++;
1601                 if (unlikely(n_walked > WRITEBACK_LATENCY) &&
1602                     likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) {
1603                         queue_work(wc->writeback_wq, &wc->writeback_work);
1604                         break;
1605                 }
1606
1607                 e = container_of(wc->lru.prev, struct wc_entry, lru);
1608                 BUG_ON(e->write_in_progress);
1609                 if (unlikely(!writecache_entry_is_committed(wc, e))) {
1610                         writecache_flush(wc);
1611                 }
1612                 node = rb_prev(&e->rb_node);
1613                 if (node) {
1614                         f = container_of(node, struct wc_entry, rb_node);
1615                         if (unlikely(read_original_sector(wc, f) ==
1616                                      read_original_sector(wc, e))) {
1617                                 BUG_ON(!f->write_in_progress);
1618                                 list_del(&e->lru);
1619                                 list_add(&e->lru, &skipped);
1620                                 cond_resched();
1621                                 continue;
1622                         }
1623                 }
1624                 wc->writeback_size++;
1625                 list_del(&e->lru);
1626                 list_add(&e->lru, &wbl.list);
1627                 wbl.size++;
1628                 e->write_in_progress = true;
1629                 e->wc_list_contiguous = 1;
1630
1631                 f = e;
1632
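                /*
                 * Try to extend the run with entries that are contiguous on
                 * the origin device (and, in SSD mode, also adjacent in the
                 * cache), so that the run can be written back as one bio or
                 * one kcopyd job.
                 */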
1633                 while (1) {
1634                         next_node = rb_next(&f->rb_node);
1635                         if (unlikely(!next_node))
1636                                 break;
1637                         g = container_of(next_node, struct wc_entry, rb_node);
1638                         if (read_original_sector(wc, g) ==
1639                             read_original_sector(wc, f)) {
1640                                 f = g;
1641                                 continue;
1642                         }
1643                         if (read_original_sector(wc, g) !=
1644                             read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
1645                                 break;
1646                         if (unlikely(g->write_in_progress))
1647                                 break;
1648                         if (unlikely(!writecache_entry_is_committed(wc, g)))
1649                                 break;
1650
1651                         if (!WC_MODE_PMEM(wc)) {
1652                                 if (g != f + 1)
1653                                         break;
1654                         }
1655
1656                         n_walked++;
1657                         //if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
1658                         //      break;
1659
1660                         wc->writeback_size++;
1661                         list_del(&g->lru);
1662                         list_add(&g->lru, &wbl.list);
1663                         wbl.size++;
1664                         g->write_in_progress = true;
1665                         g->wc_list_contiguous = BIO_MAX_PAGES;
1666                         f = g;
1667                         e->wc_list_contiguous++;
1668                         if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES))
1669                                 break;
1670                 }
1671                 cond_resched();
1672         }
1673
1674         if (!list_empty(&skipped)) {
1675                 list_splice_tail(&skipped, &wc->lru);
1676                 /*
1677                  * If we didn't make any progress, we must wait until some
1678                  * writeback finishes to avoid burning CPU in a loop
1679                  */
1680                 if (unlikely(!wbl.size))
1681                         writecache_wait_for_writeback(wc);
1682         }
1683
1684         wc_unlock(wc);
1685
1686         blk_start_plug(&plug);
1687
1688         if (WC_MODE_PMEM(wc))
1689                 __writecache_writeback_pmem(wc, &wbl);
1690         else
1691                 __writecache_writeback_ssd(wc, &wbl);
1692
1693         blk_finish_plug(&plug);
1694
1695         if (unlikely(wc->writeback_all)) {
1696                 wc_lock(wc);
1697                 while (writecache_wait_for_writeback(wc));
1698                 wc_unlock(wc);
1699         }
1700 }
1701
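/*
 * Work out how many cache blocks fit on a cache device of the given size.
 * Each block needs block_size bytes of data plus one struct wc_memory_entry
 * (16 bytes) of metadata, and the metadata area (superblock plus entry array)
 * is rounded up to a block boundary.  For example, a 1 GiB cache device with
 * 4096-byte blocks works out to 261123 data blocks and 1021 metadata blocks.
 */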
1702 static int calculate_memory_size(uint64_t device_size, unsigned block_size,
1703                                  size_t *n_blocks_p, size_t *n_metadata_blocks_p)
1704 {
1705         uint64_t n_blocks, offset;
1706         struct wc_entry e;
1707
1708         n_blocks = device_size;
1709         do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));
1710
1711         while (1) {
1712                 if (!n_blocks)
1713                         return -ENOSPC;
1714                 /* Verify that computing the offset of entries[n_blocks] below won't overflow */
1715                 if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
1716                                  sizeof(struct wc_memory_entry)))
1717                         return -EFBIG;
1718                 offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
1719                 offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
1720                 if (offset + n_blocks * block_size <= device_size)
1721                         break;
1722                 n_blocks--;
1723         }
1724
1725         /* check that n_blocks fits in the wc_entry.index bit field */
1726         e.index = n_blocks;
1727         if (e.index != n_blocks)
1728                 return -EFBIG;
1729
1730         if (n_blocks_p)
1731                 *n_blocks_p = n_blocks;
1732         if (n_metadata_blocks_p)
1733                 *n_metadata_blocks_p = offset >> __ffs(block_size);
1734         return 0;
1735 }
1736
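/*
 * Format a fresh cache: compute the geometry, fill in the superblock fields
 * and mark every entry free (original_sector and seq_count of -1).  The magic
 * number is written and flushed last, so that a crash in the middle of
 * formatting leaves the device looking uninitialized rather than corrupt.
 */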
1737 static int init_memory(struct dm_writecache *wc)
1738 {
1739         size_t b;
1740         int r;
1741
1742         r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
1743         if (r)
1744                 return r;
1745
1746         r = writecache_alloc_entries(wc);
1747         if (r)
1748                 return r;
1749
1750         for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
1751                 pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
1752         pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
1753         pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
1754         pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
1755         pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
1756
1757         for (b = 0; b < wc->n_blocks; b++)
1758                 write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
1759
1760         writecache_flush_all_metadata(wc);
1761         writecache_commit_flushed(wc);
1762         pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
1763         writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
1764         writecache_commit_flushed(wc);
1765
1766         return 0;
1767 }
1768
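/*
 * Destructor.  Also used as the error path of the constructor, so every
 * resource is checked before it is released.
 */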
1769 static void writecache_dtr(struct dm_target *ti)
1770 {
1771         struct dm_writecache *wc = ti->private;
1772
1773         if (!wc)
1774                 return;
1775
1776         if (wc->endio_thread)
1777                 kthread_stop(wc->endio_thread);
1778
1779         if (wc->flush_thread)
1780                 kthread_stop(wc->flush_thread);
1781
1782         bioset_exit(&wc->bio_set);
1783
1784         mempool_exit(&wc->copy_pool);
1785
1786         if (wc->writeback_wq)
1787                 destroy_workqueue(wc->writeback_wq);
1788
1789         if (wc->dev)
1790                 dm_put_device(ti, wc->dev);
1791
1792         if (wc->ssd_dev)
1793                 dm_put_device(ti, wc->ssd_dev);
1794
1795         if (wc->entries)
1796                 vfree(wc->entries);
1797
1798         if (wc->memory_map) {
1799                 if (WC_MODE_PMEM(wc))
1800                         persistent_memory_release(wc);
1801                 else
1802                         vfree(wc->memory_map);
1803         }
1804
1805         if (wc->dm_kcopyd)
1806                 dm_kcopyd_client_destroy(wc->dm_kcopyd);
1807
1808         if (wc->dm_io)
1809                 dm_io_client_destroy(wc->dm_io);
1810
1811         if (wc->dirty_bitmap)
1812                 vfree(wc->dirty_bitmap);
1813
1814         kfree(wc);
1815 }
1816
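/*
 * Constructor.  Table line:
 *   writecache <p|s> <origin dev> <cache dev> <block size> <#opt args> [<opt args>...]
 * The optional arguments are start_sector <sector>, high_watermark <percent>,
 * low_watermark <percent>, writeback_jobs <count>, autocommit_blocks <count>,
 * autocommit_time <msecs>, fua and nofua (the last two only in pmem mode).
 * For example, with hypothetical devices:
 *   dmsetup create wc --table "0 `blockdev --getsz /dev/sdb` writecache s /dev/sdb /dev/nvme0n1p1 4096 2 high_watermark 60"
 */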
1817 static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
1818 {
1819         struct dm_writecache *wc;
1820         struct dm_arg_set as;
1821         const char *string;
1822         unsigned opt_params;
1823         size_t offset, data_size;
1824         int i, r;
1825         char dummy;
1826         int high_wm_percent = HIGH_WATERMARK;
1827         int low_wm_percent = LOW_WATERMARK;
1828         uint64_t x;
1829         struct wc_memory_superblock s;
1830
1831         static struct dm_arg _args[] = {
1832                 {0, 10, "Invalid number of feature args"},
1833         };
1834
1835         as.argc = argc;
1836         as.argv = argv;
1837
1838         wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
1839         if (!wc) {
1840                 ti->error = "Cannot allocate writecache structure";
1841                 r = -ENOMEM;
1842                 goto bad;
1843         }
1844         ti->private = wc;
1845         wc->ti = ti;
1846
1847         mutex_init(&wc->lock);
1848         writecache_poison_lists(wc);
1849         init_waitqueue_head(&wc->freelist_wait);
1850         timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
1851
1852         for (i = 0; i < 2; i++) {
1853                 atomic_set(&wc->bio_in_progress[i], 0);
1854                 init_waitqueue_head(&wc->bio_in_progress_wait[i]);
1855         }
1856
1857         wc->dm_io = dm_io_client_create();
1858         if (IS_ERR(wc->dm_io)) {
1859                 r = PTR_ERR(wc->dm_io);
1860                 ti->error = "Unable to allocate dm-io client";
1861                 wc->dm_io = NULL;
1862                 goto bad;
1863         }
1864
1865         wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
1866         if (!wc->writeback_wq) {
1867                 r = -ENOMEM;
1868                 ti->error = "Could not allocate writeback workqueue";
1869                 goto bad;
1870         }
1871         INIT_WORK(&wc->writeback_work, writecache_writeback);
1872         INIT_WORK(&wc->flush_work, writecache_flush_work);
1873
1874         raw_spin_lock_init(&wc->endio_list_lock);
1875         INIT_LIST_HEAD(&wc->endio_list);
1876         wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
1877         if (IS_ERR(wc->endio_thread)) {
1878                 r = PTR_ERR(wc->endio_thread);
1879                 wc->endio_thread = NULL;
1880                 ti->error = "Couldn't spawn endio thread";
1881                 goto bad;
1882         }
1883         wake_up_process(wc->endio_thread);
1884
1885         /*
1886          * Parse the mode (pmem or ssd)
1887          */
1888         string = dm_shift_arg(&as);
1889         if (!string)
1890                 goto bad_arguments;
1891
1892         if (!strcasecmp(string, "s")) {
1893                 wc->pmem_mode = false;
1894         } else if (!strcasecmp(string, "p")) {
1895 #ifdef DM_WRITECACHE_HAS_PMEM
1896                 wc->pmem_mode = true;
1897                 wc->writeback_fua = true;
1898 #else
1899                 /*
1900                  * If the architecture doesn't support persistent memory or
1901                  * the kernel doesn't support any DAX drivers, this driver can
1902                  * only be used in SSD-only mode.
1903                  */
1904                 r = -EOPNOTSUPP;
1905                 ti->error = "Persistent memory or DAX not supported on this system";
1906                 goto bad;
1907 #endif
1908         } else {
1909                 goto bad_arguments;
1910         }
1911
1912         if (WC_MODE_PMEM(wc)) {
1913                 r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
1914                                 offsetof(struct writeback_struct, bio),
1915                                 BIOSET_NEED_BVECS);
1916                 if (r) {
1917                         ti->error = "Could not allocate bio set";
1918                         goto bad;
1919                 }
1920         } else {
1921                 r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
1922                 if (r) {
1923                         ti->error = "Could not allocate mempool";
1924                         goto bad;
1925                 }
1926         }
1927
1928         /*
1929          * Parse the origin data device
1930          */
1931         string = dm_shift_arg(&as);
1932         if (!string)
1933                 goto bad_arguments;
1934         r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
1935         if (r) {
1936                 ti->error = "Origin data device lookup failed";
1937                 goto bad;
1938         }
1939
1940         /*
1941          * Parse cache data device (be it pmem or ssd)
1942          */
1943         string = dm_shift_arg(&as);
1944         if (!string)
1945                 goto bad_arguments;
1946
1947         r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
1948         if (r) {
1949                 ti->error = "Cache data device lookup failed";
1950                 goto bad;
1951         }
1952         wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);
1953
1954         /*
1955          * Parse the cache block size
1956          */
1957         string = dm_shift_arg(&as);
1958         if (!string)
1959                 goto bad_arguments;
1960         if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
1961             wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
1962             (wc->block_size & (wc->block_size - 1))) {
1963                 r = -EINVAL;
1964                 ti->error = "Invalid block size";
1965                 goto bad;
1966         }
1967         wc->block_size_bits = __ffs(wc->block_size);
1968
1969         wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
1970         wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
1971         wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);
1972
1973         /*
1974          * Parse optional arguments
1975          */
1976         r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
1977         if (r)
1978                 goto bad;
1979
1980         while (opt_params) {
1981                 string = dm_shift_arg(&as), opt_params--;
1982                 if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
1983                         unsigned long long start_sector;
1984                         string = dm_shift_arg(&as), opt_params--;
1985                         if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
1986                                 goto invalid_optional;
1987                         wc->start_sector = start_sector;
1988                         if (wc->start_sector != start_sector ||
1989                             wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
1990                                 goto invalid_optional;
1991                 } else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
1992                         string = dm_shift_arg(&as), opt_params--;
1993                         if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
1994                                 goto invalid_optional;
1995                         if (high_wm_percent < 0 || high_wm_percent > 100)
1996                                 goto invalid_optional;
1997                         wc->high_wm_percent_set = true;
1998                 } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
1999                         string = dm_shift_arg(&as), opt_params--;
2000                         if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
2001                                 goto invalid_optional;
2002                         if (low_wm_percent < 0 || low_wm_percent > 100)
2003                                 goto invalid_optional;
2004                         wc->low_wm_percent_set = true;
2005                 } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
2006                         string = dm_shift_arg(&as), opt_params--;
2007                         if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
2008                                 goto invalid_optional;
2009                         wc->max_writeback_jobs_set = true;
2010                 } else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
2011                         string = dm_shift_arg(&as), opt_params--;
2012                         if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
2013                                 goto invalid_optional;
2014                         wc->autocommit_blocks_set = true;
2015                 } else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
2016                         unsigned autocommit_msecs;
2017                         string = dm_shift_arg(&as), opt_params--;
2018                         if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
2019                                 goto invalid_optional;
2020                         if (autocommit_msecs > 3600000)
2021                                 goto invalid_optional;
2022                         wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
2023                         wc->autocommit_time_set = true;
2024                 } else if (!strcasecmp(string, "fua")) {
2025                         if (WC_MODE_PMEM(wc)) {
2026                                 wc->writeback_fua = true;
2027                                 wc->writeback_fua_set = true;
2028                         } else goto invalid_optional;
2029                 } else if (!strcasecmp(string, "nofua")) {
2030                         if (WC_MODE_PMEM(wc)) {
2031                                 wc->writeback_fua = false;
2032                                 wc->writeback_fua_set = true;
2033                         } else goto invalid_optional;
2034                 } else {
2035 invalid_optional:
2036                         r = -EINVAL;
2037                         ti->error = "Invalid optional argument";
2038                         goto bad;
2039                 }
2040         }
2041
2042         if (high_wm_percent < low_wm_percent) {
2043                 r = -EINVAL;
2044                 ti->error = "High watermark must be greater than or equal to low watermark";
2045                 goto bad;
2046         }
2047
2048         if (WC_MODE_PMEM(wc)) {
2049                 r = persistent_memory_claim(wc);
2050                 if (r) {
2051                         ti->error = "Unable to map persistent memory for cache";
2052                         goto bad;
2053                 }
2054         } else {
2055                 struct dm_io_region region;
2056                 struct dm_io_request req;
2057                 size_t n_blocks, n_metadata_blocks;
2058                 uint64_t n_bitmap_bits;
2059
2060                 wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;
2061
2062                 bio_list_init(&wc->flush_list);
2063                 wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush");
2064                 if (IS_ERR(wc->flush_thread)) {
2065                         r = PTR_ERR(wc->flush_thread);
2066                         wc->flush_thread = NULL;
2067                         ti->error = "Couldn't spawn flush thread";
2068                         goto bad;
2069                 }
2070                 wake_up_process(wc->flush_thread);
2071
2072                 r = calculate_memory_size(wc->memory_map_size, wc->block_size,
2073                                           &n_blocks, &n_metadata_blocks);
2074                 if (r) {
2075                         ti->error = "Invalid device size";
2076                         goto bad;
2077                 }
2078
2079                 n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
2080                                  BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
2081                 /* this is a limitation of the test_bit functions */
2082                 if (n_bitmap_bits > 1U << 31) {
2083                         r = -EFBIG;
2084                         ti->error = "Invalid device size";
2085                         goto bad;
2086                 }
2087
2088                 wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
2089                 if (!wc->memory_map) {
2090                         r = -ENOMEM;
2091                         ti->error = "Unable to allocate memory for metadata";
2092                         goto bad;
2093                 }
2094
2095                 wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2096                 if (IS_ERR(wc->dm_kcopyd)) {
2097                         r = PTR_ERR(wc->dm_kcopyd);
2098                         ti->error = "Unable to allocate dm-kcopyd client";
2099                         wc->dm_kcopyd = NULL;
2100                         goto bad;
2101                 }
2102
2103                 wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
2104                 wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
2105                         BITS_PER_LONG * sizeof(unsigned long);
2106                 wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
2107                 if (!wc->dirty_bitmap) {
2108                         r = -ENOMEM;
2109                         ti->error = "Unable to allocate dirty bitmap";
2110                         goto bad;
2111                 }
2112
2113                 region.bdev = wc->ssd_dev->bdev;
2114                 region.sector = wc->start_sector;
2115                 region.count = wc->metadata_sectors;
2116                 req.bi_op = REQ_OP_READ;
2117                 req.bi_op_flags = REQ_SYNC;
2118                 req.mem.type = DM_IO_VMA;
2119                 req.mem.ptr.vma = (char *)wc->memory_map;
2120                 req.client = wc->dm_io;
2121                 req.notify.fn = NULL;
2122
2123                 r = dm_io(&req, 1, &region, NULL);
2124                 if (r) {
2125                         ti->error = "Unable to read metadata";
2126                         goto bad;
2127                 }
2128         }
2129
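        /*
         * Read the superblock with memcpy_mcsafe() so that an uncorrectable
         * memory error on persistent memory is reported as a failure instead
         * of taking the machine down.
         */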
2130         r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
2131         if (r) {
2132                 ti->error = "Hardware memory error when reading superblock";
2133                 goto bad;
2134         }
2135         if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
2136                 r = init_memory(wc);
2137                 if (r) {
2138                         ti->error = "Unable to initialize device";
2139                         goto bad;
2140                 }
2141                 r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
2142                 if (r) {
2143                         ti->error = "Hardware memory error when reading superblock";
2144                         goto bad;
2145                 }
2146         }
2147
2148         if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
2149                 ti->error = "Invalid magic in the superblock";
2150                 r = -EINVAL;
2151                 goto bad;
2152         }
2153
2154         if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
2155                 ti->error = "Invalid version in the superblock";
2156                 r = -EINVAL;
2157                 goto bad;
2158         }
2159
2160         if (le32_to_cpu(s.block_size) != wc->block_size) {
2161                 ti->error = "Block size does not match superblock";
2162                 r = -EINVAL;
2163                 goto bad;
2164         }
2165
2166         wc->n_blocks = le64_to_cpu(s.n_blocks);
2167
2168         offset = wc->n_blocks * sizeof(struct wc_memory_entry);
2169         if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
2170 overflow:
2171                 ti->error = "Overflow in size calculation";
2172                 r = -EINVAL;
2173                 goto bad;
2174         }
2175         offset += sizeof(struct wc_memory_superblock);
2176         if (offset < sizeof(struct wc_memory_superblock))
2177                 goto overflow;
2178         offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
2179         data_size = wc->n_blocks * (size_t)wc->block_size;
2180         if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
2181             (offset + data_size < offset))
2182                 goto overflow;
2183         if (offset + data_size > wc->memory_map_size) {
2184                 ti->error = "Memory area is too small";
2185                 r = -EINVAL;
2186                 goto bad;
2187         }
2188
2189         wc->metadata_sectors = offset >> SECTOR_SHIFT;
2190         wc->block_start = (char *)sb(wc) + offset;
2191
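        /*
         * The watermarks are specified as a percentage of used blocks, but
         * internally they are tracked as a number of free blocks:
         * freelist_watermark = n_blocks * (100 - percent) / 100, rounded to
         * the nearest block.
         */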
2192         x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
2193         x += 50;
2194         do_div(x, 100);
2195         wc->freelist_high_watermark = x;
2196         x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
2197         x += 50;
2198         do_div(x, 100);
2199         wc->freelist_low_watermark = x;
2200
2201         r = writecache_alloc_entries(wc);
2202         if (r) {
2203                 ti->error = "Cannot allocate memory";
2204                 goto bad;
2205         }
2206
2207         ti->num_flush_bios = 1;
2208         ti->flush_supported = true;
2209         ti->num_discard_bios = 1;
2210
2211         if (WC_MODE_PMEM(wc))
2212                 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
2213
2214         return 0;
2215
2216 bad_arguments:
2217         r = -EINVAL;
2218         ti->error = "Bad arguments";
2219 bad:
2220         writecache_dtr(ti);
2221         return r;
2222 }
2223
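/*
 * Status reporting.  STATUSTYPE_INFO emits "<error> <total blocks> <free
 * blocks> <blocks under writeback>"; STATUSTYPE_TABLE reconstructs the
 * constructor arguments, including only the optional arguments that were
 * explicitly set.
 */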
2224 static void writecache_status(struct dm_target *ti, status_type_t type,
2225                               unsigned status_flags, char *result, unsigned maxlen)
2226 {
2227         struct dm_writecache *wc = ti->private;
2228         unsigned extra_args;
2229         unsigned sz = 0;
2230         uint64_t x;
2231
2232         switch (type) {
2233         case STATUSTYPE_INFO:
2234                 DMEMIT("%ld %llu %llu %llu", writecache_has_error(wc),
2235                        (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
2236                        (unsigned long long)wc->writeback_size);
2237                 break;
2238         case STATUSTYPE_TABLE:
2239                 DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
2240                                 wc->dev->name, wc->ssd_dev->name, wc->block_size);
2241                 extra_args = 0;
2242                 if (wc->start_sector)
2243                         extra_args += 2;
2244                 if (wc->high_wm_percent_set)
2245                         extra_args += 2;
2246                 if (wc->low_wm_percent_set)
2247                         extra_args += 2;
2248                 if (wc->max_writeback_jobs_set)
2249                         extra_args += 2;
2250                 if (wc->autocommit_blocks_set)
2251                         extra_args += 2;
2252                 if (wc->autocommit_time_set)
2253                         extra_args += 2;
2254                 if (wc->writeback_fua_set)
2255                         extra_args++;
2256
2257                 DMEMIT("%u", extra_args);
2258                 if (wc->start_sector)
2259                         DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
2260                 if (wc->high_wm_percent_set) {
2261                         x = (uint64_t)wc->freelist_high_watermark * 100;
2262                         x += wc->n_blocks / 2;
2263                         do_div(x, (size_t)wc->n_blocks);
2264                         DMEMIT(" high_watermark %u", 100 - (unsigned)x);
2265                 }
2266                 if (wc->low_wm_percent_set) {
2267                         x = (uint64_t)wc->freelist_low_watermark * 100;
2268                         x += wc->n_blocks / 2;
2269                         do_div(x, (size_t)wc->n_blocks);
2270                         DMEMIT(" low_watermark %u", 100 - (unsigned)x);
2271                 }
2272                 if (wc->max_writeback_jobs_set)
2273                         DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
2274                 if (wc->autocommit_blocks_set)
2275                         DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
2276                 if (wc->autocommit_time_set)
2277                         DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
2278                 if (wc->writeback_fua_set)
2279                         DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
2280                 break;
2281         }
2282 }
2283
2284 static struct target_type writecache_target = {
2285         .name                   = "writecache",
2286         .version                = {1, 1, 1},
2287         .module                 = THIS_MODULE,
2288         .ctr                    = writecache_ctr,
2289         .dtr                    = writecache_dtr,
2290         .status                 = writecache_status,
2291         .postsuspend            = writecache_suspend,
2292         .resume                 = writecache_resume,
2293         .message                = writecache_message,
2294         .map                    = writecache_map,
2295         .end_io                 = writecache_end_io,
2296         .iterate_devices        = writecache_iterate_devices,
2297         .io_hints               = writecache_io_hints,
2298 };
2299
2300 static int __init dm_writecache_init(void)
2301 {
2302         int r;
2303
2304         r = dm_register_target(&writecache_target);
2305         if (r < 0) {
2306                 DMERR("register failed %d", r);
2307                 return r;
2308         }
2309
2310         return 0;
2311 }
2312
2313 static void __exit dm_writecache_exit(void)
2314 {
2315         dm_unregister_target(&writecache_target);
2316 }
2317
2318 module_init(dm_writecache_init);
2319 module_exit(dm_writecache_exit);
2320
2321 MODULE_DESCRIPTION(DM_NAME " writecache target");
2322 MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2323 MODULE_LICENSE("GPL");