lightnvm: physical block device (pblk) target

author Javier González <jg@lightnvm.io>

Sat, 15 Apr 2017 18:55:50 +0000 (20:55 +0200)

committer Jens Axboe <axboe@fb.com>

Sun, 16 Apr 2017 16:06:33 +0000 (10:06 -0600)
author Javier González <jg@lightnvm.io>
Sat, 15 Apr 2017 18:55:50 +0000 (20:55 +0200)
committer Jens Axboe <axboe@fb.com>
Sun, 16 Apr 2017 16:06:33 +0000 (10:06 -0600)
diff --git a/Documentation/lightnvm/pblk.txt b/Documentation/lightnvm/pblk.txt

new file mode 100644 (file)

index 0000000..1040ed1
--- /dev/null
+++ b/Documentation/lightnvm/pblk.txt
@@ -0,0 +1,21 @@
+pblk: Physical Block Device Target
+==================================
+
+pblk implements a fully associative, host-based FTL that exposes a traditional
+block I/O interface. Its primary responsibilities are:
+
+  - Map logical addresses onto physical addresses (4KB granularity) in a
+    logical-to-physical (L2P) table.
+  - Maintain the integrity and consistency of the L2P table as well as its
+    recovery from normal tear down and power outage.
+  - Deal with controller- and media-specific constraints.
+  - Handle I/O errors.
+  - Implement garbage collection.
+  - Maintain consistency across the I/O stack during synchronization points.
+
+For more information please refer to:
+
+  http://lightnvm.io
+
+which maintains updated FAQs, manual pages, technical documentation, tools,
+contacts, etc.
diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig

index 0527141..ead61a9 100644 (file)
--- a/drivers/lightnvm/Kconfig
+++ b/drivers/lightnvm/Kconfig
@@ -33,4 +33,13 @@ config NVM_RRPC
         host. The target is implemented using a linear mapping table and
         cost-based garbage collection. It is optimized for 4K IO sizes.
  
+config NVM_PBLK
+       tristate "Physical Block Device Open-Channel SSD target"
+       ---help---
+       Allows an open-channel SSD to be exposed as a block device to the
+       host. The target assumes the device exposes raw flash and must be
+       explicitly managed by the host.
+
+       Please note the disk format is considered EXPERIMENTAL for now.
+
  endif # NVM
diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile

index b2a39e2..82d1a11 100644 (file)
--- a/drivers/lightnvm/Makefile
+++ b/drivers/lightnvm/Makefile
@@ -4,3 +4,8 @@
  
  obj-$(CONFIG_NVM)              := core.o
  obj-$(CONFIG_NVM_RRPC)         += rrpc.o
+obj-$(CONFIG_NVM_PBLK)         += pblk.o
+pblk-y                         := pblk-init.o pblk-core.o pblk-rb.o \
+                                  pblk-write.o pblk-cache.o pblk-read.o \
+                                  pblk-gc.o pblk-recovery.o pblk-map.o \
+                                  pblk-rl.o pblk-sysfs.o
diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c

new file mode 100644 (file)

index 0000000..59bcea8
--- /dev/null
+++ b/drivers/lightnvm/pblk-cache.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *                  Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * pblk-cache.c - pblk's write cache
+ */
+
+#include "pblk.h"
+
+int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
+{
+       struct pblk_w_ctx w_ctx;
+       sector_t lba = pblk_get_lba(bio);
+       unsigned int bpos, pos;
+       int nr_entries = pblk_get_secs(bio);
+       int i, ret;
+
+       /* Update the write buffer head (mem) with the entries that we can
+        * write. The write in itself cannot fail, so there is no need to
+        * rollback from here on.
+        */
+retry:
+       ret = pblk_rb_may_write_user(&pblk->rwb, bio, nr_entries, &bpos);
+       if (ret == NVM_IO_REQUEUE) {
+               io_schedule();
+               goto retry;
+       }
+
+       if (unlikely(!bio_has_data(bio)))
+               goto out;
+
+       w_ctx.flags = flags;
+       pblk_ppa_set_empty(&w_ctx.ppa);
+
+       for (i = 0; i < nr_entries; i++) {
+               void *data = bio_data(bio);
+
+               w_ctx.lba = lba + i;
+
+               pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + i);
+               pblk_rb_write_entry_user(&pblk->rwb, data, w_ctx, pos);
+
+               bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
+       }
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_add(nr_entries, &pblk->inflight_writes);
+       atomic_long_add(nr_entries, &pblk->req_writes);
+#endif
+
+out:
+       pblk_write_should_kick(pblk);
+       return ret;
+}
+
+/*
+ * On GC the incoming lbas are not necessarily sequential. Also, some of the
+ * lbas might not be valid entries, which are marked as empty by the GC thread
+ */
+int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list,
+                          unsigned int nr_entries, unsigned int nr_rec_entries,
+                          struct pblk_line *gc_line, unsigned long flags)
+{
+       struct pblk_w_ctx w_ctx;
+       unsigned int bpos, pos;
+       int i, valid_entries;
+
+       /* Update the write buffer head (mem) with the entries that we can
+        * write. The write in itself cannot fail, so there is no need to
+        * rollback from here on.
+        */
+retry:
+       if (!pblk_rb_may_write_gc(&pblk->rwb, nr_rec_entries, &bpos)) {
+               io_schedule();
+               goto retry;
+       }
+
+       w_ctx.flags = flags;
+       pblk_ppa_set_empty(&w_ctx.ppa);
+
+       for (i = 0, valid_entries = 0; i < nr_entries; i++) {
+               if (lba_list[i] == ADDR_EMPTY)
+                       continue;
+
+               w_ctx.lba = lba_list[i];
+
+               pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries);
+               pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_line, pos);
+
+               data += PBLK_EXPOSED_PAGE_SIZE;
+               valid_entries++;
+       }
+
+       WARN_ONCE(nr_rec_entries != valid_entries,
+                                       "pblk: inconsistent GC write\n");
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_add(valid_entries, &pblk->inflight_writes);
+       atomic_long_add(valid_entries, &pblk->recov_gc_writes);
+#endif
+
+       pblk_write_should_kick(pblk);
+       return NVM_IO_OK;
+}
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c

new file mode 100644 (file)

index 0000000..a2bcd09
--- /dev/null
+++ b/drivers/lightnvm/pblk-core.c
@@ -0,0 +1,1655 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *                  Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * pblk-core.c - pblk's core functionality
+ *
+ */
+
+#include "pblk.h"
+#include <linux/time.h>
+
+/*
+ * Account for a failed erase: bump the erase_failed counter, set the block's
+ * bit in the line's blk_bitmap (complaining if it was already set) and pass
+ * @ppa together with pblk_line_mark_bb to pblk_line_run_ws().
+ */
+static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
+                        struct ppa_addr *ppa)
+{
+       int blk_pos = pblk_dev_ppa_to_pos(&pblk->dev->geo, *ppa);
+       int already_bad;
+
+       pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, blk_pos);
+       atomic_long_inc(&pblk->erase_failed);
+
+       already_bad = test_and_set_bit(blk_pos, line->blk_bitmap);
+       if (already_bad)
+               pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n",
+                                                       line->id, blk_pos);
+
+       pblk_line_run_ws(pblk, NULL, ppa, pblk_line_mark_bb);
+}
+
+/*
+ * Common erase completion work: decrement the owning line's left_seblks
+ * counter and, when the request carries an error, report the block through
+ * pblk_mark_bb().
+ *
+ * The address is copied into a GFP_ATOMIC allocation before being handed to
+ * pblk_mark_bb(); if that allocation fails the bad block is silently not
+ * reported. The copy is not freed here -- presumably the deferred work owns
+ * it (TODO confirm).
+ */
+static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd)
+{
+       struct pblk_line *line;
+       struct ppa_addr *bb_ppa;
+
+       line = &pblk->lines[pblk_dev_ppa_to_line(rqd->ppa_addr)];
+       atomic_dec(&line->left_seblks);
+
+       if (!rqd->error)
+               return;
+
+       bb_ppa = kmalloc(sizeof(*bb_ppa), GFP_ATOMIC);
+       if (!bb_ppa)
+               return;
+
+       *bb_ppa = rqd->ppa_addr;
+       pblk_mark_bb(pblk, line, bb_ppa);
+}
+
+/*
+ * Erase completion callback. It assumes that only one block is erased at a
+ * time. Releases one erase_sem slot, runs the shared completion work and
+ * returns the request to the read request pool.
+ */
+static void pblk_end_io_erase(struct nvm_rq *rqd)
+{
+       struct pblk *pblk = rqd->private;
+
+       up(&pblk->erase_sem);
+       __pblk_end_io_erase(pblk, rqd);
+       /* rqd is still read by __pblk_end_io_erase(), so free it last */
+       mempool_free(rqd, pblk->r_rq_pool);
+}
+
+/*
+ * Mark sector @paddr of @line as invalid and, for closed lines, move the line
+ * to the GC list that matches its new valid sector count.
+ *
+ * The invalid bit and vsc are updated under line->lock. The list move is done
+ * afterwards under l_mg->gc_lock, re-checking the line state with line->lock
+ * held because the line lock is dropped in between.
+ */
+static void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line,
+                                 u64 paddr)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct list_head *move_list = NULL;
+
+       /* Lines being reclaimed (GC'ed) cannot be invalidated. Before the L2P
+        * table is modified with reclaimed sectors, a check is done to ensure
+        * that newer updates are not overwritten.
+        */
+       spin_lock(&line->lock);
+       if (line->state == PBLK_LINESTATE_GC ||
+                                       line->state == PBLK_LINESTATE_FREE) {
+               spin_unlock(&line->lock);
+               return;
+       }
+
+       /* A bit that is already set means this sector was invalidated before */
+       if (test_and_set_bit(paddr, line->invalid_bitmap)) {
+               WARN_ONCE(1, "pblk: double invalidate\n");
+               spin_unlock(&line->lock);
+               return;
+       }
+       line->vsc--;
+
+       /* Only closed lines are re-sorted; move_list stays NULL when the GC
+        * group does not change.
+        */
+       if (line->state == PBLK_LINESTATE_CLOSED)
+               move_list = pblk_line_gc_list(pblk, line);
+       spin_unlock(&line->lock);
+
+       if (move_list) {
+               spin_lock(&l_mg->gc_lock);
+               spin_lock(&line->lock);
+               /* Prevent moving a line that has just been chosen for GC */
+               if (line->state == PBLK_LINESTATE_GC ||
+                                       line->state == PBLK_LINESTATE_FREE) {
+                       spin_unlock(&line->lock);
+                       spin_unlock(&l_mg->gc_lock);
+                       return;
+               }
+               spin_unlock(&line->lock);
+
+               list_move_tail(&line->list, move_list);
+               spin_unlock(&l_mg->gc_lock);
+       }
+}
+
+/*
+ * Invalidate the sector addressed by @ppa: resolve the line it belongs to and
+ * its in-line address, then defer to __pblk_map_invalidate().
+ */
+void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa)
+{
+       struct pblk_line *line;
+       u64 line_addr;
+
+#ifdef CONFIG_NVM_DEBUG
+       /* Callers must ensure that the ppa points to a device address */
+       BUG_ON(pblk_addr_in_cache(ppa));
+       BUG_ON(pblk_ppa_empty(ppa));
+#endif
+
+       line = &pblk->lines[pblk_tgt_ppa_to_line(ppa)];
+       line_addr = pblk_dev_ppa_to_line_addr(pblk, ppa);
+
+       __pblk_map_invalidate(pblk, line, line_addr);
+}
+
+/*
+ * Invalidate sector @paddr of @line and account for it in left_ssecs. When
+ * that counter reaches zero, pblk_line_close_ws is passed to
+ * pblk_line_run_ws() for the line.
+ *
+ * The counter update sits between pblk_rb_sync_init() and pblk_rb_sync_end();
+ * presumably that pair serialises it against the write buffer sync path
+ * (TODO confirm).
+ */
+void pblk_map_pad_invalidate(struct pblk *pblk, struct pblk_line *line,
+                            u64 paddr)
+{
+       __pblk_map_invalidate(pblk, line, paddr);
+
+       pblk_rb_sync_init(&pblk->rwb, NULL);
+       line->left_ssecs--;
+       if (!line->left_ssecs)
+               pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws);
+       pblk_rb_sync_end(&pblk->rwb, NULL);
+}
+
+/*
+ * Drop the L2P mappings of @nr_secs logical sectors starting at @slba.
+ *
+ * Under trans_lock, every mapping that points to the device (neither cached
+ * nor empty) is invalidated through pblk_map_invalidate(), and every entry in
+ * the range is then set to the empty address.
+ */
+static void pblk_invalidate_range(struct pblk *pblk, sector_t slba,
+                                 unsigned int nr_secs)
+{
+       sector_t lba = slba;
+
+       spin_lock(&pblk->trans_lock);
+       while (lba < slba + nr_secs) {
+               struct ppa_addr cur = pblk_trans_map_get(pblk, lba);
+
+               if (!pblk_addr_in_cache(cur) && !pblk_ppa_empty(cur))
+                       pblk_map_invalidate(pblk, cur);
+
+               pblk_ppa_set_empty(&cur);
+               pblk_trans_map_set(pblk, lba, cur);
+
+               lba++;
+       }
+       spin_unlock(&pblk->trans_lock);
+}
+
+/*
+ * Get a zeroed request from the write pool (@rw == WRITE) or from the read
+ * pool (any other value). The allocation uses GFP_KERNEL and the result is
+ * cleared over the pool-specific request size.
+ */
+struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw)
+{
+       int is_write = (rw == WRITE);
+       mempool_t *pool = is_write ? pblk->w_rq_pool : pblk->r_rq_pool;
+       int rq_size = is_write ? pblk_w_rq_size : pblk_r_rq_size;
+       struct nvm_rq *rqd;
+
+       rqd = mempool_alloc(pool, GFP_KERNEL);
+       memset(rqd, 0, rq_size);
+
+       return rqd;
+}
+
+/*
+ * Return @rqd to the write pool (@rw == WRITE) or to the read pool (any other
+ * value).
+ */
+void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw)
+{
+       mempool_t *pool = (rw == WRITE) ? pblk->w_rq_pool : pblk->r_rq_pool;
+
+       mempool_free(rqd, pool);
+}
+
+/*
+ * Give @nr_pages pages of @bio, starting at vector index @off, back to the
+ * page pool. The range is expected to end at the bio's last vector; a warning
+ * is emitted otherwise. The bio is advanced past the first @off exposed pages
+ * before the pages are released.
+ */
+void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
+                        int nr_pages)
+{
+       int n;
+
+       WARN_ON(off + nr_pages != bio->bi_vcnt);
+
+       bio_advance(bio, off * PBLK_EXPOSED_PAGE_SIZE);
+       for (n = 0; n < nr_pages; n++)
+               mempool_free(bio->bi_io_vec[off + n].bv_page,
+                                                       pblk->page_pool);
+}
+
+/*
+ * Append @nr_pages pages from the page pool to @bio, each covering
+ * PBLK_EXPOSED_PAGE_SIZE bytes.
+ *
+ * Returns 0 on success and -1 on failure. On failure every page this call
+ * managed to add is given back to the pool; pages that were already in the
+ * bio before the call are left untouched.
+ */
+int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
+                      int nr_pages)
+{
+       struct request_queue *q = pblk->dev->q;
+       struct page *page;
+       int i, ret;
+
+       for (i = 0; i < nr_pages; i++) {
+               page = mempool_alloc(pblk->page_pool, flags);
+               if (!page)
+                       goto err;
+
+               ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0);
+               if (ret != PBLK_EXPOSED_PAGE_SIZE) {
+                       pr_err("pblk: could not add page to bio\n");
+                       /* This page never made it into the bio */
+                       mempool_free(page, pblk->page_pool);
+                       goto err;
+               }
+       }
+
+       return 0;
+err:
+       /* Iterations 0..i-1 succeeded, so exactly i pages were added and they
+        * are the last i vectors of the bio. Freeing only i - 1 of them would
+        * leak one page and trip the range check in pblk_bio_free_pages().
+        */
+       pblk_bio_free_pages(pblk, bio, bio->bi_vcnt - i, i);
+       return -1;
+}
+
+/*
+ * Wake the writer thread and re-arm the write timer to fire one second
+ * (1000 ms) from now.
+ */
+static void pblk_write_kick(struct pblk *pblk)
+{
+       wake_up_process(pblk->writer_ts);
+       mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(1000));
+}
+
+/*
+ * Write timer callback. @data carries the struct pblk pointer cast to
+ * unsigned long.
+ */
+void pblk_write_timer_fn(unsigned long data)
+{
+       struct pblk *pblk = (struct pblk *)data;
+
+       /* kick the write thread every tick to flush outstanding data */
+       pblk_write_kick(pblk);
+}
+
+/*
+ * Kick the writer only when the count reported by pblk_rb_read_count() has
+ * reached min_write_pgs.
+ */
+void pblk_write_should_kick(struct pblk *pblk)
+{
+       unsigned int secs_avail = pblk_rb_read_count(&pblk->rwb);
+
+       if (secs_avail >= pblk->min_write_pgs)
+               pblk_write_kick(pblk);
+}
+
+/*
+ * bio completion callback for synchronous waiters: bi_private must point to
+ * the struct completion the submitter is waiting on.
+ */
+void pblk_end_bio_sync(struct bio *bio)
+{
+       struct completion *waiting = bio->bi_private;
+
+       complete(waiting);
+}
+
+/*
+ * Request completion callback for synchronous waiters: rqd->private must
+ * point to the struct completion the submitter is waiting on.
+ */
+void pblk_end_io_sync(struct nvm_rq *rqd)
+{
+       struct completion *waiting = rqd->private;
+
+       complete(waiting);
+}
+
+/*
+ * Push an empty flush bio through the write cache and wait for it.
+ *
+ * The bio is a data-less write carrying the REQ_PREFLUSH flag. When
+ * pblk_write_to_cache() returns NVM_IO_OK the caller waits, for at most
+ * PBLK_COMMAND_TIMEOUT_MS, on the bio's completion; NVM_IO_DONE is accepted
+ * without waiting and any other result is logged. Failing to allocate the
+ * bio makes the function return without doing anything.
+ *
+ * NOTE(review): the completion lives on this stack frame. After a timeout the
+ * bio is dropped here while something may still complete it later -- confirm
+ * that cannot happen.
+ */
+void pblk_flush_writer(struct pblk *pblk)
+{
+       struct bio *bio;
+       int ret;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       bio = bio_alloc(GFP_KERNEL, 1);
+       if (!bio)
+               return;
+
+       bio->bi_iter.bi_sector = 0; /* internal bio */
+       /* The third argument takes request flags, not an opcode: REQ_OP_FLUSH
+        * would be OR-ed into the operation field and corrupt it. A flush is
+        * requested with REQ_PREFLUSH.
+        */
+       bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_PREFLUSH);
+       bio->bi_private = &wait;
+       bio->bi_end_io = pblk_end_bio_sync;
+
+       ret = pblk_write_to_cache(pblk, bio, 0);
+       if (ret == NVM_IO_OK) {
+               if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+                       pr_err("pblk: flush cache timed out\n");
+               }
+       } else if (ret != NVM_IO_DONE) {
+               pr_err("pblk: tear down bio failed\n");
+       }
+
+       /* bi_error is a signed errno value, so print it as such */
+       if (bio->bi_error)
+               pr_err("pblk: flush sync write failed (%d)\n", bio->bi_error);
+
+       bio_put(bio);
+}
+
+/*
+ * Classify @line by its valid sector count (vsc) and update its gc_group.
+ *
+ *   vsc == 0                    -> PBLK_LINEGC_FULL  / gc_full_list
+ *   vsc <  lm->mid_thrs         -> PBLK_LINEGC_HIGH  / gc_high_list
+ *   vsc <  lm->high_thrs        -> PBLK_LINEGC_MID   / gc_mid_list
+ *   vsc <  line->sec_in_line    -> PBLK_LINEGC_LOW   / gc_low_list
+ *   vsc == line->sec_in_line    -> PBLK_LINEGC_EMPTY / gc_empty_list
+ *   anything else               -> line marked corrupt / corrupt_list
+ *
+ * Returns the list the line should be moved to, or NULL when the line already
+ * belongs to the matching group. A corrupt line always yields corrupt_list.
+ */
+struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line)
+{
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct list_head *target;
+       int group;
+
+       if (!line->vsc) {
+               group = PBLK_LINEGC_FULL;
+               target = &l_mg->gc_full_list;
+       } else if (line->vsc < lm->mid_thrs) {
+               group = PBLK_LINEGC_HIGH;
+               target = &l_mg->gc_high_list;
+       } else if (line->vsc < lm->high_thrs) {
+               group = PBLK_LINEGC_MID;
+               target = &l_mg->gc_mid_list;
+       } else if (line->vsc < line->sec_in_line) {
+               group = PBLK_LINEGC_LOW;
+               target = &l_mg->gc_low_list;
+       } else if (line->vsc == line->sec_in_line) {
+               group = PBLK_LINEGC_EMPTY;
+               target = &l_mg->gc_empty_list;
+       } else {
+               /* vsc larger than the line itself: the count is corrupt */
+               line->state = PBLK_LINESTATE_CORRUPT;
+               line->gc_group = PBLK_LINEGC_NONE;
+               pr_err("pblk: corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n",
+                                               line->id, line->vsc,
+                                               line->sec_in_line,
+                                               lm->high_thrs, lm->mid_thrs);
+               return &l_mg->corrupt_list;
+       }
+
+       /* Nothing to move when the line is already in the right group */
+       if (line->gc_group == group)
+               return NULL;
+
+       line->gc_group = group;
+       return target;
+}
+
+/*
+ * Handle a discard bio by invalidating the logical range it describes
+ * (start lba and sector count are taken from the bio).
+ */
+void pblk_discard(struct pblk *pblk, struct bio *bio)
+{
+       sector_t slba = pblk_get_lba(bio);
+       sector_t nr_secs = pblk_get_secs(bio);
+
+       /* nr_secs is narrowed to the unsigned int parameter of the callee */
+       pblk_invalidate_range(pblk, slba, nr_secs);
+}
+
+/*
+ * Look up the L2P entry of @lba under trans_lock and return it by value.
+ */
+struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba)
+{
+       struct ppa_addr ppa;
+
+       spin_lock(&pblk->trans_lock);
+       ppa = pblk_trans_map_get(pblk, lba);
+       spin_unlock(&pblk->trans_lock);
+
+       return ppa;
+}
+
+/*
+ * Count a failed write. With CONFIG_NVM_DEBUG the failed request is also
+ * printed.
+ */
+void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd)
+{
+       atomic_long_inc(&pblk->write_failed);
+#ifdef CONFIG_NVM_DEBUG
+       pblk_print_failed_rqd(pblk, rqd, rqd->error);
+#endif
+}
+
+void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd)
+{
+       /* Empty page read is not necessarily an error (e.g., L2P recovery) */
+       if (rqd->error == NVM_RSP_ERR_EMPTYPAGE) {
+               atomic_long_inc(&pblk->read_empty);
+               return;
+       }
+
+       switch (rqd->error) {
+       case NVM_RSP_WARN_HIGHECC:
+               atomic_long_inc(&pblk->read_high_ecc);
+               break;
+       case NVM_RSP_ERR_FAILECC:
+       case NVM_RSP_ERR_FAILCRC:
+               atomic_long_inc(&pblk->read_failed);
+               break;
+       default:
+               pr_err("pblk: unknown read error:%d\n", rqd->error);
+       }
+#ifdef CONFIG_NVM_DEBUG
+       pblk_print_failed_rqd(pblk, rqd, rqd->error);
+#endif
+}
+
+/*
+ * Submit @rqd to the device through nvm_submit_io() and return its result.
+ *
+ * With CONFIG_NVM_DEBUG two sanity checks run first, each returning -EINVAL
+ * (after a warning) on failure: the addresses must pass
+ * pblk_boundary_ppa_checks(), and for NVM_OP_PWRITE every address must belong
+ * to a line in PBLK_LINESTATE_OPEN.
+ */
+int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+
+#ifdef CONFIG_NVM_DEBUG
+       struct ppa_addr *ppa_list;
+
+       /* Single-address requests keep the address inline in the request */
+       ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;
+       if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) {
+               WARN_ON(1);
+               return -EINVAL;
+       }
+
+       if (rqd->opcode == NVM_OP_PWRITE) {
+               struct pblk_line *line;
+               struct ppa_addr ppa;
+               int i;
+
+               for (i = 0; i < rqd->nr_ppas; i++) {
+                       ppa = ppa_list[i];
+                       line = &pblk->lines[pblk_dev_ppa_to_line(ppa)];
+
+                       /* The line state is read under the line lock */
+                       spin_lock(&line->lock);
+                       if (line->state != PBLK_LINESTATE_OPEN) {
+                               pr_err("pblk: bad ppa: line:%d,state:%d\n",
+                                                       line->id, line->state);
+                               WARN_ON(1);
+                               spin_unlock(&line->lock);
+                               return -EINVAL;
+                       }
+                       spin_unlock(&line->lock);
+               }
+       }
+#endif
+       return nvm_submit_io(dev, rqd);
+}
+
+/*
+ * Build a bio that covers the buffer at @data.
+ *
+ * When emeta is kmalloc-backed (PBLK_KMALLOC_META) the buffer is mapped in one
+ * go with bio_map_kern() over @len bytes. Otherwise a bio with room for
+ * @nr_secs vectors is allocated and filled one PAGE_SIZE page at a time, each
+ * page being resolved with vmalloc_to_page().
+ *
+ * Returns the bio, or an ERR_PTR() on failure (-ENOMEM for every failure
+ * detected in this function).
+ */
+struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
+                             unsigned int nr_secs, unsigned int len,
+                             gfp_t gfp_mask)
+{
+       struct request_queue *q = pblk->dev->q;
+       void *cursor = data;
+       struct bio *bio;
+       unsigned int sec;
+
+       if (pblk->l_mg.emeta_alloc_type == PBLK_KMALLOC_META)
+               return bio_map_kern(q, cursor, len, gfp_mask);
+
+       bio = bio_kmalloc(gfp_mask, nr_secs);
+       if (!bio)
+               return ERR_PTR(-ENOMEM);
+
+       for (sec = 0; sec < nr_secs; sec++, cursor += PAGE_SIZE) {
+               struct page *page = vmalloc_to_page(cursor);
+
+               if (!page) {
+                       pr_err("pblk: could not map vmalloc bio\n");
+                       bio_put(bio);
+                       return ERR_PTR(-ENOMEM);
+               }
+
+               if (bio_add_pc_page(q, bio, page, PAGE_SIZE, 0) != PAGE_SIZE) {
+                       pr_err("pblk: could not add page to bio\n");
+                       bio_put(bio);
+                       return ERR_PTR(-ENOMEM);
+               }
+       }
+
+       return bio;
+}
+
+/*
+ * Decide how many sectors to send in the next request.
+ *
+ *   - at least max_write_pgs available: send max_write_pgs;
+ *   - at least min_write_pgs available: send the largest multiple of
+ *     min_write_pgs that fits in @secs_avail;
+ *   - fewer than that but @secs_to_flush is non-zero: send min_write_pgs;
+ *   - otherwise: send nothing (0).
+ */
+int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
+                  unsigned long secs_to_flush)
+{
+       int max = pblk->max_write_pgs;
+       int min = pblk->min_write_pgs;
+
+       if (secs_avail >= max)
+               return max;
+
+       if (secs_avail >= min)
+               return min * (secs_avail / min);
+
+       if (secs_to_flush)
+               return min;
+
+       return 0;
+}
+
+/*
+ * Claim @nr_secs consecutive sectors in @line's map_bitmap and return the
+ * in-line address of the first one. line->cur_sec is first moved to the next
+ * free bit and ends up just past the claimed range.
+ *
+ * Callers in this file hold line->lock around the call.
+ */
+static u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line,
+                            int nr_secs)
+{
+       u64 addr;
+       int i;
+
+       /* logic error: ppa out-of-bounds. Prevent generating bad address */
+       if (line->cur_sec + nr_secs > pblk->lm.sec_per_line) {
+               WARN(1, "pblk: page allocation out of bounds\n");
+               nr_secs = pblk->lm.sec_per_line - line->cur_sec;
+       }
+
+       /* Skip over sectors that are already mapped */
+       line->cur_sec = addr = find_next_zero_bit(line->map_bitmap,
+                                       pblk->lm.sec_per_line, line->cur_sec);
+       /* Each claimed bit is expected to be clear; warn if it was not */
+       for (i = 0; i < nr_secs; i++, line->cur_sec++)
+               WARN_ON(test_and_set_bit(line->cur_sec, line->map_bitmap));
+
+       return addr;
+}
+
+/*
+ * Locked wrapper around __pblk_alloc_page() that also subtracts @nr_secs from
+ * line->left_msecs, warning if that counter drops below zero. Returns the
+ * in-line address of the first allocated sector.
+ */
+u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs)
+{
+       u64 addr;
+
+       /* Lock needed in case a write fails and a recovery needs to remap
+        * failed write buffer entries
+        */
+       spin_lock(&line->lock);
+       addr = __pblk_alloc_page(pblk, line, nr_secs);
+       line->left_msecs -= nr_secs;
+       WARN(line->left_msecs < 0, "pblk: page allocation out of bounds\n");
+       spin_unlock(&line->lock);
+
+       return addr;
+}
+
+/*
+ * Read or write the end-of-line metadata (emeta) of @line synchronously.
+ *
+ * Submit emeta to one LUN in the raid line at a time to avoid a deadlock when
+ * taking the per LUN semaphore.
+ *
+ * The line->emeta buffer is transferred in chunks whose size comes from
+ * pblk_calc_secs(). For writes the target sectors are allocated from the line
+ * as the request is built; for reads the addresses start at @paddr and skip
+ * blocks flagged in blk_bitmap. @dir must be READ or WRITE, otherwise -EINVAL
+ * is returned.
+ *
+ * Returns 0 once all chunks have been submitted, or a negative error from
+ * allocation, bio mapping, address validation (-EINTR) or submission. A
+ * per-request device error (rqd.error) is logged but does not change the
+ * return value.
+ */
+static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line,
+                                    u64 paddr, int dir)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct bio *bio;
+       struct nvm_rq rqd;
+       struct ppa_addr *ppa_list;
+       dma_addr_t dma_ppa_list;
+       void *emeta = line->emeta;
+       int min = pblk->min_write_pgs;
+       int left_ppas = lm->emeta_sec;
+       int id = line->id;
+       int rq_ppas, rq_len;
+       int cmd_op, bio_op;
+       int flags;
+       int i, j;
+       int ret;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       if (dir == WRITE) {
+               bio_op = REQ_OP_WRITE;
+               cmd_op = NVM_OP_PWRITE;
+               flags = pblk_set_progr_mode(pblk, WRITE);
+       } else if (dir == READ) {
+               bio_op = REQ_OP_READ;
+               cmd_op = NVM_OP_PREAD;
+               flags = pblk_set_read_mode(pblk);
+       } else
+               return -EINVAL;
+
+       /* One address list is allocated up front and reused by every chunk */
+       ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_ppa_list);
+       if (!ppa_list)
+               return -ENOMEM;
+
+next_rq:
+       memset(&rqd, 0, sizeof(struct nvm_rq));
+
+       /* NOTE(review): with secs_to_flush == 0, pblk_calc_secs() returns 0
+        * when left_ppas < min_write_pgs; this seems to rely on emeta_sec
+        * being a multiple of min_write_pgs -- confirm.
+        */
+       rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+       rq_len = rq_ppas * geo->sec_size;
+
+       bio = pblk_bio_map_addr(pblk, emeta, rq_ppas, rq_len, GFP_KERNEL);
+       if (IS_ERR(bio)) {
+               ret = PTR_ERR(bio);
+               goto free_rqd_dma;
+       }
+
+       bio->bi_iter.bi_sector = 0; /* internal bio */
+       bio_set_op_attrs(bio, bio_op, 0);
+
+       rqd.bio = bio;
+       rqd.opcode = cmd_op;
+       rqd.flags = flags;
+       rqd.nr_ppas = rq_ppas;
+       rqd.ppa_list = ppa_list;
+       rqd.dma_ppa_list = dma_ppa_list;
+       rqd.end_io = pblk_end_io_sync;
+       rqd.private = &wait;
+
+       if (dir == WRITE) {
+               /* Allocate the target sectors min_write_pgs at a time */
+               for (i = 0; i < rqd.nr_ppas; ) {
+                       spin_lock(&line->lock);
+                       paddr = __pblk_alloc_page(pblk, line, min);
+                       spin_unlock(&line->lock);
+                       for (j = 0; j < min; j++, i++, paddr++)
+                               rqd.ppa_list[i] =
+                                       addr_to_gen_ppa(pblk, paddr, id);
+               }
+       } else {
+               for (i = 0; i < rqd.nr_ppas; ) {
+                       struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id);
+                       int pos = pblk_dev_ppa_to_pos(geo, ppa);
+
+                       /* Step over blocks flagged in blk_bitmap, moving
+                        * min_write_pgs sectors per step
+                        */
+                       while (test_bit(pos, line->blk_bitmap)) {
+                               paddr += min;
+                               if (pblk_boundary_paddr_checks(pblk, paddr)) {
+                                       pr_err("pblk: corrupt emeta line:%d\n",
+                                                               line->id);
+                                       bio_put(bio);
+                                       ret = -EINTR;
+                                       goto free_rqd_dma;
+                               }
+
+                               ppa = addr_to_gen_ppa(pblk, paddr, id);
+                               pos = pblk_dev_ppa_to_pos(geo, ppa);
+                       }
+
+                       /* The end of the group being read is checked too */
+                       if (pblk_boundary_paddr_checks(pblk, paddr + min)) {
+                               pr_err("pblk: corrupt emeta line:%d\n",
+                                                               line->id);
+                               bio_put(bio);
+                               ret = -EINTR;
+                               goto free_rqd_dma;
+                       }
+
+                       for (j = 0; j < min; j++, i++, paddr++)
+                               rqd.ppa_list[i] =
+                                       addr_to_gen_ppa(pblk, paddr, line->id);
+               }
+       }
+
+       ret = pblk_submit_io(pblk, &rqd);
+       if (ret) {
+               pr_err("pblk: emeta I/O submission failed: %d\n", ret);
+               bio_put(bio);
+               goto free_rqd_dma;
+       }
+
+       /* NOTE(review): a timeout is only logged; execution continues and the
+        * on-stack request and completion are reused -- confirm this is safe
+        * if the I/O is still in flight.
+        */
+       if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+               pr_err("pblk: emeta I/O timed out\n");
+       }
+       /* The same completion is waited on again by the next chunk */
+       reinit_completion(&wait);
+
+       bio_put(bio);
+
+       if (rqd.error) {
+               if (dir == WRITE)
+                       pblk_log_write_err(pblk, &rqd);
+               else
+                       pblk_log_read_err(pblk, &rqd);
+       }
+
+       /* Advance through the emeta buffer and loop until it is all sent */
+       emeta += rq_len;
+       left_ppas -= rq_ppas;
+       if (left_ppas)
+               goto next_rq;
+free_rqd_dma:
+       nvm_dev_dma_free(dev->parent, ppa_list, dma_ppa_list);
+       return ret;
+}
+
+/*
+ * Return the in-line address where the start-of-line metadata (smeta) lives:
+ * the index of the first block not flagged in blk_bitmap multiplied by
+ * geo->sec_per_pl.
+ *
+ * When every block of the line is flagged, -1 is returned; since the return
+ * type is u64 the caller sees the all-ones value.
+ */
+u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line_meta *lm = &pblk->lm;
+       int bit;
+
+       /* This usually only happens on bad lines */
+       bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
+       if (bit >= lm->blk_per_line)
+               return -1;
+
+       return bit * geo->sec_per_pl;
+}
+
+/*
+ * Read or write the start-of-line metadata (smeta) of @line synchronously.
+ *
+ * A single request covering lm->smeta_sec sectors starting at in-line address
+ * @paddr is built over the line->smeta buffer. On writes, the emeta lba list
+ * entries of those sectors are set to ADDR_EMPTY. @dir must be READ or WRITE,
+ * otherwise -EINVAL is returned.
+ *
+ * Returns 0 when the request was submitted, or a negative error from
+ * allocation, bio mapping or submission. A device error (rqd.error) is logged
+ * but does not change the return value.
+ */
+static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
+                                    u64 paddr, int dir)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct bio *bio;
+       struct nvm_rq rqd;
+       __le64 *lba_list = NULL;
+       int i, ret;
+       int cmd_op, bio_op;
+       int flags;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       if (dir == WRITE) {
+               bio_op = REQ_OP_WRITE;
+               cmd_op = NVM_OP_PWRITE;
+               flags = pblk_set_progr_mode(pblk, WRITE);
+               lba_list = pblk_line_emeta_to_lbas(line->emeta);
+       } else if (dir == READ) {
+               bio_op = REQ_OP_READ;
+               cmd_op = NVM_OP_PREAD;
+               flags = pblk_set_read_mode(pblk);
+       } else
+               return -EINVAL;
+
+       memset(&rqd, 0, sizeof(struct nvm_rq));
+
+       rqd.ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+                                                       &rqd.dma_ppa_list);
+       if (!rqd.ppa_list)
+               return -ENOMEM;
+
+       bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL);
+       if (IS_ERR(bio)) {
+               ret = PTR_ERR(bio);
+               goto free_ppa_list;
+       }
+
+       bio->bi_iter.bi_sector = 0; /* internal bio */
+       bio_set_op_attrs(bio, bio_op, 0);
+
+       rqd.bio = bio;
+       rqd.opcode = cmd_op;
+       rqd.flags = flags;
+       rqd.nr_ppas = lm->smeta_sec;
+       rqd.end_io = pblk_end_io_sync;
+       rqd.private = &wait;
+
+       /* lba_list is only set (and only touched) on the write path */
+       for (i = 0; i < lm->smeta_sec; i++, paddr++) {
+               rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
+               if (dir == WRITE)
+                       lba_list[paddr] = cpu_to_le64(ADDR_EMPTY);
+       }
+
+       /*
+        * This I/O is sent by the write thread when a line is replaced. Since
+        * the write thread is the only one sending write and erase commands,
+        * there is no need to take the LUN semaphore.
+        */
+       ret = pblk_submit_io(pblk, &rqd);
+       if (ret) {
+               pr_err("pblk: smeta I/O submission failed: %d\n", ret);
+               bio_put(bio);
+               goto free_ppa_list;
+       }
+
+       /* NOTE(review): a timeout is only logged and the function carries on.
+        * The bio is not put on this path; presumably it is released when the
+        * bio completes -- confirm.
+        */
+       if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+               pr_err("pblk: smeta I/O timed out\n");
+       }
+
+       if (rqd.error) {
+               if (dir == WRITE)
+                       pblk_log_write_err(pblk, &rqd);
+               else
+                       pblk_log_read_err(pblk, &rqd);
+       }
+
+free_ppa_list:
+       nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list);
+
+       return ret;
+}
+
+/*
+ * Read the smeta of @line from the address given by pblk_line_smeta_start().
+ *
+ * NOTE(review): the start address is not checked here, although
+ * pblk_line_smeta_start() returns an all-ones value for a line without good
+ * blocks -- confirm callers rule that case out.
+ */
+int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line)
+{
+       u64 bpaddr = pblk_line_smeta_start(pblk, line);
+
+       return pblk_line_submit_smeta_io(pblk, line, bpaddr, READ);
+}
+
+/*
+ * Read the emeta of @line, starting at the in-line address stored in
+ * line->emeta_ssec.
+ */
+int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line)
+{
+       return pblk_line_submit_emeta_io(pblk, line, line->emeta_ssec, READ);
+}
+
+/* Fill @rqd as a single-address erase command for @ppa, with no bio attached */
+static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd,
+                           struct ppa_addr ppa)
+{
+       rqd->bio = NULL;
+       rqd->nr_ppas = 1;
+       rqd->ppa_addr = ppa;
+       rqd->opcode = NVM_OP_ERASE;
+       rqd->flags = pblk_set_progr_mode(pblk, ERASE);
+}
+
+/*
+ * Erase the block addressed by @ppa and wait for the command to finish.
+ *
+ * A submission error is stored in rqd.error and the request is still passed
+ * to __pblk_end_io_erase(), just as a request that went through the wait
+ * would be. The function returns 0 on every path.
+ */
+static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
+{
+       struct nvm_rq rqd;
+       int ret;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       memset(&rqd, 0, sizeof(struct nvm_rq));
+
+       pblk_setup_e_rq(pblk, &rqd, ppa);
+
+       /* Synchronous request: the end_io callback is given the on-stack
+        * completion through ->private (presumably it completes it).
+        */
+       rqd.end_io = pblk_end_io_sync;
+       rqd.private = &wait;
+
+       /* The write thread schedules erases so that it minimizes disturbances
+        * with writes. Thus, there is no need to take the LUN semaphore.
+        */
+       ret = pblk_submit_io(pblk, &rqd);
+       if (ret) {
+               struct nvm_tgt_dev *dev = pblk->dev;
+               struct nvm_geo *geo = &dev->geo;
+
+               pr_err("pblk: could not sync erase line:%d,blk:%d\n",
+                                       pblk_dev_ppa_to_line(ppa),
+                                       pblk_dev_ppa_to_pos(geo, ppa));
+
+               /* Let the end_io helper below see the submission failure */
+               rqd.error = ret;
+               goto out;
+       }
+
+       /* A timeout is only logged; the request is still processed below.
+        * NOTE(review): rqd and wait live on this stack frame - confirm that
+        * the command cannot complete after this function has returned.
+        */
+       if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+               pr_err("pblk: sync erase timed out\n");
+       }
+
+out:
+       /* Replace the completion in ->private with the pblk instance before
+        * calling the erase end_io helper directly.
+        */
+       rqd.private = pblk;
+       __pblk_end_io_erase(pblk, &rqd);
+
+       return 0;
+}
+
+/*
+ * Synchronously erase every block of @line whose bit is still clear in
+ * line->erase_bitmap, one block at a time. Returns 0 when all blocks have
+ * been processed, -ENOMEM if a synchronous erase reports an error.
+ */
+int pblk_line_erase(struct pblk *pblk, struct pblk_line *line)
+{
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct ppa_addr ppa;
+       int bit;
+
+       /* Only blocks not yet marked in the erase bitmap are visited */
+       for (bit = find_next_zero_bit(line->erase_bitmap, lm->blk_per_line, 0);
+            bit < lm->blk_per_line;
+            bit = find_next_zero_bit(line->erase_bitmap, lm->blk_per_line,
+                                     bit + 1)) {
+               /* Channel and LUN come from the LUN table, block from line */
+               ppa = pblk->luns[bit].bppa;
+               ppa.g.blk = line->id;
+
+               /* Account for the block before issuing the erase. If the
+                * erase fails, the block is bad and should be marked.
+                */
+               line->left_eblks--;
+               WARN_ON(test_and_set_bit(bit, line->erase_bitmap));
+
+               if (!pblk_blk_erase_sync(pblk, ppa))
+                       continue;
+
+               pr_err("pblk: failed to erase line %d\n", line->id);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+/*
+ * Fill in the start (smeta) and end (emeta) metadata of a line.
+ * For now lines are always assumed full lines. Thus, smeta former and current
+ * lun bitmaps are omitted.
+ *
+ * @cur is the line being replaced, or NULL when there is none; when set, the
+ * two lines are linked through smeta->prev_id and cur->emeta->next_id.
+ *
+ * Returns 1 when the metadata has been set up, 0 when the line has fewer than
+ * lm->min_blk_line usable blocks and has been moved to the bad list instead.
+ */
+static int pblk_line_set_metadata(struct pblk *pblk, struct pblk_line *line,
+                                 struct pblk_line *cur)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct line_smeta *smeta = line->smeta;
+       struct line_emeta *emeta = line->emeta;
+       int nr_blk_line;
+
+       /* After erasing the line, new bad blocks might appear and we risk
+        * having an invalid line
+        */
+       nr_blk_line = lm->blk_per_line -
+                       bitmap_weight(line->blk_bitmap, lm->blk_per_line);
+       if (nr_blk_line < lm->min_blk_line) {
+               /* free_lock is taken outside line->lock and held across the
+                * bad_list insertion
+                */
+               spin_lock(&l_mg->free_lock);
+               spin_lock(&line->lock);
+               line->state = PBLK_LINESTATE_BAD;
+               spin_unlock(&line->lock);
+
+               list_add_tail(&line->list, &l_mg->bad_list);
+               spin_unlock(&l_mg->free_lock);
+
+               pr_debug("pblk: line %d is bad\n", line->id);
+
+               return 0;
+       }
+
+       /* Run-time metadata: the LUN bitmap is stored right after the smeta
+        * structure, inside the same buffer
+        */
+       line->lun_bitmap = ((void *)(smeta)) + sizeof(struct line_smeta);
+
+       /* Mark LUNs allocated in this line (all for now).
+        * NOTE(review): lun_bitmap_len is used as a bit count here and as a
+        * byte count by the memcpy() below - confirm which unit it holds.
+        */
+       bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len);
+
+       smeta->header.identifier = cpu_to_le32(PBLK_MAGIC);
+       memcpy(smeta->header.uuid, pblk->instance_uuid, 16);
+       smeta->header.id = cpu_to_le32(line->id);
+       smeta->header.type = cpu_to_le16(line->type);
+       smeta->header.version = cpu_to_le16(1);
+
+       /* Start metadata */
+       smeta->seq_nr = cpu_to_le64(line->seq_nr);
+       smeta->window_wr_lun = cpu_to_le32(geo->nr_luns);
+
+       /* Fill metadata among lines.
+        * NOTE(review): cur->emeta is dereferenced without a NULL check -
+        * confirm that @cur has not been closed yet at this point.
+        */
+       if (cur) {
+               memcpy(line->lun_bitmap, cur->lun_bitmap, lm->lun_bitmap_len);
+               smeta->prev_id = cpu_to_le32(cur->id);
+               cur->emeta->next_id = cpu_to_le32(line->id);
+       } else {
+               smeta->prev_id = cpu_to_le32(PBLK_LINE_EMPTY);
+       }
+
+       /* All smeta must be set at this point: the CRCs are computed over the
+        * fields filled in above
+        */
+       smeta->header.crc = cpu_to_le32(pblk_calc_meta_header_crc(pblk, smeta));
+       smeta->crc = cpu_to_le32(pblk_calc_smeta_crc(pblk, smeta));
+
+       /* End metadata: starts from a copy of the smeta header; the emeta CRC
+        * is left at zero here
+        */
+       memcpy(&emeta->header, &smeta->header, sizeof(struct line_header));
+       emeta->seq_nr = cpu_to_le64(line->seq_nr);
+       emeta->nr_lbas = cpu_to_le64(line->sec_in_line);
+       emeta->nr_valid_lbas = cpu_to_le64(0);
+       emeta->next_id = cpu_to_le32(PBLK_LINE_EMPTY);
+       emeta->crc = cpu_to_le32(0);
+       emeta->prev_id = smeta->prev_id;
+
+       return 1;
+}
+
+/*
+ * Initialize the sector bitmaps and sector accounting of a line, taking its
+ * bad blocks and its smeta/emeta areas into account.
+ * For now lines are always assumed full lines. Thus, smeta former and current
+ * lun bitmaps are omitted.
+ *
+ * When @init is non-zero, smeta is also written out through
+ * pblk_line_submit_smeta_io(); if that fails, the smeta position is advanced
+ * by geo->sec_per_pl sectors and the write is retried.
+ *
+ * Returns 1 on success. Returns 0 after marking the line bad and adding it to
+ * l_mg->bad_list when the sector accounting does not match invalid_bitmap.
+ */
+static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
+                            int init)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       int nr_bb = 0;
+       u64 off;
+       int bit = -1;
+
+       line->sec_in_line = lm->sec_per_line;
+
+       /* Capture bad block information on line mapping bitmaps: for each bit
+        * set in blk_bitmap, the shifted bad-block template is OR-ed into
+        * map_bitmap and the block's sectors are removed from the line
+        */
+       while ((bit = find_next_bit(line->blk_bitmap, lm->blk_per_line,
+                                       bit + 1)) < lm->blk_per_line) {
+               off = bit * geo->sec_per_pl;
+               bitmap_shift_left(l_mg->bb_aux, l_mg->bb_template, off,
+                                                       lm->sec_per_line);
+               bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux,
+                                                       lm->sec_per_line);
+               line->sec_in_line -= geo->sec_per_blk;
+               /* Count bad blocks at or beyond lm->emeta_bb; they are
+                * compensated for when placing emeta below
+                */
+               if (bit >= lm->emeta_bb)
+                       nr_bb++;
+       }
+
+       /* Mark smeta metadata sectors as bad sectors; smeta starts at the
+        * first block whose bit is clear in blk_bitmap
+        */
+       bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
+       off = bit * geo->sec_per_pl;
+       /* NOTE(review): every pass through this label subtracts smeta_sec from
+        * sec_in_line again and the retry has no upper bound - confirm this
+        * is intended.
+        */
+retry_smeta:
+       bitmap_set(line->map_bitmap, off, lm->smeta_sec);
+       line->sec_in_line -= lm->smeta_sec;
+       line->smeta_ssec = off;
+       line->cur_sec = off + lm->smeta_sec;
+
+       if (init && pblk_line_submit_smeta_io(pblk, line, off, WRITE)) {
+               pr_debug("pblk: line smeta I/O failed. Retry\n");
+               off += geo->sec_per_pl;
+               goto retry_smeta;
+       }
+
+       /* invalid_bitmap starts out as a copy of the bad-block/smeta map */
+       bitmap_copy(line->invalid_bitmap, line->map_bitmap, lm->sec_per_line);
+
+       /* Mark emeta metadata sectors as bad sectors. We need to consider bad
+        * blocks to make sure that there are enough sectors to store emeta
+        */
+       /* NOTE(review): bit is not read again after this assignment */
+       bit = lm->sec_per_line;
+       off = lm->sec_per_line - lm->emeta_sec;
+       bitmap_set(line->invalid_bitmap, off, lm->emeta_sec);
+       /* Step backwards geo->sec_per_pl sectors at a time, reserving one more
+        * chunk for each bad block counted above whenever the chunk is not
+        * already marked invalid
+        */
+       while (nr_bb) {
+               off -= geo->sec_per_pl;
+               if (!test_bit(off, line->invalid_bitmap)) {
+                       bitmap_set(line->invalid_bitmap, off, geo->sec_per_pl);
+                       nr_bb--;
+               }
+       }
+
+       line->sec_in_line -= lm->emeta_sec;
+       line->emeta_ssec = off;
+       line->vsc = line->left_ssecs = line->left_msecs = line->sec_in_line;
+
+       /* Sanity check: the sectors removed from the line must match the
+        * number of bits set in invalid_bitmap
+        */
+       if (lm->sec_per_line - line->sec_in_line !=
+               bitmap_weight(line->invalid_bitmap, lm->sec_per_line)) {
+               spin_lock(&line->lock);
+               line->state = PBLK_LINESTATE_BAD;
+               spin_unlock(&line->lock);
+
+               list_add_tail(&line->list, &l_mg->bad_list);
+               pr_err("pblk: unexpected line %d is bad\n", line->id);
+
+               return 0;
+       }
+
+       return 1;
+}
+
+/*
+ * Prepare a free line for use: allocate its sector bitmaps, move it from
+ * PBLK_LINESTATE_FREE to PBLK_LINESTATE_OPEN and reset its erase accounting
+ * and reference count.
+ *
+ * Returns 0 on success, -ENOMEM if a bitmap cannot be allocated and -EINTR if
+ * the line is not in the free state. On failure both bitmaps are returned to
+ * the pool and the line's bitmap pointers are cleared, so that a later
+ * pblk_line_free() cannot free them a second time.
+ */
+static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
+{
+       struct pblk_line_meta *lm = &pblk->lm;
+       int ret;
+
+       line->map_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC);
+       if (!line->map_bitmap)
+               return -ENOMEM;
+       memset(line->map_bitmap, 0, lm->sec_bitmap_len);
+
+       /* invalid_bitmap is special since it is used when line is closed. No
+        * need to zero it; it will be initialized using bb info from
+        * map_bitmap
+        */
+       line->invalid_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC);
+       if (!line->invalid_bitmap) {
+               ret = -ENOMEM;
+               goto free_map_bitmap;
+       }
+
+       spin_lock(&line->lock);
+       if (line->state != PBLK_LINESTATE_FREE) {
+               spin_unlock(&line->lock);
+               WARN(1, "pblk: corrupted line state\n");
+               ret = -EINTR;
+               goto free_invalid_bitmap;
+       }
+       line->state = PBLK_LINESTATE_OPEN;
+       spin_unlock(&line->lock);
+
+       /* Bad blocks do not need to be erased */
+       bitmap_copy(line->erase_bitmap, line->blk_bitmap, lm->blk_per_line);
+       line->left_eblks = line->blk_in_line;
+       atomic_set(&line->left_seblks, line->left_eblks);
+
+       kref_init(&line->ref);
+
+       return 0;
+
+free_invalid_bitmap:
+       mempool_free(line->invalid_bitmap, pblk->line_meta_pool);
+       line->invalid_bitmap = NULL;
+free_map_bitmap:
+       mempool_free(line->map_bitmap, pblk->line_meta_pool);
+       line->map_bitmap = NULL;
+
+       return ret;
+}
+
+/*
+ * Take @line off its current list and set it up as the active data line for
+ * recovery: the line is prepared and its bitmaps are initialized without
+ * writing smeta.
+ *
+ * Returns 0 on success, the error from pblk_line_prepare() if the line cannot
+ * be prepared (the line is put back on the free list), or -EINTR if
+ * pblk_line_init_bb() rejects the line.
+ */
+int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       int ret;
+
+       spin_lock(&l_mg->free_lock);
+       l_mg->data_line = line;
+       list_del(&line->list);
+       spin_unlock(&l_mg->free_lock);
+
+       ret = pblk_line_prepare(pblk, line);
+       if (ret) {
+               /* free_list is only modified with free_lock held */
+               spin_lock(&l_mg->free_lock);
+               list_add(&line->list, &l_mg->free_list);
+               spin_unlock(&l_mg->free_lock);
+               return ret;
+       }
+
+       pblk_rl_free_lines_dec(&pblk->rl, line);
+
+       /* When pblk_line_init_bb() fails it has already marked the line bad
+        * and linked it on bad_list. Linking the same list node on free_list
+        * as well would corrupt both lists, so the line is left where it is.
+        */
+       if (!pblk_line_init_bb(pblk, line, 0))
+               return -EINTR;
+
+       return 0;
+}
+
+/*
+ * Release the map bitmap of a line used during recovery and detach its
+ * smeta/emeta buffers. The metadata buffers themselves are not freed here.
+ */
+void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line)
+{
+       line->emeta = NULL;
+       line->smeta = NULL;
+
+       mempool_free(line->map_bitmap, pblk->line_meta_pool);
+       line->map_bitmap = NULL;
+}
+
+/*
+ * Take the first usable line from the free list and prepare it for use.
+ *
+ * Lines whose blocks are all marked in blk_bitmap are moved to the bad list
+ * and skipped. Must be called with l_mg->free_lock held.
+ *
+ * Returns the prepared line, or NULL if the free list is (or becomes) empty
+ * or the line cannot be prepared; in the latter case the line is put back on
+ * the free list and the free-line count is restored.
+ */
+struct pblk_line *pblk_line_get(struct pblk *pblk)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line *line;
+       int bit;
+
+       lockdep_assert_held(&l_mg->free_lock);
+
+retry_get:
+       /* Return NULL explicitly: after a retry, line still points to the bad
+        * line that was just moved to bad_list and must not be handed out.
+        */
+       if (list_empty(&l_mg->free_list)) {
+               pr_err("pblk: no free lines\n");
+               return NULL;
+       }
+
+       line = list_first_entry(&l_mg->free_list, struct pblk_line, list);
+       list_del(&line->list);
+       l_mg->nr_free_lines--;
+
+       /* A line without a single good block cannot be used */
+       bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
+       if (unlikely(bit >= lm->blk_per_line)) {
+               spin_lock(&line->lock);
+               line->state = PBLK_LINESTATE_BAD;
+               spin_unlock(&line->lock);
+
+               list_add_tail(&line->list, &l_mg->bad_list);
+
+               pr_debug("pblk: line %d is bad\n", line->id);
+               goto retry_get;
+       }
+
+       if (pblk_line_prepare(pblk, line)) {
+               pr_err("pblk: failed to prepare line %d\n", line->id);
+               /* The line goes back on the free list, so it must be counted
+                * as free again.
+                */
+               list_add(&line->list, &l_mg->free_list);
+               l_mg->nr_free_lines++;
+               return NULL;
+       }
+
+       return line;
+}
+
+/*
+ * Replace @line, which turned out to be unusable, with a new line from the
+ * free list.
+ *
+ * The new line takes over the metadata buffers (smeta, emeta) and metadata
+ * slot of @line. It keeps the sector bitmaps that pblk_line_get() allocated
+ * for it; the bitmaps of @line are returned to the pool, since they may carry
+ * state from the failed setup. The new line is then erased and becomes the
+ * active data line.
+ *
+ * Returns the new line, or NULL if no line is available or the erase fails.
+ */
+static struct pblk_line *pblk_line_retry(struct pblk *pblk,
+                                        struct pblk_line *line)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_line *retry_line;
+
+       spin_lock(&l_mg->free_lock);
+       retry_line = pblk_line_get(pblk);
+       if (!retry_line) {
+               spin_unlock(&l_mg->free_lock);
+               return NULL;
+       }
+
+       retry_line->smeta = line->smeta;
+       retry_line->emeta = line->emeta;
+       retry_line->meta_line = line->meta_line;
+
+       /* retry_line already owns freshly prepared bitmaps; overwriting them
+        * with the old line's would leak them, so release the old ones.
+        */
+       if (line->map_bitmap)
+               mempool_free(line->map_bitmap, pblk->line_meta_pool);
+       if (line->invalid_bitmap)
+               mempool_free(line->invalid_bitmap, pblk->line_meta_pool);
+
+       line->map_bitmap = NULL;
+       line->invalid_bitmap = NULL;
+       line->smeta = NULL;
+       line->emeta = NULL;
+       spin_unlock(&l_mg->free_lock);
+
+       if (pblk_line_erase(pblk, retry_line))
+               return NULL;
+
+       pblk_rl_free_lines_dec(&pblk->rl, retry_line);
+
+       l_mg->data_line = retry_line;
+
+       return retry_line;
+}
+
+/*
+ * Set up the first data line and pre-allocate the next one.
+ *
+ * Under free_lock a line is taken with pblk_line_get(), given the next data
+ * sequence number and a metadata slot, and a second line is taken as
+ * l_mg->data_next. The first line is then erased and its metadata and bitmaps
+ * are initialized; if either initialization step rejects the line, another
+ * line is obtained through pblk_line_retry() and setup starts over.
+ *
+ * Returns the line on success, NULL if no line could be obtained or erased.
+ */
+struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_line *line;
+       int meta_line;
+       int is_next = 0;
+
+       spin_lock(&l_mg->free_lock);
+       line = pblk_line_get(pblk);
+       if (!line) {
+               spin_unlock(&l_mg->free_lock);
+               return NULL;
+       }
+
+       line->seq_nr = l_mg->d_seq_nr++;
+       line->type = PBLK_LINETYPE_DATA;
+       l_mg->data_line = line;
+
+       /* Reserve a metadata slot and attach its smeta/emeta buffers.
+        * NOTE(review): unlike pblk_line_replace_data(), the result is not
+        * checked against PBLK_DATA_LINES - presumably the bitmap is known to
+        * have a free slot here; confirm.
+        */
+       meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+       set_bit(meta_line, &l_mg->meta_bitmap);
+       line->smeta = l_mg->sline_meta[meta_line].meta;
+       line->emeta = l_mg->eline_meta[meta_line].meta;
+       line->meta_line = meta_line;
+
+       /* Allocate next line for preparation */
+       l_mg->data_next = pblk_line_get(pblk);
+       if (l_mg->data_next) {
+               l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
+               l_mg->data_next->type = PBLK_LINETYPE_DATA;
+               is_next = 1;
+       }
+       spin_unlock(&l_mg->free_lock);
+
+       /* Rate-limiter accounting is done outside free_lock */
+       pblk_rl_free_lines_dec(&pblk->rl, line);
+       if (is_next)
+               pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
+
+       if (pblk_line_erase(pblk, line))
+               return NULL;
+
+       /* Both setup steps return 0 when the line has been marked bad; in
+        * that case switch to a replacement line and start setup again
+        */
+retry_setup:
+       if (!pblk_line_set_metadata(pblk, line, NULL)) {
+               line = pblk_line_retry(pblk, line);
+               if (!line)
+                       return NULL;
+
+               goto retry_setup;
+       }
+
+       if (!pblk_line_init_bb(pblk, line, 1)) {
+               line = pblk_line_retry(pblk, line);
+               if (!line)
+                       return NULL;
+
+               goto retry_setup;
+       }
+
+       return line;
+}
+
+/*
+ * Make the pre-allocated next line (l_mg->data_next) the active data line.
+ *
+ * The function waits until the new line is fully erased (erasing it itself
+ * if blocks are still pending), pre-allocates another next line, reserves a
+ * metadata slot for the new line and initializes its metadata and bitmaps,
+ * linking it to the line being replaced. If the new line turns out to be
+ * bad, a replacement is obtained through pblk_line_retry() and setup is
+ * repeated.
+ *
+ * Returns the new active line, or NULL if there is no next line, the erase
+ * fails or no replacement line can be obtained.
+ */
+struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
+{
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_line *cur, *new;
+       unsigned int left_seblks;
+       int meta_line;
+       int is_next = 0;
+
+       cur = l_mg->data_line;
+       new = l_mg->data_next;
+       if (!new)
+               return NULL;
+       l_mg->data_line = new;
+
+retry_line:
+       left_seblks = atomic_read(&new->left_seblks);
+       if (left_seblks) {
+               /* If line is not fully erased, erase it */
+               if (new->left_eblks) {
+                       if (pblk_line_erase(pblk, new))
+                               return NULL;
+               } else {
+                       /* All erases issued; yield until they complete */
+                       io_schedule();
+               }
+               goto retry_line;
+       }
+
+       spin_lock(&l_mg->free_lock);
+       /* Allocate next line for preparation */
+       l_mg->data_next = pblk_line_get(pblk);
+       if (l_mg->data_next) {
+               l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
+               l_mg->data_next->type = PBLK_LINETYPE_DATA;
+               is_next = 1;
+       }
+
+retry_meta:
+       /* Wait for a free metadata slot, dropping the lock while yielding */
+       meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+       if (meta_line == PBLK_DATA_LINES) {
+               spin_unlock(&l_mg->free_lock);
+               io_schedule();
+               spin_lock(&l_mg->free_lock);
+               goto retry_meta;
+       }
+
+       set_bit(meta_line, &l_mg->meta_bitmap);
+       new->smeta = l_mg->sline_meta[meta_line].meta;
+       new->emeta = l_mg->eline_meta[meta_line].meta;
+       new->meta_line = meta_line;
+
+       memset(new->smeta, 0, lm->smeta_len);
+       memset(new->emeta, 0, lm->emeta_len);
+       spin_unlock(&l_mg->free_lock);
+
+       if (is_next)
+               pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
+
+retry_setup:
+       if (!pblk_line_set_metadata(pblk, new, cur)) {
+               /* Give up only when no replacement line is available; a
+                * successful retry must go through setup again.
+                */
+               new = pblk_line_retry(pblk, new);
+               if (!new)
+                       return NULL;
+
+               goto retry_setup;
+       }
+
+       if (!pblk_line_init_bb(pblk, new, 1)) {
+               new = pblk_line_retry(pblk, new);
+               if (!new)
+                       return NULL;
+
+               goto retry_setup;
+       }
+
+       return new;
+}
+
+/*
+ * Return the sector bitmaps of @line to the line metadata pool and clear the
+ * corresponding pointers. Bitmaps that are not allocated are skipped.
+ */
+void pblk_line_free(struct pblk *pblk, struct pblk_line *line)
+{
+       if (line->map_bitmap) {
+               mempool_free(line->map_bitmap, pblk->line_meta_pool);
+               line->map_bitmap = NULL;
+       }
+
+       if (line->invalid_bitmap) {
+               mempool_free(line->invalid_bitmap, pblk->line_meta_pool);
+               line->invalid_bitmap = NULL;
+       }
+}
+
+/*
+ * kref release callback for a line: the line goes from the GC state back to
+ * the free state, its bitmaps are released, and it is appended to the free
+ * list. The rate limiter is told about the new free line last.
+ */
+void pblk_line_put(struct kref *ref)
+{
+       struct pblk_line *line;
+       struct pblk_line_mgmt *mgmt;
+       struct pblk *pblk;
+
+       line = container_of(ref, struct pblk_line, ref);
+       pblk = line->pblk;
+       mgmt = &pblk->l_mg;
+
+       /* Reset line state and drop its bitmaps under the line lock */
+       spin_lock(&line->lock);
+       WARN_ON(line->state != PBLK_LINESTATE_GC);
+       line->state = PBLK_LINESTATE_FREE;
+       line->gc_group = PBLK_LINEGC_NONE;
+       pblk_line_free(pblk, line);
+       spin_unlock(&line->lock);
+
+       /* Hand the line back to the free list */
+       spin_lock(&mgmt->free_lock);
+       list_add_tail(&line->list, &mgmt->free_list);
+       mgmt->nr_free_lines++;
+       spin_unlock(&mgmt->free_lock);
+
+       pblk_rl_free_lines_inc(&pblk->rl, line);
+}
+
+/*
+ * Submit an erase for the block at @ppa without waiting for it; completion
+ * is handled by pblk_end_io_erase(). Returns 0 if the request was submitted,
+ * otherwise the submission error.
+ */
+int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
+{
+       struct nvm_rq *rqd;
+       int err;
+
+       rqd = mempool_alloc(pblk->r_rq_pool, GFP_KERNEL);
+       memset(rqd, 0, pblk_r_rq_size);
+
+       pblk_setup_e_rq(pblk, rqd, ppa);
+
+       rqd->private = pblk;
+       rqd->end_io = pblk_end_io_erase;
+
+       /* The write thread schedules erases so that it minimizes disturbances
+        * with writes. Thus, there is no need to take the LUN semaphore.
+        */
+       err = pblk_submit_io(pblk, rqd);
+       if (!err)
+               return 0;
+
+       pr_err("pblk: could not async erase line:%d,blk:%d\n",
+                               pblk_dev_ppa_to_line(ppa),
+                               pblk_dev_ppa_to_pos(&pblk->dev->geo, ppa));
+
+       return err;
+}
+
+/* Return the current data line recorded in the line management structure */
+struct pblk_line *pblk_line_get_data(struct pblk *pblk)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+
+       return l_mg->data_line;
+}
+
+/* Return the pre-allocated next data line, which may be NULL */
+struct pblk_line *pblk_line_get_data_next(struct pblk *pblk)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+
+       return l_mg->data_next;
+}
+
+/* Return 1 when line->left_msecs has reached zero, 0 otherwise */
+int pblk_line_is_full(struct pblk_line *line)
+{
+       return !line->left_msecs;
+}
+
+/*
+ * Close a line: compute and store the emeta CRC, write emeta starting at
+ * line->cur_sec, release the line's metadata slot, move the line to the
+ * closed state and queue it on the list chosen by pblk_line_gc_list().
+ *
+ * An emeta write failure is only logged; the line is closed regardless.
+ */
+void pblk_line_close(struct pblk *pblk, struct pblk_line *line)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct list_head *move_list;
+
+       line->emeta->crc = cpu_to_le32(pblk_calc_emeta_crc(pblk, line->emeta));
+
+       if (pblk_line_submit_emeta_io(pblk, line, line->cur_sec, WRITE))
+               pr_err("pblk: line %d close I/O failed\n", line->id);
+
+       /* Every sector counted in sec_in_line is expected to be mapped */
+       WARN(!bitmap_full(line->map_bitmap, line->sec_in_line),
+                               "pblk: corrupt closed line %d\n", line->id);
+
+       /* Give the metadata slot back; the bit must have been set */
+       spin_lock(&l_mg->free_lock);
+       WARN_ON(!test_and_clear_bit(line->meta_line, &l_mg->meta_bitmap));
+       spin_unlock(&l_mg->free_lock);
+
+       /* gc_lock is taken outside line->lock for the state change and the
+        * list insertion
+        */
+       spin_lock(&l_mg->gc_lock);
+       spin_lock(&line->lock);
+       WARN_ON(line->state != PBLK_LINESTATE_OPEN);
+       line->state = PBLK_LINESTATE_CLOSED;
+       move_list = pblk_line_gc_list(pblk, line);
+
+       list_add_tail(&line->list, move_list);
+
+       /* Only map_bitmap is released here; invalid_bitmap stays attached to
+        * the closed line. The smeta/emeta pointers are merely detached.
+        */
+       mempool_free(line->map_bitmap, pblk->line_meta_pool);
+       line->map_bitmap = NULL;
+       line->smeta = NULL;
+       line->emeta = NULL;
+
+       spin_unlock(&line->lock);
+       spin_unlock(&l_mg->gc_lock);
+}
+
+/*
+ * Work handler that closes the line carried by the work item and then
+ * returns the work item to its pool.
+ */
+void pblk_line_close_ws(struct work_struct *work)
+{
+       struct pblk_line_ws *line_ws;
+       struct pblk *pblk;
+
+       line_ws = container_of(work, struct pblk_line_ws, ws);
+       pblk = line_ws->pblk;
+
+       pblk_line_close(pblk, line_ws->line);
+       mempool_free(line_ws, pblk->line_ws_pool);
+}
+
+/*
+ * Work handler that marks the block addressed by the work item's private
+ * ppa as grown bad in the device bad block table. A failure is logged. The
+ * ppa is freed and the work item is returned to its pool in all cases.
+ */
+void pblk_line_mark_bb(struct work_struct *work)
+{
+       struct pblk_line_ws *line_ws;
+       struct nvm_tgt_dev *dev;
+       struct ppa_addr *ppa;
+       struct pblk *pblk;
+
+       line_ws = container_of(work, struct pblk_line_ws, ws);
+       pblk = line_ws->pblk;
+       dev = pblk->dev;
+       ppa = line_ws->priv;
+
+       if (nvm_set_tgt_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD)) {
+               struct pblk_line *line;
+               int pos;
+
+               line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)];
+               pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa);
+
+               pr_err("pblk: failed to mark bb, line:%d, pos:%d\n",
+                               line->id, pos);
+       }
+
+       kfree(ppa);
+       mempool_free(line_ws, pblk->line_ws_pool);
+}
+
+/*
+ * Queue @work on pblk->kw_wq with a work item carrying @pblk, @line and
+ * @priv. If no work item can be allocated the function returns without
+ * queueing anything.
+ */
+void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
+                     void (*work)(struct work_struct *))
+{
+       struct pblk_line_ws *item;
+
+       item = mempool_alloc(pblk->line_ws_pool, GFP_ATOMIC);
+       if (!item)
+               return;
+
+       item->priv = priv;
+       item->line = line;
+       item->pblk = pblk;
+
+       INIT_WORK(&item->ws, work);
+       queue_work(pblk->kw_wq, &item->ws);
+}
+
+void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+                 unsigned long *lun_bitmap)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_lun *rlun;
+       int lun_id = ppa_list[0].g.ch * geo->luns_per_chnl + ppa_list[0].g.lun;
+       int ret;
+
+       /*
+        * Only send one inflight I/O per LUN. Since we map at a page
+        * granurality, all ppas in the I/O will map to the same LUN
+        */
+#ifdef CONFIG_NVM_DEBUG
+       int i;
+
+       for (i = 1; i < nr_ppas; i++)
+               WARN_ON(ppa_list[0].g.lun != ppa_list[i].g.lun ||
+                               ppa_list[0].g.ch != ppa_list[i].g.ch);
+#endif
+       /* If the LUN has been locked for this same request, do no attempt to
+        * lock it again
+        */
+       if (test_and_set_bit(lun_id, lun_bitmap))
+               return;
+
+       rlun = &pblk->luns[lun_id];
+       ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(5000));
+       if (ret) {
+               switch (ret) {
+               case -ETIME:
+                       pr_err("pblk: lun semaphore timed out\n");
+                       break;
+               case -EINTR:
+                       pr_err("pblk: lun semaphore timed out\n");
+                       break;
+               }
+       }
+}
+
+/*
+ * Release the write semaphore of every LUN whose bit is set in @lun_bitmap
+ * and free the bitmap.
+ */
+void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+               unsigned long *lun_bitmap)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       int nr_luns = geo->nr_luns;
+       int bit;
+
+       for (bit = find_next_bit(lun_bitmap, nr_luns, 0); bit < nr_luns;
+            bit = find_next_bit(lun_bitmap, nr_luns, bit + 1))
+               up(&pblk->luns[bit].wr_sem);
+
+       kfree(lun_bitmap);
+}
+
+/*
+ * Point @lba at @ppa in the L2P table. If the previous mapping is neither a
+ * cache address nor empty, it is invalidated first. Out-of-range lbas are
+ * rejected with a warning.
+ */
+void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
+{
+       struct ppa_addr old_ppa;
+
+       /* logic error: lba out-of-bounds. Ignore update */
+       if (lba >= pblk->rl.nr_secs) {
+               WARN(1, "pblk: corrupted L2P map request\n");
+               return;
+       }
+
+       spin_lock(&pblk->trans_lock);
+
+       old_ppa = pblk_trans_map_get(pblk, lba);
+       if (!(pblk_addr_in_cache(old_ppa) || pblk_ppa_empty(old_ppa)))
+               pblk_map_invalidate(pblk, old_ppa);
+
+       pblk_trans_map_set(pblk, lba, ppa);
+
+       spin_unlock(&pblk->trans_lock);
+}
+
+/*
+ * Point @lba at the cache address @ppa in the L2P table. This is
+ * pblk_update_map() with additional debug-only checks on @ppa.
+ */
+void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
+{
+#ifdef CONFIG_NVM_DEBUG
+       /* Callers must ensure that the ppa points to a cache address that is
+        * within the bounds of the write buffer
+        */
+       BUG_ON(!pblk_addr_in_cache(ppa));
+       BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa)));
+#endif
+
+       pblk_update_map(pblk, lba, ppa);
+}
+
+/*
+ * Point @lba at the cache address @ppa on behalf of garbage collection.
+ *
+ * The mapping is only changed if @lba still maps to a device address on
+ * @gc_line. Returns 1 if the L2P table was updated, 0 otherwise (including
+ * for out-of-range lbas).
+ */
+int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
+                      struct pblk_line *gc_line)
+{
+       struct ppa_addr cur_ppa;
+       int updated = 0;
+
+#ifdef CONFIG_NVM_DEBUG
+       /* Callers must ensure that the ppa points to a cache address */
+       BUG_ON(!pblk_addr_in_cache(ppa));
+       BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa)));
+#endif
+
+       /* logic error: lba out-of-bounds. Ignore update */
+       if (lba >= pblk->rl.nr_secs) {
+               WARN(1, "pblk: corrupted L2P map request\n");
+               return 0;
+       }
+
+       spin_lock(&pblk->trans_lock);
+       cur_ppa = pblk_trans_map_get(pblk, lba);
+
+       /* Prevent updated entries to be overwritten by GC */
+       if (!pblk_addr_in_cache(cur_ppa) && !pblk_ppa_empty(cur_ppa) &&
+                       pblk_tgt_ppa_to_line(cur_ppa) == gc_line->id) {
+               pblk_trans_map_set(pblk, lba, ppa);
+               updated = 1;
+       }
+       spin_unlock(&pblk->trans_lock);
+
+       return updated;
+}
+
+/*
+ * Point @lba at the device address @ppa, provided the L2P table still holds
+ * @entry_line for it. Otherwise the mapping is left alone and @ppa is
+ * invalidated (unless it is empty). Padded entries (lba == ADDR_EMPTY) are
+ * invalidated and discarded.
+ */
+void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
+                        struct ppa_addr entry_line)
+{
+       struct ppa_addr cur;
+
+#ifdef CONFIG_NVM_DEBUG
+       /* Callers must ensure that the ppa points to a device address */
+       BUG_ON(pblk_addr_in_cache(ppa));
+#endif
+       /* Invalidate and discard padded entries */
+       if (lba == ADDR_EMPTY) {
+#ifdef CONFIG_NVM_DEBUG
+               atomic_long_inc(&pblk->padded_wb);
+#endif
+               pblk_map_invalidate(pblk, ppa);
+               return;
+       }
+
+       /* logic error: lba out-of-bounds. Ignore update */
+       if (lba >= pblk->rl.nr_secs) {
+               WARN(1, "pblk: corrupted L2P map request\n");
+               return;
+       }
+
+       spin_lock(&pblk->trans_lock);
+       cur = pblk_trans_map_get(pblk, lba);
+
+       if (cur.ppa == entry_line.ppa) {
+#ifdef CONFIG_NVM_DEBUG
+               WARN_ON(!pblk_addr_in_cache(cur) && !pblk_ppa_empty(cur));
+#endif
+               pblk_trans_map_set(pblk, lba, ppa);
+       } else if (!pblk_ppa_empty(ppa)) {
+               /* Do not update L2P if the cacheline has been updated. In
+                * this case, the mapped ppa must be invalidated
+                */
+               pblk_map_invalidate(pblk, ppa);
+       }
+
+       spin_unlock(&pblk->trans_lock);
+}
+
+/*
+ * Look up the L2P entries of @nr_secs consecutive lbas starting at @blba and
+ * store them in @ppas. The whole lookup runs under trans_lock.
+ */
+void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
+                        sector_t blba, int nr_secs)
+{
+       int i = 0;
+
+       spin_lock(&pblk->trans_lock);
+       while (i < nr_secs) {
+               ppas[i] = pblk_trans_map_get(pblk, blba + i);
+               i++;
+       }
+       spin_unlock(&pblk->trans_lock);
+}
+
+/*
+ * Look up the L2P entries of the @nr_secs lbas in @lba_list and store them
+ * in @ppas. An ADDR_EMPTY lba yields an ADDR_EMPTY ppa. For an out-of-range
+ * lba a warning is raised and ppas[i] is left untouched.
+ */
+void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
+                         u64 *lba_list, int nr_secs)
+{
+       sector_t lba;
+       int i;
+
+       spin_lock(&pblk->trans_lock);
+       for (i = 0; i < nr_secs; i++) {
+               lba = lba_list[i];
+
+               if (lba == ADDR_EMPTY) {
+                       ppas[i].ppa = ADDR_EMPTY;
+                       continue;
+               }
+
+               /* logic error: lba out-of-bounds. Ignore update */
+               if (lba >= pblk->rl.nr_secs) {
+                       WARN(1, "pblk: corrupted L2P map request\n");
+                       continue;
+               }
+
+               ppas[i] = pblk_trans_map_get(pblk, lba);
+       }
+       spin_unlock(&pblk->trans_lock);
+}
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c

new file mode 100644 (file)

index 0000000..9b147cf
--- /dev/null
+++ b/drivers/lightnvm/pblk-gc.c
@@ -0,0 +1,555 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *                  Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * pblk-gc.c - pblk's garbage collector
+ */
+
+#include "pblk.h"
+#include <linux/delay.h>
+
+/* Release a GC request together with the lba_list and data buffers it holds */
+static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq)
+{
+       kfree(gc_rq->lba_list);
+       kfree(gc_rq->data);
+       kfree(gc_rq);
+}
+
+/*
+ * Drain the GC write list. All queued requests are detached under w_lock and
+ * then, with the lock released, written to the cache, their line reference is
+ * dropped and the request is freed.
+ *
+ * Returns 1 when nothing was queued, 0 after processing at least one request.
+ */
+static int pblk_gc_write(struct pblk *pblk)
+{
+       struct pblk_gc *gc = &pblk->gc;
+       struct pblk_gc_rq *rq, *next;
+       LIST_HEAD(pending);
+       int idle;
+
+       spin_lock(&gc->w_lock);
+       idle = list_empty(&gc->w_list);
+       if (!idle) {
+               list_for_each_entry_safe(rq, next, &gc->w_list, list) {
+                       list_move_tail(&rq->list, &pending);
+                       gc->w_entries--;
+               }
+       }
+       spin_unlock(&gc->w_lock);
+
+       if (idle)
+               return 1;
+
+       list_for_each_entry_safe(rq, next, &pending, list) {
+               pblk_write_gc_to_cache(pblk, rq->data, rq->lba_list,
+                               rq->nr_secs, rq->secs_to_gc,
+                               rq->line, PBLK_IOTYPE_GC);
+
+               kref_put(&rq->line->ref, pblk_line_put);
+
+               list_del(&rq->list);
+               pblk_gc_free_gc_rq(rq);
+       }
+
+       return 0;
+}
+
+/* Wake up the GC writer thread */
+static void pblk_gc_writer_kick(struct pblk_gc *gc)
+{
+       struct task_struct *writer = gc->gc_writer_ts;
+
+       wake_up_process(writer);
+}
+
+/*
+ * Read the sectors listed in lba_list from the GC victim line and queue them
+ * for rewriting by the GC writer thread.
+ *
+ * This function is responsible for all memory related to a GC request,
+ * including the failure case: lba_list is always consumed. On success with
+ * sectors to move, lba_list and the freshly allocated data buffer are attached
+ * to a gc_rq that is placed on gc->w_list; on every other path both buffers
+ * are freed here.
+ *
+ * Returns NVM_IO_OK on success (also when the read reports no sectors to
+ * move), NVM_IO_ERR on allocation or read failure.
+ */
+static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_line *line,
+                                  u64 *lba_list, unsigned int nr_secs)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_gc *gc = &pblk->gc;
+       struct pblk_gc_rq *gc_rq;
+       void *data;
+       unsigned int secs_to_gc;
+       int ret = NVM_IO_OK;
+
+       /* One sector-sized slot per requested lba */
+       data = kmalloc(nr_secs * geo->sec_size, GFP_KERNEL);
+       if (!data) {
+               ret = NVM_IO_ERR;
+               goto free_lba_list;
+       }
+
+       /* Read from GC victim block */
+       if (pblk_submit_read_gc(pblk, lba_list, data, nr_secs,
+                                                       &secs_to_gc, line)) {
+               ret = NVM_IO_ERR;
+               goto free_data;
+       }
+
+       /* Nothing to move: release the buffers and report success (ret is
+        * still NVM_IO_OK here)
+        */
+       if (!secs_to_gc)
+               goto free_data;
+
+       gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL);
+       if (!gc_rq) {
+               ret = NVM_IO_ERR;
+               goto free_data;
+       }
+
+       gc_rq->line = line;
+       gc_rq->data = data;
+       gc_rq->lba_list = lba_list;
+       gc_rq->nr_secs = nr_secs;
+       gc_rq->secs_to_gc = secs_to_gc;
+
+       /* Reference dropped by pblk_gc_write() once the request is written */
+       kref_get(&line->ref);
+
+retry:
+       /* Throttle: sleep and retry while more than 256 requests are queued */
+       spin_lock(&gc->w_lock);
+       if (gc->w_entries > 256) {
+               spin_unlock(&gc->w_lock);
+               usleep_range(256, 1024);
+               goto retry;
+       }
+       gc->w_entries++;
+       list_add_tail(&gc_rq->list, &gc->w_list);
+       spin_unlock(&gc->w_lock);
+
+       pblk_gc_writer_kick(&pblk->gc);
+
+       return NVM_IO_OK;
+
+free_data:
+       kfree(data);
+free_lba_list:
+       kfree(lba_list);
+
+       return ret;
+}
+
+/*
+ * Return a line that was selected for GC to the closed state and re-insert it
+ * into the GC list chosen by pblk_line_gc_list(), if any.
+ */
+static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct list_head *dst;
+
+       spin_lock(&line->lock);
+       WARN_ON(line->state != PBLK_LINESTATE_GC);
+       line->state = PBLK_LINESTATE_CLOSED;
+       dst = pblk_line_gc_list(pblk, line);
+       spin_unlock(&line->lock);
+
+       if (!dst)
+               return;
+
+       spin_lock(&l_mg->gc_lock);
+       list_add_tail(&line->list, dst);
+       spin_unlock(&l_mg->gc_lock);
+}
+
+/*
+ * Work handler that garbage-collects one line. It walks the line's invalid
+ * bitmap, gathers the lbas of still-valid sectors in batches of up to
+ * pblk->max_write_pgs and hands each batch to pblk_gc_move_valid_secs().
+ *
+ * On exit the line's emeta buffer is freed, the work item goes back to its
+ * mempool and the inflight GC counter is decremented. The line reference is
+ * dropped unless the line was handed back via pblk_put_line_back().
+ */
+static void pblk_gc_line_ws(struct work_struct *work)
+{
+       struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws,
+                                                                       ws);
+       struct pblk *pblk = line_ws->pblk;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_line *line = line_ws->line;
+       struct pblk_line_meta *lm = &pblk->lm;
+       __le64 *lba_list = line_ws->priv;
+       u64 *gc_list;
+       int sec_left;
+       int nr_ppas, bit;
+       int put_line = 1;
+
+       pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id);
+
+       /* Snapshot the valid sector count under the line lock */
+       spin_lock(&line->lock);
+       sec_left = line->vsc;
+       if (!sec_left) {
+               /* Lines are erased before being used (l_mg->data_/log_next) */
+               spin_unlock(&line->lock);
+               goto out;
+       }
+       spin_unlock(&line->lock);
+
+       if (sec_left < 0) {
+               pr_err("pblk: corrupted GC line (%d)\n", line->id);
+               put_line = 0;
+               pblk_put_line_back(pblk, line);
+               goto out;
+       }
+
+       bit = -1;
+next_rq:
+       /* Fresh list per batch: pblk_gc_move_valid_secs() consumes it */
+       gc_list = kmalloc_array(pblk->max_write_pgs, sizeof(u64), GFP_KERNEL);
+       if (!gc_list) {
+               put_line = 0;
+               pblk_put_line_back(pblk, line);
+               goto out;
+       }
+
+       nr_ppas = 0;
+       do {
+               /* A zero bit in invalid_bitmap marks a sector to be moved */
+               bit = find_next_zero_bit(line->invalid_bitmap, lm->sec_per_line,
+                                                               bit + 1);
+               /* Stop once the scan passes the line's emeta_ssec position */
+               if (bit > line->emeta_ssec)
+                       break;
+
+               gc_list[nr_ppas++] = le64_to_cpu(lba_list[bit]);
+       } while (nr_ppas < pblk->max_write_pgs);
+
+       if (unlikely(!nr_ppas)) {
+               kfree(gc_list);
+               goto out;
+       }
+
+       /* gc_list is freed by the callee on failure, so no kfree() here */
+       if (pblk_gc_move_valid_secs(pblk, line, gc_list, nr_ppas)) {
+               pr_err("pblk: could not GC all sectors: line:%d (%d/%d/%d)\n",
+                                               line->id, line->vsc,
+                                               nr_ppas, nr_ppas);
+               put_line = 0;
+               pblk_put_line_back(pblk, line);
+               goto out;
+       }
+
+       sec_left -= nr_ppas;
+       if (sec_left > 0)
+               goto next_rq;
+
+out:
+       pblk_mfree(line->emeta, l_mg->emeta_alloc_type);
+       mempool_free(line_ws, pblk->line_ws_pool);
+       atomic_dec(&pblk->gc.inflight_gc);
+       if (put_line)
+               kref_put(&line->ref, pblk_line_put);
+}
+
+/*
+ * Prepare a line for garbage collection: allocate and read its emeta, extract
+ * the lba list from it and queue a work item on the GC reader workqueue.
+ *
+ * Returns 0 when the work was queued. On any failure the allocated resources
+ * are released, the line is handed back through pblk_put_line_back() and 1 is
+ * returned.
+ */
+static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line)
+{
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_line_ws *ws;
+       __le64 *lbas;
+       int err;
+
+       ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL);
+
+       line->emeta = pblk_malloc(lm->emeta_len, l_mg->emeta_alloc_type,
+                                                               GFP_KERNEL);
+       if (!line->emeta) {
+               pr_err("pblk: cannot use GC emeta\n");
+               goto fail_free_ws;
+       }
+
+       err = pblk_line_read_emeta(pblk, line);
+       if (err) {
+               pr_err("pblk: line %d read emeta failed (%d)\n", line->id, err);
+               goto fail_free_emeta;
+       }
+
+       /* If this read fails, it means that emeta is corrupted. For now, leave
+        * the line untouched. TODO: Implement a recovery routine that scans and
+        * moves all sectors on the line.
+        */
+       lbas = pblk_recov_get_lba_list(pblk, line->emeta);
+       if (!lbas) {
+               pr_err("pblk: could not interpret emeta (line %d)\n", line->id);
+               goto fail_free_emeta;
+       }
+
+       ws->pblk = pblk;
+       ws->line = line;
+       ws->priv = lbas;
+
+       INIT_WORK(&ws->ws, pblk_gc_line_ws);
+       queue_work(pblk->gc.gc_reader_wq, &ws->ws);
+
+       return 0;
+
+fail_free_emeta:
+       pblk_mfree(line->emeta, l_mg->emeta_alloc_type);
+fail_free_ws:
+       mempool_free(ws, pblk->line_ws_pool);
+       pblk_put_line_back(pblk, line);
+
+       return 1;
+}
+
+/*
+ * Start garbage collection on every line of the (caller-private) gc_list.
+ *
+ * Each line is unlinked from gc_list *before* it is handed to pblk_gc_line().
+ * From that call on, line->list may be re-linked by somebody else: on failure
+ * pblk_gc_line() calls pblk_put_line_back(), which adds the line to one of the
+ * GC lists, and on success the queued worker may do the same at any time.
+ * Deleting the entry afterwards would remove the line from that new list.
+ */
+static void pblk_gc_lines(struct pblk *pblk, struct list_head *gc_list)
+{
+       struct pblk_line *line, *tline;
+
+       list_for_each_entry_safe(line, tline, gc_list, list) {
+               list_del(&line->list);
+               if (pblk_gc_line(pblk, line))
+                       pr_err("pblk: failed to GC line %d\n", line->id);
+       }
+}
+
+/*
+ * Lines with no valid sectors will be returned to the free list immediately. If
+ * GC is activated - either because the free block count is under the determined
+ * threshold, or because it is being forced from user space - only lines with a
+ * high count of invalid sectors will be recycled.
+ *
+ * The function first drops the reference of every line on gc_full_list, then
+ * walks the GC groups in l_mg->gc_lists[], moving lines to a private list and
+ * starting GC on them until enough blocks are expected to be freed or the
+ * number of inflight GC jobs exceeds gc->gc_jobs_active.
+ */
+static void pblk_gc_run(struct pblk *pblk)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_gc *gc = &pblk->gc;
+       struct pblk_line *line, *tline;
+       unsigned int nr_blocks_free, nr_blocks_need;
+       struct list_head *group_list;
+       int run_gc, gc_group = 0;
+       int prev_gc = 0;
+       int inflight_gc = atomic_read(&gc->inflight_gc);
+       LIST_HEAD(gc_list);
+
+       /* Fully invalid lines: mark as GC, unlink and drop their reference */
+       spin_lock(&l_mg->gc_lock);
+       list_for_each_entry_safe(line, tline, &l_mg->gc_full_list, list) {
+               spin_lock(&line->lock);
+               WARN_ON(line->state != PBLK_LINESTATE_CLOSED);
+               line->state = PBLK_LINESTATE_GC;
+               spin_unlock(&line->lock);
+
+               list_del(&line->list);
+               kref_put(&line->ref, pblk_line_put);
+       }
+       spin_unlock(&l_mg->gc_lock);
+
+       /* GC runs when free blocks are below the threshold or it is forced */
+       nr_blocks_need = pblk_rl_gc_thrs(&pblk->rl);
+       nr_blocks_free = pblk_rl_nr_free_blks(&pblk->rl);
+       run_gc = (nr_blocks_need > nr_blocks_free || gc->gc_forced);
+
+next_gc_group:
+       /* gc_group is post-incremented: after this line it indexes the next
+        * group to try
+        */
+       group_list = l_mg->gc_lists[gc_group++];
+       spin_lock(&l_mg->gc_lock);
+       while (run_gc && !list_empty(group_list)) {
+               /* No need to queue up more GC lines than we can handle */
+               if (!run_gc || inflight_gc > gc->gc_jobs_active) {
+                       spin_unlock(&l_mg->gc_lock);
+                       pblk_gc_lines(pblk, &gc_list);
+                       return;
+               }
+
+               line = list_first_entry(group_list, struct pblk_line, list);
+               /* Account the line's blocks as if they were already free */
+               nr_blocks_free += line->blk_in_line;
+
+               spin_lock(&line->lock);
+               WARN_ON(line->state != PBLK_LINESTATE_CLOSED);
+               line->state = PBLK_LINESTATE_GC;
+               list_move_tail(&line->list, &gc_list);
+               atomic_inc(&gc->inflight_gc);
+               inflight_gc++;
+               spin_unlock(&line->lock);
+
+               prev_gc = 1;
+               run_gc = (nr_blocks_need > nr_blocks_free || gc->gc_forced);
+       }
+       spin_unlock(&l_mg->gc_lock);
+
+       pblk_gc_lines(pblk, &gc_list);
+
+       /* Nothing selected so far: try the next group while the rate-limiter
+        * state (rl.rb_state) is above the group index and groups remain
+        */
+       if (!prev_gc && pblk->rl.rb_state > gc_group &&
+                                               gc_group < PBLK_NR_GC_LISTS)
+               goto next_gc_group;
+}
+
+
+/*
+ * Wake both GC threads (main and writer) and re-arm the GC timer so that the
+ * next kick happens GC_TIME_MSECS from now.
+ */
+static void pblk_gc_kick(struct pblk *pblk)
+{
+       struct pblk_gc *gc = &pblk->gc;
+       unsigned long expires;
+
+       wake_up_process(gc->gc_ts);
+       pblk_gc_writer_kick(gc);
+
+       expires = jiffies + msecs_to_jiffies(GC_TIME_MSECS);
+       mod_timer(&gc->gc_timer, expires);
+}
+
+/* Timer callback: data carries the pblk instance whose GC is to be kicked */
+static void pblk_gc_timer(unsigned long data)
+{
+       pblk_gc_kick((struct pblk *)data);
+}
+
+/*
+ * Main GC thread: run one GC pass, then sleep until woken (by pblk_gc_kick()
+ * or kthread_stop()). Always returns 0.
+ */
+static int pblk_gc_ts(void *data)
+{
+       struct pblk *pblk = data;
+
+       while (!kthread_should_stop()) {
+               pblk_gc_run(pblk);
+
+               /* Re-check the stop request after changing the task state. If
+                * kthread_stop() ran between the loop condition and
+                * set_current_state(), its wake-up has already been consumed
+                * and we would otherwise sleep with the stop request pending.
+                */
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (kthread_should_stop()) {
+                       __set_current_state(TASK_RUNNING);
+                       break;
+               }
+               io_schedule();
+       }
+
+       return 0;
+}
+
+/*
+ * GC writer thread: keep draining the GC write list while pblk_gc_write()
+ * reports work done (returns 0); sleep once it reports an empty list. Always
+ * returns 0.
+ */
+static int pblk_gc_writer_ts(void *data)
+{
+       struct pblk *pblk = data;
+
+       while (!kthread_should_stop()) {
+               if (!pblk_gc_write(pblk))
+                       continue;
+
+               /* Re-check the stop request after changing the task state so
+                * that a kthread_stop() issued since the loop condition was
+                * evaluated is not slept through.
+                */
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (kthread_should_stop()) {
+                       __set_current_state(TASK_RUNNING);
+                       break;
+               }
+               io_schedule();
+       }
+
+       return 0;
+}
+
+/* Flag GC as active. Takes no lock itself. */
+static void pblk_gc_start(struct pblk *pblk)
+{
+       struct pblk_gc *gc = &pblk->gc;
+
+       gc->gc_active = 1;
+
+       pr_debug("pblk: gc start\n");
+}
+
+/* Return the current value of gc_active, sampled under gc->lock */
+int pblk_gc_status(struct pblk *pblk)
+{
+       struct pblk_gc *gc = &pblk->gc;
+       int active;
+
+       spin_lock(&gc->lock);
+       active = gc->gc_active;
+       spin_unlock(&gc->lock);
+
+       return active;
+}
+
+/*
+ * Activate GC if it is enabled and not yet active. The caller must hold
+ * gc->lock.
+ */
+static void __pblk_gc_should_start(struct pblk *pblk)
+{
+       struct pblk_gc *gc = &pblk->gc;
+
+       lockdep_assert_held(&gc->lock);
+
+       if (!gc->gc_enabled || gc->gc_active)
+               return;
+
+       pblk_gc_start(pblk);
+}
+
+/* Locked wrapper around __pblk_gc_should_start() */
+void pblk_gc_should_start(struct pblk *pblk)
+{
+       spin_lock(&pblk->gc.lock);
+       __pblk_gc_should_start(pblk);
+       spin_unlock(&pblk->gc.lock);
+}
+
+/*
+ * Clear the gc_active flag under gc->lock.
+ *
+ * If flush_wq == 1 then no lock should be held by the caller since
+ * flush_workqueue can sleep. The flush_wq argument is not used by the
+ * function body itself.
+ */
+static void pblk_gc_stop(struct pblk *pblk, int flush_wq)
+{
+       struct pblk_gc *gc = &pblk->gc;
+
+       spin_lock(&gc->lock);
+       gc->gc_active = 0;
+       spin_unlock(&gc->lock);
+
+       pr_debug("pblk: gc stop\n");
+}
+
+/*
+ * Stop GC if it is currently active and not being forced. The flags are read
+ * without taking gc->lock.
+ */
+void pblk_gc_should_stop(struct pblk *pblk)
+{
+       struct pblk_gc *gc = &pblk->gc;
+
+       if (!gc->gc_active || gc->gc_forced)
+               return;
+
+       pblk_gc_stop(pblk, 0);
+}
+
+/*
+ * Report the GC enabled/active flags through the two output pointers. Both
+ * values are sampled together under gc->lock.
+ */
+void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
+                             int *gc_active)
+{
+       struct pblk_gc *gc = &pblk->gc;
+       int enabled, active;
+
+       spin_lock(&gc->lock);
+       enabled = gc->gc_enabled;
+       active = gc->gc_active;
+       spin_unlock(&gc->lock);
+
+       *gc_enabled = enabled;
+       *gc_active = active;
+}
+
+/*
+ * Force (force != 0) or un-force (force == 0) garbage collection.
+ *
+ * Forcing also enables GC and passes 64 to pblk_rl_set_gc_rsc(); un-forcing
+ * passes 0. In both cases GC is started if it is enabled and not yet active.
+ */
+void pblk_gc_sysfs_force(struct pblk *pblk, int force)
+{
+       struct pblk_gc *gc = &pblk->gc;
+       int rsv = force ? 64 : 0;
+
+       spin_lock(&gc->lock);
+       if (force)
+               gc->gc_enabled = 1;
+       pblk_rl_set_gc_rsc(&pblk->rl, rsv);
+       gc->gc_forced = force;
+       __pblk_gc_should_start(pblk);
+       spin_unlock(&gc->lock);
+}
+
+/*
+ * Set up the garbage collector: create the main and writer kthreads (they are
+ * created but not woken here), initialise the GC state, locks and write list,
+ * allocate the reader workqueue and finally arm the periodic GC timer.
+ *
+ * The timer is armed last, once everything its callback touches is
+ * initialised, so no failure path has to cancel it.
+ *
+ * Returns 0 on success or a negative errno; on failure every kthread created
+ * so far is stopped.
+ */
+int pblk_gc_init(struct pblk *pblk)
+{
+       struct pblk_gc *gc = &pblk->gc;
+       int ret;
+
+       gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts");
+       if (IS_ERR(gc->gc_ts)) {
+               pr_err("pblk: could not allocate GC main kthread\n");
+               return PTR_ERR(gc->gc_ts);
+       }
+
+       gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk,
+                                                       "pblk-gc-writer-ts");
+       if (IS_ERR(gc->gc_writer_ts)) {
+               pr_err("pblk: could not allocate GC writer kthread\n");
+               ret = PTR_ERR(gc->gc_writer_ts);
+               /* gc_writer_ts is an ERR_PTR here: only stop the main thread */
+               goto fail_free_main_kthread;
+       }
+
+       gc->gc_active = 0;
+       gc->gc_forced = 0;
+       gc->gc_enabled = 1;
+       gc->gc_jobs_active = 8;
+       gc->w_entries = 0;
+       atomic_set(&gc->inflight_gc, 0);
+
+       spin_lock_init(&gc->lock);
+       spin_lock_init(&gc->w_lock);
+       INIT_LIST_HEAD(&gc->w_list);
+
+       gc->gc_reader_wq = alloc_workqueue("pblk-gc-reader-wq",
+                       WQ_MEM_RECLAIM | WQ_UNBOUND, gc->gc_jobs_active);
+       if (!gc->gc_reader_wq) {
+               pr_err("pblk: could not allocate GC reader workqueue\n");
+               ret = -ENOMEM;
+               goto fail_free_writer_kthread;
+       }
+
+       setup_timer(&gc->gc_timer, pblk_gc_timer, (unsigned long)pblk);
+       mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS));
+
+       return 0;
+
+fail_free_writer_kthread:
+       kthread_stop(gc->gc_writer_ts);
+fail_free_main_kthread:
+       kthread_stop(gc->gc_ts);
+
+       return ret;
+}
+
+/*
+ * Tear down the garbage collector: flush pending reader work, delete the GC
+ * timer, clear the active flag, then stop the main thread, destroy the reader
+ * workqueue and stop the writer thread.
+ *
+ * NOTE(review): gc_reader_wq is flushed unconditionally below but NULL-checked
+ * before destroy_workqueue() — confirm whether it can be NULL here.
+ * NOTE(review): del_timer() does not wait for a timer callback that is already
+ * running — confirm whether del_timer_sync() is needed.
+ */
+void pblk_gc_exit(struct pblk *pblk)
+{
+       struct pblk_gc *gc = &pblk->gc;
+
+       flush_workqueue(gc->gc_reader_wq);
+
+       del_timer(&gc->gc_timer);
+       pblk_gc_stop(pblk, 1);
+
+       if (gc->gc_ts)
+               kthread_stop(gc->gc_ts);
+
+       if (pblk->gc.gc_reader_wq)
+               destroy_workqueue(pblk->gc.gc_reader_wq);
+
+       if (gc->gc_writer_ts)
+               kthread_stop(gc->gc_writer_ts);
+}
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c

new file mode 100644 (file)

index 0000000..94653b1
--- /dev/null
+++ b/drivers/lightnvm/pblk-init.c
@@ -0,0 +1,949 @@
+/*
+ * Copyright (C) 2015 IT University of Copenhagen (rrpc.c)
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *                  Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Implementation of a physical block-device target for Open-channel SSDs.
+ *
+ * pblk-init.c - pblk's initialization.
+ */
+
+#include "pblk.h"
+
+/*
+ * File-scope slab caches. They are created in pblk_init_global_caches(), back
+ * the per-instance mempools created in pblk_core_init() and are destroyed in
+ * pblk_core_free().
+ */
+static struct kmem_cache *pblk_blk_ws_cache, *pblk_rec_cache, *pblk_r_rq_cache,
+                                       *pblk_w_rq_cache, *pblk_line_meta_cache;
+/* Held for writing while pblk_init_global_caches() creates the caches */
+static DECLARE_RWSEM(pblk_lock);
+
+/*
+ * Dispatch a read or write bio.
+ *
+ * Reads are always split to the queue limits and submitted to the read path;
+ * a cloned bio that completed synchronously (NVM_IO_DONE) is put here. Writes
+ * are only split when they are at least as large as the current rate-limiter
+ * budget, and are then written to the cache as user I/O.
+ *
+ * Returns the NVM_IO_* code of the underlying read/write path.
+ */
+static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
+                         struct bio *bio)
+{
+       int ret;
+
+       if (bio_data_dir(bio) != READ) {
+               /* Prevent deadlock in the case of a modest LUN configuration
+                * and large user I/Os. Unless stalled, the rate limiter leaves
+                * at least 256KB available for user I/O.
+                */
+               if (unlikely(pblk_get_secs(bio) >=
+                                       pblk_rl_sysfs_rate_show(&pblk->rl)))
+                       blk_queue_split(q, &bio, q->bio_split);
+
+               return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER);
+       }
+
+       /* Read requests must be <= 256kb due to NVMe's 64 bit completion bitmap
+        * constraint. Writes can be of arbitrary size.
+        */
+       blk_queue_split(q, &bio, q->bio_split);
+       ret = pblk_submit_read(pblk, bio);
+       if (ret == NVM_IO_DONE && bio_flagged(bio, BIO_CLONED))
+               bio_put(bio);
+
+       return ret;
+}
+
+/*
+ * make_request entry point. Discards are handled first; a discard without
+ * REQ_PREFLUSH completes immediately. Everything else goes through
+ * pblk_rw_io(), and the bio is completed here when that path reports an error
+ * (NVM_IO_ERR) or synchronous completion (NVM_IO_DONE).
+ */
+static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio)
+{
+       struct pblk *pblk = q->queuedata;
+       int ret;
+
+       if (bio_op(bio) == REQ_OP_DISCARD) {
+               pblk_discard(pblk, bio);
+               if (!(bio->bi_opf & REQ_PREFLUSH)) {
+                       bio_endio(bio);
+                       return BLK_QC_T_NONE;
+               }
+       }
+
+       ret = pblk_rw_io(q, pblk, bio);
+       if (ret == NVM_IO_ERR)
+               bio_io_error(bio);
+       else if (ret == NVM_IO_DONE)
+               bio_endio(bio);
+
+       return BLK_QC_T_NONE;
+}
+
+/* Release the L2P translation map allocated by pblk_l2p_init() */
+static void pblk_l2p_free(struct pblk *pblk)
+{
+       void *map = pblk->trans_map;
+
+       vfree(map);
+}
+
+/*
+ * Allocate the L2P translation map and mark every entry as empty.
+ *
+ * Entries take 4 bytes when the ppa format fits in fewer than 32 bits, and 8
+ * bytes otherwise. Returns 0 on success, -ENOMEM if the map cannot be
+ * allocated.
+ */
+static int pblk_l2p_init(struct pblk *pblk)
+{
+       struct ppa_addr empty;
+       sector_t lba;
+       int entry_size;
+
+       entry_size = (pblk->ppaf_bitsize < 32) ? 4 : 8;
+
+       pblk->trans_map = vmalloc(entry_size * pblk->rl.nr_secs);
+       if (!pblk->trans_map)
+               return -ENOMEM;
+
+       pblk_ppa_set_empty(&empty);
+
+       for (lba = 0; lba < pblk->rl.nr_secs; lba++)
+               pblk_trans_map_set(pblk, lba, empty);
+
+       return 0;
+}
+
+/*
+ * Tear down the write buffer: report a failed tear-down check, free the
+ * buffer data and then the entries array.
+ */
+static void pblk_rwb_free(struct pblk *pblk)
+{
+       void *entries;
+
+       if (pblk_rb_tear_down_check(&pblk->rwb))
+               pr_err("pblk: write buffer error on tear down\n");
+
+       pblk_rb_data_free(&pblk->rwb);
+
+       entries = pblk_rb_entries_ref(&pblk->rwb);
+       vfree(entries);
+}
+
+/*
+ * Allocate the write buffer entries and initialise the ring buffer.
+ *
+ * The number of entries is derived from pblk->pgs_in_buffer; both the entry
+ * count and the sector size are passed to pblk_rb_init() as powers of two.
+ *
+ * Returns -ENOMEM if the entries cannot be allocated, otherwise the result of
+ * pblk_rb_init().
+ *
+ * NOTE(review): entries is not freed here when pblk_rb_init() fails — confirm
+ * who owns it on that path.
+ */
+static int pblk_rwb_init(struct pblk *pblk)
+{
+       struct nvm_geo *geo = &pblk->dev->geo;
+       struct pblk_rb_entry *entries;
+       unsigned long nr_entries;
+       unsigned int entries_order, seg_order;
+
+       nr_entries = pblk_rb_calculate_size(pblk->pgs_in_buffer);
+
+       entries = vzalloc(nr_entries * sizeof(struct pblk_rb_entry));
+       if (!entries)
+               return -ENOMEM;
+
+       entries_order = get_count_order(nr_entries);
+       seg_order = get_count_order(geo->sec_size);
+
+       return pblk_rb_init(&pblk->rwb, entries, entries_order, seg_order);
+}
+
+/* Minimum pages needed within a lun; minimum size of pblk->page_pool */
+#define PAGE_POOL_SIZE 16
+/* NOTE(review): not referenced in the code visible here — confirm its user */
+#define ADDR_POOL_SIZE 64
+
+/*
+ * Build pblk's physical address format from the device geometry.
+ *
+ * The channel and LUN field widths are recomputed from the configured number
+ * of channels and LUNs per channel, both of which must be powers of two. The
+ * fields are then packed, from bit 0 upwards, as: sector, plane, channel, LUN,
+ * page, block. pblk->ppaf_bitsize is set to the total number of bits used.
+ *
+ * Returns 0 on success, -EINVAL for a non power-of-two configuration.
+ */
+static int pblk_set_ppaf(struct pblk *pblk)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct nvm_addr_format ppaf = geo->ppaf;
+       int order;
+
+       /* Re-calculate channel and lun format to adapt to configuration */
+       order = get_count_order(geo->nr_chnls);
+       if ((1 << order) != geo->nr_chnls) {
+               pr_err("pblk: supports only power-of-two channel config.\n");
+               return -EINVAL;
+       }
+       ppaf.ch_len = order;
+
+       order = get_count_order(geo->luns_per_chnl);
+       if ((1 << order) != geo->luns_per_chnl) {
+               pr_err("pblk: supports only power-of-two LUN config.\n");
+               return -EINVAL;
+       }
+       ppaf.lun_len = order;
+
+       /* Each field starts where the previous one ends */
+       pblk->ppaf.sec_offset = 0;
+       pblk->ppaf.sec_mask = (1ULL << ppaf.sect_len) - 1;
+
+       pblk->ppaf.pln_offset = ppaf.sect_len;
+       pblk->ppaf.pln_mask = ((1ULL << ppaf.pln_len) - 1) <<
+                                                       pblk->ppaf.pln_offset;
+
+       pblk->ppaf.ch_offset = pblk->ppaf.pln_offset + ppaf.pln_len;
+       pblk->ppaf.ch_mask = ((1ULL << ppaf.ch_len) - 1) <<
+                                                       pblk->ppaf.ch_offset;
+
+       pblk->ppaf.lun_offset = pblk->ppaf.ch_offset + ppaf.ch_len;
+       pblk->ppaf.lun_mask = ((1ULL << ppaf.lun_len) - 1) <<
+                                                       pblk->ppaf.lun_offset;
+
+       pblk->ppaf.pg_offset = pblk->ppaf.lun_offset + ppaf.lun_len;
+       pblk->ppaf.pg_mask = ((1ULL << ppaf.pg_len) - 1) <<
+                                                       pblk->ppaf.pg_offset;
+
+       pblk->ppaf.blk_offset = pblk->ppaf.pg_offset + ppaf.pg_len;
+       pblk->ppaf.blk_mask = ((1ULL << ppaf.blk_len) - 1) <<
+                                                       pblk->ppaf.blk_offset;
+
+       pblk->ppaf_bitsize = pblk->ppaf.blk_offset + ppaf.blk_len;
+
+       return 0;
+}
+
+/*
+ * Create the file-scope slab caches while holding pblk_lock for writing.
+ *
+ * The line metadata cache is named after the disk and sized by
+ * lm.sec_bitmap_len. If any cache cannot be created, the caches created so
+ * far are destroyed and -ENOMEM is returned; 0 is returned on success.
+ */
+static int pblk_init_global_caches(struct pblk *pblk)
+{
+       char cache_name[PBLK_CACHE_NAME_LEN];
+
+       down_write(&pblk_lock);
+
+       pblk_blk_ws_cache = kmem_cache_create("pblk_blk_ws",
+                               sizeof(struct pblk_line_ws), 0, 0, NULL);
+       if (!pblk_blk_ws_cache)
+               goto fail_unlock;
+
+       pblk_rec_cache = kmem_cache_create("pblk_rec",
+                               sizeof(struct pblk_rec_ctx), 0, 0, NULL);
+       if (!pblk_rec_cache)
+               goto fail_destroy_blk_ws;
+
+       pblk_r_rq_cache = kmem_cache_create("pblk_r_rq", pblk_r_rq_size,
+                               0, 0, NULL);
+       if (!pblk_r_rq_cache)
+               goto fail_destroy_rec;
+
+       pblk_w_rq_cache = kmem_cache_create("pblk_w_rq", pblk_w_rq_size,
+                               0, 0, NULL);
+       if (!pblk_w_rq_cache)
+               goto fail_destroy_r_rq;
+
+       snprintf(cache_name, sizeof(cache_name), "pblk_line_m_%s",
+                                                       pblk->disk->disk_name);
+       pblk_line_meta_cache = kmem_cache_create(cache_name,
+                               pblk->lm.sec_bitmap_len, 0, 0, NULL);
+       if (!pblk_line_meta_cache)
+               goto fail_destroy_w_rq;
+
+       up_write(&pblk_lock);
+
+       return 0;
+
+fail_destroy_w_rq:
+       kmem_cache_destroy(pblk_w_rq_cache);
+fail_destroy_r_rq:
+       kmem_cache_destroy(pblk_r_rq_cache);
+fail_destroy_rec:
+       kmem_cache_destroy(pblk_rec_cache);
+fail_destroy_blk_ws:
+       kmem_cache_destroy(pblk_blk_ws_cache);
+fail_unlock:
+       up_write(&pblk_lock);
+
+       return -ENOMEM;
+}
+
+/*
+ * Initialise pblk's core resources: derive the write sizes from the device
+ * geometry, create the global slab caches and the per-instance mempools,
+ * allocate the auxiliary workqueue, compute the address format and set up the
+ * write buffer.
+ *
+ * Returns 0 on success, -EINVAL if the geometry is not supported and -ENOMEM
+ * on any later failure. On failure everything allocated here, including the
+ * global slab caches, is released again (mirroring pblk_core_free()).
+ */
+static int pblk_core_init(struct pblk *pblk)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       int max_write_ppas;
+       int mod;
+
+       pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE);
+       max_write_ppas = pblk->min_write_pgs * geo->nr_luns;
+       pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ?
+                               max_write_ppas : nvm_max_phys_sects(dev);
+       pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg *
+                                               geo->nr_planes * geo->nr_luns;
+
+       if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
+               pr_err("pblk: cannot support device max_phys_sect\n");
+               return -EINVAL;
+       }
+
+       /* A block must hold a whole number of minimum write units */
+       div_u64_rem(geo->sec_per_blk, pblk->min_write_pgs, &mod);
+       if (mod) {
+               pr_err("pblk: bad configuration of sectors/pages\n");
+               return -EINVAL;
+       }
+
+       if (pblk_init_global_caches(pblk))
+               return -ENOMEM;
+
+       pblk->page_pool = mempool_create_page_pool(PAGE_POOL_SIZE, 0);
+       if (!pblk->page_pool)
+               goto free_global_caches;
+
+       pblk->line_ws_pool = mempool_create_slab_pool(geo->nr_luns,
+                                                       pblk_blk_ws_cache);
+       if (!pblk->line_ws_pool)
+               goto free_page_pool;
+
+       pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache);
+       if (!pblk->rec_pool)
+               goto free_blk_ws_pool;
+
+       pblk->r_rq_pool = mempool_create_slab_pool(64, pblk_r_rq_cache);
+       if (!pblk->r_rq_pool)
+               goto free_rec_pool;
+
+       pblk->w_rq_pool = mempool_create_slab_pool(64, pblk_w_rq_cache);
+       if (!pblk->w_rq_pool)
+               goto free_r_rq_pool;
+
+       pblk->line_meta_pool =
+                       mempool_create_slab_pool(16, pblk_line_meta_cache);
+       if (!pblk->line_meta_pool)
+               goto free_w_rq_pool;
+
+       pblk->kw_wq = alloc_workqueue("pblk-aux-wq",
+                                       WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+       if (!pblk->kw_wq)
+               goto free_line_meta_pool;
+
+       if (pblk_set_ppaf(pblk))
+               goto free_kw_wq;
+
+       if (pblk_rwb_init(pblk))
+               goto free_kw_wq;
+
+       INIT_LIST_HEAD(&pblk->compl_list);
+       return 0;
+
+free_kw_wq:
+       destroy_workqueue(pblk->kw_wq);
+free_line_meta_pool:
+       mempool_destroy(pblk->line_meta_pool);
+free_w_rq_pool:
+       mempool_destroy(pblk->w_rq_pool);
+free_r_rq_pool:
+       mempool_destroy(pblk->r_rq_pool);
+free_rec_pool:
+       mempool_destroy(pblk->rec_pool);
+free_blk_ws_pool:
+       mempool_destroy(pblk->line_ws_pool);
+free_page_pool:
+       mempool_destroy(pblk->page_pool);
+free_global_caches:
+       /* The mempools are gone by now, so the caches backing them can go */
+       kmem_cache_destroy(pblk_blk_ws_cache);
+       kmem_cache_destroy(pblk_rec_cache);
+       kmem_cache_destroy(pblk_r_rq_cache);
+       kmem_cache_destroy(pblk_w_rq_cache);
+       kmem_cache_destroy(pblk_line_meta_cache);
+       return -ENOMEM;
+}
+
+/*
+ * Release the core resources set up by pblk_core_init(): the auxiliary
+ * workqueue (if allocated), the mempools and then the slab caches.
+ */
+static void pblk_core_free(struct pblk *pblk)
+{
+       if (pblk->kw_wq)
+               destroy_workqueue(pblk->kw_wq);
+
+       /* Mempools first: the slab-backed ones sit on top of the caches */
+       mempool_destroy(pblk->line_meta_pool);
+       mempool_destroy(pblk->w_rq_pool);
+       mempool_destroy(pblk->r_rq_pool);
+       mempool_destroy(pblk->rec_pool);
+       mempool_destroy(pblk->line_ws_pool);
+       mempool_destroy(pblk->page_pool);
+
+       kmem_cache_destroy(pblk_line_meta_cache);
+       kmem_cache_destroy(pblk_w_rq_cache);
+       kmem_cache_destroy(pblk_r_rq_cache);
+       kmem_cache_destroy(pblk_rec_cache);
+       kmem_cache_destroy(pblk_blk_ws_cache);
+}
+
+/*
+ * Free the LUN array allocated by pblk_luns_init().
+ * NOTE(review): the per-LUN bb_list buffers are not freed here — confirm they
+ * are released elsewhere.
+ */
+static void pblk_luns_free(struct pblk *pblk)
+{
+       void *luns = pblk->luns;
+
+       kfree(luns);
+}
+
+/*
+ * Free every line's resources: call pblk_line_free() on it and release its
+ * block and erase bitmaps. The whole walk runs under l_mg->free_lock.
+ */
+static void pblk_lines_free(struct pblk *pblk)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       int idx;
+
+       spin_lock(&l_mg->free_lock);
+       for (idx = 0; idx < l_mg->nr_lines; idx++) {
+               struct pblk_line *cur = &pblk->lines[idx];
+
+               pblk_line_free(pblk, cur);
+               kfree(cur->blk_bitmap);
+               kfree(cur->erase_bitmap);
+       }
+       spin_unlock(&l_mg->free_lock);
+}
+
+/*
+ * Free the line management metadata: the bad-block template and auxiliary
+ * bitmaps, the smeta/emeta buffers of each of the PBLK_DATA_LINES slots and
+ * finally the line array itself.
+ */
+static void pblk_line_meta_free(struct pblk *pblk)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       int slot = 0;
+
+       kfree(l_mg->bb_template);
+       kfree(l_mg->bb_aux);
+
+       while (slot < PBLK_DATA_LINES) {
+               pblk_mfree(l_mg->sline_meta[slot].meta, l_mg->smeta_alloc_type);
+               pblk_mfree(l_mg->eline_meta[slot].meta, l_mg->emeta_alloc_type);
+               slot++;
+       }
+
+       kfree(pblk->lines);
+}
+
+/*
+ * Read the bad block table of the LUN described by rlun->bppa and fold it with
+ * nvm_bb_tbl_fold().
+ *
+ * On success the table is stored in rlun->bb_list (the LUN then owns the
+ * buffer) and 0 is returned. On failure the temporary buffer is freed,
+ * rlun->bb_list is left untouched and a negative error code is returned.
+ */
+static int pblk_bb_discovery(struct nvm_tgt_dev *dev, struct pblk_lun *rlun)
+{
+       struct nvm_geo *geo = &dev->geo;
+       struct ppa_addr ppa;
+       u8 *blks;
+       int nr_blks, ret;
+
+       nr_blks = geo->blks_per_lun * geo->plane_mode;
+       blks = kmalloc(nr_blks, GFP_KERNEL);
+       if (!blks)
+               return -ENOMEM;
+
+       /* Only channel and LUN are set in the lookup address */
+       ppa.ppa = 0;
+       ppa.g.ch = rlun->bppa.g.ch;
+       ppa.g.lun = rlun->bppa.g.lun;
+
+       ret = nvm_get_tgt_bb_tbl(dev, ppa, blks);
+       if (ret)
+               goto out_free;
+
+       nr_blks = nvm_bb_tbl_fold(dev->parent, blks, nr_blks);
+       if (nr_blks < 0) {
+               ret = nr_blks;
+               goto out_free;
+       }
+
+       rlun->bb_list = blks;
+
+       return 0;
+
+out_free:
+       /* Never publish a freed or half-initialised table to the LUN */
+       kfree(blks);
+       return ret;
+}
+
+/*
+ * Allocate the block and erase bitmaps of a line and mark, in blk_bitmap,
+ * every LUN whose bad block list entry for this line is not NVM_BLK_T_FREE.
+ *
+ * Returns the number of blocks marked, or -ENOMEM if a bitmap cannot be
+ * allocated.
+ */
+static int pblk_bb_line(struct pblk *pblk, struct pblk_line *line)
+{
+       struct pblk_line_meta *lm = &pblk->lm;
+       int nr_bad = 0;
+       int pos;
+
+       line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
+       if (!line->blk_bitmap)
+               return -ENOMEM;
+
+       line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
+       if (!line->erase_bitmap) {
+               kfree(line->blk_bitmap);
+               return -ENOMEM;
+       }
+
+       for (pos = 0; pos < lm->blk_per_line; pos++) {
+               if (pblk->luns[pos].bb_list[line->id] != NVM_BLK_T_FREE) {
+                       set_bit(pos, line->blk_bitmap);
+                       nr_bad++;
+               }
+       }
+
+       return nr_bad;
+}
+
+/*
+ * Allocate pblk's LUN array and initialise each entry: assign its base ppa
+ * from luns[] (striping consecutive entries across channels), initialise its
+ * write semaphore and discover its bad blocks.
+ *
+ * Returns 0 on success, -EINVAL for an invalid LUN configuration, -ENOMEM if
+ * the array cannot be allocated, or the bad block discovery error. In the
+ * last case the bad block lists of the LUNs already set up are freed.
+ */
+static int pblk_luns_init(struct pblk *pblk, struct ppa_addr *luns)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       int i, ret;
+
+       /* TODO: Implement unbalanced LUN support */
+       if (geo->luns_per_chnl < 0) {
+               pr_err("pblk: unbalanced LUN config.\n");
+               return -EINVAL;
+       }
+
+       pblk->luns = kcalloc(geo->nr_luns, sizeof(struct pblk_lun), GFP_KERNEL);
+       if (!pblk->luns)
+               return -ENOMEM;
+
+       for (i = 0; i < geo->nr_luns; i++) {
+               struct pblk_lun *rlun = &pblk->luns[i];
+               /* Stripe across channels */
+               int ch = i % geo->nr_chnls;
+               int lun_in_ch = i / geo->nr_chnls;
+               int lunid = lun_in_ch + ch * geo->luns_per_chnl;
+
+               rlun->bppa = luns[lunid];
+
+               sema_init(&rlun->wr_sem, 1);
+
+               ret = pblk_bb_discovery(dev, rlun);
+               if (!ret)
+                       continue;
+
+               /* Undo the LUNs that were already discovered */
+               for (i--; i >= 0; i--)
+                       kfree(pblk->luns[i].bb_list);
+               return ret;
+       }
+
+       return 0;
+}
+
+/*
+ * Pick the line that will receive user data first.
+ *
+ * Unless NVM_TARGET_FACTORY is set in @flags, the L2P table is recovered and
+ * the line returned by recovery is used. If no line came out of recovery (or
+ * recovery was skipped), the first data line is requested instead.
+ *
+ * Returns 0 on success, -EFAULT if recovery fails or no data line is available.
+ */
+static int pblk_lines_configure(struct pblk *pblk, int flags)
+{
+       struct pblk_line *line = NULL;
+
+       if (!(flags & NVM_TARGET_FACTORY)) {
+               line = pblk_recov_l2p(pblk);
+               if (IS_ERR(line)) {
+                       pr_err("pblk: could not recover l2p table\n");
+                       return -EFAULT;
+               }
+       }
+
+       if (line)
+               return 0;
+
+       /* Configure next line for user data */
+       line = pblk_line_get_first_data(pblk);
+       if (!line) {
+               pr_err("pblk: line list corrupted\n");
+               return -EFAULT;
+       }
+
+       return 0;
+}
+
+/*
+ * Number of bytes needed for a line's end metadata: the fixed header, one u64
+ * per sector not taken by emeta itself, one u32 per line known to the line
+ * manager and the block bitmap. See comment over struct line_emeta definition.
+ */
+static unsigned int calc_emeta_len(struct pblk *pblk, struct pblk_line_meta *lm)
+{
+       unsigned int len = sizeof(struct line_emeta);
+
+       len += (lm->sec_per_line - lm->emeta_sec) * sizeof(u64);
+       len += pblk->l_mg.nr_lines * sizeof(u32);
+       len += lm->blk_bitmap_len;
+
+       return len;
+}
+
+/*
+ * Derive capacity and rate-limiter totals from the number of usable blocks.
+ *
+ * Over-provisioning is fixed at 20%. The rate limiter is told about every free
+ * block, while pblk->capacity only accounts for the provisioned share.
+ */
+static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
+{
+       struct nvm_geo *geo = &pblk->dev->geo;
+       sector_t user_blks = nr_free_blks;
+
+       pblk->over_pct = 20;
+
+       /* Blocks left for user data once over-provisioning is set aside */
+       user_blks *= (100 - pblk->over_pct);
+       sector_div(user_blks, 100);
+
+       pblk->capacity = user_blks * geo->sec_per_blk;
+
+       /* Internally pblk manages all free blocks, but all calculations based
+        * on user capacity consider only provisioned blocks
+        */
+       pblk->rl.total_blocks = nr_free_blks;
+       pblk->rl.nr_secs = nr_free_blks * geo->sec_per_blk;
+       atomic_set(&pblk->rl.free_blocks, nr_free_blks);
+}
+
+/*
+ * Compute the line geometry (sectors per line, bitmap sizes, smeta/emeta
+ * sizes), allocate the line metadata buffers, the bad block templates and the
+ * line array, and place every line on the free or bad list depending on how
+ * many good blocks it has.
+ *
+ * The per-LUN bad block lists are released before returning, on success and
+ * on failure alike.
+ *
+ * Returns 0 on success or a negative errno. On failure everything allocated
+ * by this function has been freed again.
+ */
+static int pblk_lines_init(struct pblk *pblk)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line *line;
+       unsigned int smeta_len, emeta_len;
+       long nr_bad_blks, nr_meta_blks, nr_free_blks;
+       int bb_distance;
+       int i;
+       int ret = 0;
+
+       /* calc_emeta_len() sizes emeta from the number of lines, so this must
+        * be set before the emeta calculation below
+        */
+       l_mg->nr_lines = geo->blks_per_lun;
+       l_mg->log_line = l_mg->data_line = NULL;
+       l_mg->l_seq_nr = l_mg->d_seq_nr = 0;
+       l_mg->nr_free_lines = 0;
+       bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+
+       lm->sec_per_line = geo->sec_per_blk * geo->nr_luns;
+       lm->blk_per_line = geo->nr_luns;
+       lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long);
+       lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
+       lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long);
+       lm->high_thrs = lm->sec_per_line / 2;
+       lm->mid_thrs = lm->sec_per_line / 4;
+
+       /* Calculate necessary pages for smeta. See comment over struct
+        * line_smeta definition
+        */
+       smeta_len = sizeof(struct line_smeta) +
+                               PBLK_LINE_NR_LUN_BITMAP * lm->lun_bitmap_len;
+
+       i = 1;
+add_smeta_page:
+       lm->smeta_sec = i * geo->sec_per_pl;
+       lm->smeta_len = lm->smeta_sec * geo->sec_size;
+
+       if (smeta_len > lm->smeta_len) {
+               i++;
+               goto add_smeta_page;
+       }
+
+       /* Calculate necessary pages for emeta. See comment over struct
+        * line_emeta definition
+        */
+       i = 1;
+add_emeta_page:
+       lm->emeta_sec = i * geo->sec_per_pl;
+       lm->emeta_len = lm->emeta_sec * geo->sec_size;
+
+       emeta_len = calc_emeta_len(pblk, lm);
+       if (emeta_len > lm->emeta_len) {
+               i++;
+               goto add_emeta_page;
+       }
+       lm->emeta_bb = geo->nr_luns - i;
+
+       nr_meta_blks = (lm->smeta_sec + lm->emeta_sec +
+                               (geo->sec_per_blk / 2)) / geo->sec_per_blk;
+       lm->min_blk_line = nr_meta_blks + 1;
+
+       /* smeta is always small enough to fit on a kmalloc memory allocation,
+        * emeta depends on the number of LUNs allocated to the pblk instance
+        */
+       l_mg->smeta_alloc_type = PBLK_KMALLOC_META;
+       for (i = 0; i < PBLK_DATA_LINES; i++) {
+               l_mg->sline_meta[i].meta = kmalloc(lm->smeta_len, GFP_KERNEL);
+               if (!l_mg->sline_meta[i].meta) {
+                       ret = -ENOMEM;
+                       /* Undo the smeta buffers allocated so far */
+                       while (--i >= 0)
+                               kfree(l_mg->sline_meta[i].meta);
+                       goto fail;
+               }
+       }
+
+       if (lm->emeta_len > KMALLOC_MAX_CACHE_SIZE)
+               l_mg->emeta_alloc_type = PBLK_VMALLOC_META;
+       else
+               l_mg->emeta_alloc_type = PBLK_KMALLOC_META;
+
+       for (i = 0; i < PBLK_DATA_LINES; i++) {
+               if (l_mg->emeta_alloc_type == PBLK_VMALLOC_META)
+                       l_mg->eline_meta[i].meta = vmalloc(lm->emeta_len);
+               else
+                       l_mg->eline_meta[i].meta =
+                                       kmalloc(lm->emeta_len, GFP_KERNEL);
+
+               if (!l_mg->eline_meta[i].meta) {
+                       ret = -ENOMEM;
+                       /* Undo the emeta buffers allocated so far; the smeta
+                        * buffers are all allocated and freed at the label
+                        */
+                       while (--i >= 0)
+                               pblk_mfree(l_mg->eline_meta[i].meta,
+                                               l_mg->emeta_alloc_type);
+                       goto fail_free_smeta;
+               }
+       }
+
+       l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
+       if (!l_mg->bb_template) {
+               ret = -ENOMEM;
+               goto fail_free_meta;
+       }
+
+       l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
+       if (!l_mg->bb_aux) {
+               ret = -ENOMEM;
+               goto fail_free_bb_template;
+       }
+
+       bb_distance = (geo->nr_luns) * geo->sec_per_pl;
+       for (i = 0; i < lm->sec_per_line; i += bb_distance)
+               bitmap_set(l_mg->bb_template, i, geo->sec_per_pl);
+
+       INIT_LIST_HEAD(&l_mg->free_list);
+       INIT_LIST_HEAD(&l_mg->corrupt_list);
+       INIT_LIST_HEAD(&l_mg->bad_list);
+       INIT_LIST_HEAD(&l_mg->gc_full_list);
+       INIT_LIST_HEAD(&l_mg->gc_high_list);
+       INIT_LIST_HEAD(&l_mg->gc_mid_list);
+       INIT_LIST_HEAD(&l_mg->gc_low_list);
+       INIT_LIST_HEAD(&l_mg->gc_empty_list);
+
+       l_mg->gc_lists[0] = &l_mg->gc_high_list;
+       l_mg->gc_lists[1] = &l_mg->gc_mid_list;
+       l_mg->gc_lists[2] = &l_mg->gc_low_list;
+
+       spin_lock_init(&l_mg->free_lock);
+       spin_lock_init(&l_mg->gc_lock);
+
+       pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line),
+                                                               GFP_KERNEL);
+       if (!pblk->lines) {
+               ret = -ENOMEM;
+               goto fail_free_bb_aux;
+       }
+
+       nr_free_blks = 0;
+       for (i = 0; i < l_mg->nr_lines; i++) {
+               line = &pblk->lines[i];
+
+               line->pblk = pblk;
+               line->id = i;
+               line->type = PBLK_LINETYPE_FREE;
+               line->state = PBLK_LINESTATE_FREE;
+               line->gc_group = PBLK_LINEGC_NONE;
+               spin_lock_init(&line->lock);
+
+               nr_bad_blks = pblk_bb_line(pblk, line);
+               if (nr_bad_blks < 0) {
+                       /* pblk_bb_line() released its own allocations */
+                       ret = nr_bad_blks;
+                       goto fail_free_lines;
+               }
+               if (nr_bad_blks > lm->blk_per_line) {
+                       /* Bitmaps of this line exist; the label only frees
+                        * those of the lines before it
+                        */
+                       kfree(line->erase_bitmap);
+                       kfree(line->blk_bitmap);
+                       ret = -EINVAL;
+                       goto fail_free_lines;
+               }
+
+               line->blk_in_line = lm->blk_per_line - nr_bad_blks;
+               if (line->blk_in_line < lm->min_blk_line) {
+                       line->state = PBLK_LINESTATE_BAD;
+                       list_add_tail(&line->list, &l_mg->bad_list);
+                       continue;
+               }
+
+               nr_free_blks += line->blk_in_line;
+
+               l_mg->nr_free_lines++;
+               list_add_tail(&line->list, &l_mg->free_list);
+       }
+
+       pblk_set_provision(pblk, nr_free_blks);
+
+       sema_init(&pblk->erase_sem, 1);
+
+       /* Cleanup per-LUN bad block lists - managed within lines on run-time */
+       for (i = 0; i < geo->nr_luns; i++)
+               kfree(pblk->luns[i].bb_list);
+
+       return 0;
+fail_free_lines:
+       /* i is the line that failed; lines before it own both bitmaps */
+       while (--i >= 0) {
+               kfree(pblk->lines[i].erase_bitmap);
+               kfree(pblk->lines[i].blk_bitmap);
+       }
+       kfree(pblk->lines);
+fail_free_bb_aux:
+       kfree(l_mg->bb_aux);
+fail_free_bb_template:
+       kfree(l_mg->bb_template);
+fail_free_meta:
+       for (i = 0; i < PBLK_DATA_LINES; i++)
+               pblk_mfree(l_mg->eline_meta[i].meta, l_mg->emeta_alloc_type);
+fail_free_smeta:
+       for (i = 0; i < PBLK_DATA_LINES; i++)
+               pblk_mfree(l_mg->sline_meta[i].meta, l_mg->smeta_alloc_type);
+fail:
+       for (i = 0; i < geo->nr_luns; i++)
+               kfree(pblk->luns[i].bb_list);
+
+       return ret;
+}
+
+/*
+ * Create the writer kthread and arm the write timer (first expiry in 100ms).
+ *
+ * The thread is only created here, not started; pblk_init() wakes it once the
+ * rest of the instance is set up.
+ *
+ * Returns 0 on success or the negative errno from kthread_create(). On failure
+ * the timer is never armed and pblk->writer_ts is left NULL, so nothing can
+ * reference the instance after the caller frees it.
+ */
+static int pblk_writer_init(struct pblk *pblk)
+{
+       int ret;
+
+       /* Create the thread first: if this fails there is no timer to undo */
+       pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
+       if (IS_ERR(pblk->writer_ts)) {
+               /* A positive value would turn into a pointer that IS_ERR()
+                * does not recognize once the caller wraps it in ERR_PTR()
+                */
+               ret = PTR_ERR(pblk->writer_ts);
+               pblk->writer_ts = NULL;
+               pr_err("pblk: could not allocate writer kthread\n");
+               return ret;
+       }
+
+       setup_timer(&pblk->wtimer, pblk_write_timer_fn, (unsigned long)pblk);
+       mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
+
+       return 0;
+}
+
+/*
+ * Stop the writer kthread (if one exists) and the write timer.
+ *
+ * The timer is embedded in struct pblk, which callers free shortly after this
+ * returns, so the timer is removed with del_timer_sync(): it also waits for a
+ * handler that is currently running on another CPU. Must be called from
+ * process context.
+ */
+static void pblk_writer_stop(struct pblk *pblk)
+{
+       if (pblk->writer_ts)
+               kthread_stop(pblk->writer_ts);
+       del_timer_sync(&pblk->wtimer);
+}
+
+/*
+ * Release a pblk instance: call the free helper of each component (LUNs,
+ * lines, line metadata, core, L2P) and finally free the pblk structure itself.
+ * @pblk must not be used after this returns.
+ */
+static void pblk_free(struct pblk *pblk)
+{
+       pblk_luns_free(pblk);
+       pblk_lines_free(pblk);
+       pblk_line_meta_free(pblk);
+       pblk_core_free(pblk);
+       pblk_l2p_free(pblk);
+
+       kfree(pblk);
+}
+
+/*
+ * Shut down the write path of a pblk instance. In order: flush the writer,
+ * stop the writer thread and timer, point the L2P entries of everything in the
+ * write buffer at the device (pblk_rb_sync_l2p), run the recovery padding
+ * step, then free the write buffer and the rate limiter.
+ *
+ * Memory owned by the instance itself is released separately by pblk_free().
+ */
+static void pblk_tear_down(struct pblk *pblk)
+{
+       pblk_flush_writer(pblk);
+       pblk_writer_stop(pblk);
+       pblk_rb_sync_l2p(&pblk->rwb);
+       pblk_recov_pad(pblk);
+       pblk_rwb_free(pblk);
+       pblk_rl_free(&pblk->rl);
+
+       pr_debug("pblk: consistent tear down\n");
+}
+
+/*
+ * Target exit callback (tt_pblk.exit). @private is the struct pblk returned
+ * by pblk_init().
+ *
+ * With pblk_lock held for writing, garbage collection is shut down first,
+ * then the write path is torn down and the instance is freed.
+ */
+static void pblk_exit(void *private)
+{
+       struct pblk *pblk = private;
+
+       down_write(&pblk_lock);
+       pblk_gc_exit(pblk);
+       pblk_tear_down(pblk);
+       pblk_free(pblk);
+       up_write(&pblk_lock);
+}
+
+/*
+ * Target capacity callback (tt_pblk.capacity). @private is the struct pblk
+ * returned by pblk_init().
+ *
+ * Returns pblk->capacity (provisioned sectors, see pblk_set_provision())
+ * multiplied by NR_PHY_IN_LOG.
+ * NOTE(review): NR_PHY_IN_LOG is presumably the number of logical sectors per
+ * device sector - confirm against its definition.
+ */
+static sector_t pblk_capacity(void *private)
+{
+       struct pblk *pblk = private;
+
+       return pblk->capacity * NR_PHY_IN_LOG;
+}
+
+/*
+ * Target init callback (tt_pblk.init): create a pblk instance on top of @dev
+ * and configure the queue of @tdisk for it.
+ *
+ * @dev:   target device; devices that report a device-side L2P table are
+ *         rejected
+ * @tdisk: disk that exposes the target; its queue inherits block size and
+ *         max_hw_sectors from the device queue and gets discard enabled
+ * @flags: creation flags; with NVM_TARGET_FACTORY a new UUID is set up and
+ *         L2P recovery is skipped (see pblk_lines_configure())
+ *
+ * Initialization order: LUNs, lines, core, L2P, line configuration, writer
+ * thread, GC. On failure the steps completed so far are undone in reverse
+ * through the labels at the end of the function.
+ *
+ * Returns the new struct pblk on success, or an ERR_PTR() on failure.
+ */
+static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
+                      int flags)
+{
+       struct nvm_geo *geo = &dev->geo;
+       struct request_queue *bqueue = dev->q;
+       struct request_queue *tqueue = tdisk->queue;
+       struct pblk *pblk;
+       int ret;
+
+       if (dev->identity.dom & NVM_RSP_L2P) {
+               pr_err("pblk: device-side L2P table not supported. (%x)\n",
+                                                       dev->identity.dom);
+               return ERR_PTR(-EINVAL);
+       }
+
+       pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
+       if (!pblk)
+               return ERR_PTR(-ENOMEM);
+
+       pblk->dev = dev;
+       pblk->disk = tdisk;
+
+       spin_lock_init(&pblk->trans_lock);
+       spin_lock_init(&pblk->lock);
+
+       if (flags & NVM_TARGET_FACTORY)
+               pblk_setup_uuid(pblk);
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_set(&pblk->inflight_writes, 0);
+       atomic_long_set(&pblk->padded_writes, 0);
+       atomic_long_set(&pblk->padded_wb, 0);
+       atomic_long_set(&pblk->nr_flush, 0);
+       atomic_long_set(&pblk->req_writes, 0);
+       atomic_long_set(&pblk->sub_writes, 0);
+       atomic_long_set(&pblk->sync_writes, 0);
+       atomic_long_set(&pblk->compl_writes, 0);
+       atomic_long_set(&pblk->inflight_reads, 0);
+       atomic_long_set(&pblk->sync_reads, 0);
+       atomic_long_set(&pblk->recov_writes, 0);
+       atomic_long_set(&pblk->recov_gc_writes, 0);
+#endif
+
+       atomic_long_set(&pblk->read_failed, 0);
+       atomic_long_set(&pblk->read_empty, 0);
+       atomic_long_set(&pblk->read_high_ecc, 0);
+       atomic_long_set(&pblk->read_failed_gc, 0);
+       atomic_long_set(&pblk->write_failed, 0);
+       atomic_long_set(&pblk->erase_failed, 0);
+
+       ret = pblk_luns_init(pblk, dev->luns);
+       if (ret) {
+               pr_err("pblk: could not initialize luns\n");
+               goto fail;
+       }
+
+       ret = pblk_lines_init(pblk);
+       if (ret) {
+               pr_err("pblk: could not initialize lines\n");
+               goto fail_free_luns;
+       }
+
+       ret = pblk_core_init(pblk);
+       if (ret) {
+               pr_err("pblk: could not initialize core\n");
+               goto fail_free_line_meta;
+       }
+
+       ret = pblk_l2p_init(pblk);
+       if (ret) {
+               pr_err("pblk: could not initialize maps\n");
+               goto fail_free_core;
+       }
+
+       ret = pblk_lines_configure(pblk, flags);
+       if (ret) {
+               pr_err("pblk: could not configure lines\n");
+               goto fail_free_l2p;
+       }
+
+       ret = pblk_writer_init(pblk);
+       if (ret) {
+               pr_err("pblk: could not initialize write thread\n");
+               goto fail_free_lines;
+       }
+
+       ret = pblk_gc_init(pblk);
+       if (ret) {
+               pr_err("pblk: could not initialize gc\n");
+               goto fail_stop_writer;
+       }
+
+       /* inherit the size from the underlying device */
+       blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue));
+       blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue));
+
+       blk_queue_write_cache(tqueue, true, false);
+
+       tqueue->limits.discard_granularity = geo->pgs_per_blk * geo->pfpg_size;
+       tqueue->limits.discard_alignment = 0;
+       blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
+       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, tqueue);
+
+       pr_info("pblk init: luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
+                       geo->nr_luns, pblk->l_mg.nr_lines,
+                       (unsigned long long)pblk->rl.nr_secs,
+                       pblk->rwb.nr_entries);
+
+       /* The writer thread was only created so far; start it now */
+       wake_up_process(pblk->writer_ts);
+       return pblk;
+
+fail_stop_writer:
+       pblk_writer_stop(pblk);
+fail_free_lines:
+       pblk_lines_free(pblk);
+fail_free_l2p:
+       pblk_l2p_free(pblk);
+fail_free_core:
+       pblk_core_free(pblk);
+fail_free_line_meta:
+       pblk_line_meta_free(pblk);
+fail_free_luns:
+       pblk_luns_free(pblk);
+fail:
+       kfree(pblk);
+       return ERR_PTR(ret);
+}
+
+/*
+ * physical block device target
+ *
+ * Target type descriptor registered with the lightnvm core at module load. It
+ * names the target "pblk" (version 1.0.0) and wires up the request, capacity,
+ * init/exit and sysfs callbacks.
+ */
+static struct nvm_tgt_type tt_pblk = {
+       .name           = "pblk",
+       .version        = {1, 0, 0},
+
+       .make_rq        = pblk_make_rq,
+       .capacity       = pblk_capacity,
+
+       .init           = pblk_init,
+       .exit           = pblk_exit,
+
+       .sysfs_init     = pblk_sysfs_init,
+       .sysfs_exit     = pblk_sysfs_exit,
+};
+
+/* Module entry point: register the pblk target type; returns its result. */
+static int __init pblk_module_init(void)
+{
+       return nvm_register_tgt_type(&tt_pblk);
+}
+
+/* Module exit point: unregister the pblk target type. */
+static void pblk_module_exit(void)
+{
+       nvm_unregister_tgt_type(&tt_pblk);
+}
+
+module_init(pblk_module_init);
+module_exit(pblk_module_exit);
+MODULE_AUTHOR("Javier Gonzalez <javier@cnexlabs.com>");
+MODULE_AUTHOR("Matias Bjorling <matias@cnexlabs.com>");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Physical Block-Device for Open-Channel SSDs");
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c

new file mode 100644 (file)

index 0000000..3f8bab4
--- /dev/null
+++ b/drivers/lightnvm/pblk-map.c
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *                  Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * pblk-map.c - pblk's lba-ppa mapping strategy
+ *
+ */
+
+#include "pblk.h"
+
+/*
+ * Map one write unit (pblk->min_write_pgs sectors) onto the current data line.
+ *
+ * @sentry:     position in the write buffer of the first entry to map
+ * @ppa_list:   filled with the device address of every mapped sector
+ * @lun_bitmap: passed through to pblk_down_rq()
+ * @meta_list:  per-sector metadata; receives the lba of every sector
+ * @valid_secs: number of leading sectors that carry data from the write
+ *              buffer; the remaining ones are padding
+ *
+ * For data sectors a reference on the line is taken, the buffer entry's write
+ * context gets the assigned ppa and the lba is recorded both in @meta_list and
+ * in the line's emeta lba list. Padding sectors are recorded as ADDR_EMPTY and
+ * invalidated on the line.
+ *
+ * If the line became full it is replaced; when no replacement line is
+ * available the function returns without calling pblk_down_rq().
+ */
+static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
+                              struct ppa_addr *ppa_list,
+                              unsigned long *lun_bitmap,
+                              struct pblk_sec_meta *meta_list,
+                              unsigned int valid_secs)
+{
+       struct pblk_line *line = pblk_line_get_data(pblk);
+       struct line_emeta *emeta = line->emeta;
+       struct pblk_w_ctx *w_ctx;
+       __le64 *lba_list = pblk_line_emeta_to_lbas(emeta);
+       u64 paddr;
+       int nr_secs = pblk->min_write_pgs;
+       int i;
+
+       /* First address of the nr_secs sectors reserved on the line */
+       paddr = pblk_alloc_page(pblk, line, nr_secs);
+
+       for (i = 0; i < nr_secs; i++, paddr++) {
+               /* ppa to be sent to the device */
+               ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
+
+               /* Write context for target bio completion on write buffer. Note
+                * that the write buffer is protected by the sync backpointer,
+                * and a single writer thread have access to each specific entry
+                * at a time. Thus, it is safe to modify the context for the
+                * entry we are setting up for submission without taking any
+                * lock or memory barrier.
+                */
+               if (i < valid_secs) {
+                       kref_get(&line->ref);
+                       w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i);
+                       w_ctx->ppa = ppa_list[i];
+                       meta_list[i].lba = cpu_to_le64(w_ctx->lba);
+                       lba_list[paddr] = cpu_to_le64(w_ctx->lba);
+                       le64_add_cpu(&line->emeta->nr_valid_lbas, 1);
+               } else {
+                       /* Padding: no buffer entry backs this sector */
+                       meta_list[i].lba = cpu_to_le64(ADDR_EMPTY);
+                       lba_list[paddr] = cpu_to_le64(ADDR_EMPTY);
+                       pblk_map_pad_invalidate(pblk, line, paddr);
+               }
+       }
+
+       if (pblk_line_is_full(line)) {
+               line = pblk_line_replace_data(pblk);
+               if (!line)
+                       return;
+       }
+
+       pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap);
+}
+
+/*
+ * Map the sectors of @rqd from offset @off onwards, one write unit
+ * (pblk->min_write_pgs sectors) at a time.
+ *
+ * A unit that would reach past @valid_secs is mapped with
+ * (valid_secs % min_write_pgs) data sectors; every other unit is mapped as
+ * fully valid.
+ */
+void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
+                unsigned long *lun_bitmap, unsigned int valid_secs,
+                unsigned int off)
+{
+       struct pblk_sec_meta *meta_list = rqd->meta_list;
+       int min = pblk->min_write_pgs;
+       unsigned int map_secs;
+       int i = off;
+
+       while (i < rqd->nr_ppas) {
+               if (i + min > valid_secs)
+                       map_secs = valid_secs % min;
+               else
+                       map_secs = min;
+
+               pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
+                                       lun_bitmap, &meta_list[i], map_secs);
+               i += min;
+       }
+}
+
+/*
+ * Map the sectors of @rqd like pblk_map_rq() and, along the way, pick one
+ * block of the next data line to erase.
+ *
+ * After each write unit is mapped, the LUN it landed on is looked up in the
+ * next line's erase_bitmap. For the first LUN not yet marked there, the erase
+ * semaphore is tried; if it is obtained, the LUN is marked, @erase_ppa is set
+ * to that LUN with the next line's id as block, and the remaining units are
+ * mapped through pblk_map_rq().
+ *
+ * If no erase address was chosen during mapping, the first LUN still unmarked
+ * in the erase bitmap (if any) is selected instead; this path does not take
+ * the erase semaphore.
+ *
+ * only if erase_ppa is set, acquire erase semaphore
+ */
+void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
+                      unsigned int sentry, unsigned long *lun_bitmap,
+                      unsigned int valid_secs, struct ppa_addr *erase_ppa)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line *e_line = pblk_line_get_data_next(pblk);
+       struct pblk_sec_meta *meta_list = rqd->meta_list;
+       unsigned int map_secs;
+       int min = pblk->min_write_pgs;
+       int i, erase_lun;
+
+       for (i = 0; i < rqd->nr_ppas; i += min) {
+               map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
+               pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
+                                       lun_bitmap, &meta_list[i], map_secs);
+
+               /* Index of the LUN within the instance (channel-striped) */
+               erase_lun = rqd->ppa_list[i].g.lun * geo->nr_chnls +
+                                                       rqd->ppa_list[i].g.ch;
+
+               if (!test_bit(erase_lun, e_line->erase_bitmap)) {
+                       /* Semaphore busy: keep mapping, try a later unit */
+                       if (down_trylock(&pblk->erase_sem))
+                               continue;
+
+                       set_bit(erase_lun, e_line->erase_bitmap);
+                       e_line->left_eblks--;
+                       *erase_ppa = rqd->ppa_list[i];
+                       erase_ppa->g.blk = e_line->id;
+
+                       /* Avoid evaluating e_line->left_eblks */
+                       return pblk_map_rq(pblk, rqd, sentry, lun_bitmap,
+                                                       valid_secs, i + min);
+               }
+       }
+
+       /* Erase blocks that are bad in this line but might not be in next */
+       if (unlikely(ppa_empty(*erase_ppa))) {
+               struct pblk_line_meta *lm = &pblk->lm;
+
+               i = find_first_zero_bit(e_line->erase_bitmap, lm->blk_per_line);
+               if (i == lm->blk_per_line)
+                       return;
+
+               set_bit(i, e_line->erase_bitmap);
+               e_line->left_eblks--;
+               *erase_ppa = pblk->luns[i].bppa; /* set ch and lun */
+               erase_ppa->g.blk = e_line->id;
+       }
+}
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c

new file mode 100644 (file)

index 0000000..045384d
--- /dev/null
+++ b/drivers/lightnvm/pblk-rb.c
@@ -0,0 +1,852 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *
+ * Based upon the circular ringbuffer.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * pblk-rb.c - pblk's write buffer
+ */
+
+#include <linux/circ_buf.h>
+
+#include "pblk.h"
+
+static DECLARE_RWSEM(pblk_rb_lock);
+
+/*
+ * Free every page set on rb->pages together with its descriptor. Takes
+ * pblk_rb_lock for writing, so it must not be called with that lock held.
+ */
+void pblk_rb_data_free(struct pblk_rb *rb)
+{
+       struct pblk_rb_pages *set, *next;
+
+       down_write(&pblk_rb_lock);
+       list_for_each_entry_safe(set, next, &rb->pages, list) {
+               unsigned long addr = (unsigned long)page_address(set->pages);
+
+               free_pages(addr, set->order);
+               list_del(&set->list);
+               kfree(set);
+       }
+       up_write(&pblk_rb_lock);
+}
+
+/*
+ * Initialize ring buffer. The data and metadata buffers must be previously
+ * allocated and their size must be a power of two
+ * (Documentation/circular-buffers.txt)
+ *
+ * @rb:            ring buffer to initialize (embedded in struct pblk as rwb)
+ * @rb_entry_base: caller-allocated array of (1 << @power_size) entries
+ * @power_size:    log2 of the number of entries
+ * @power_seg_sz:  log2 of the size in bytes of one entry's data segment
+ *
+ * The data area is allocated as page sets of at most order MAX_ORDER - 1 and
+ * every entry is pointed at its segment inside a set. The rate limiter is
+ * initialized with the number of entries at the end.
+ *
+ * Returns 0 on success, or -ENOMEM after freeing every page set allocated so
+ * far.
+ */
+int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
+                unsigned int power_size, unsigned int power_seg_sz)
+{
+       struct pblk *pblk = container_of(rb, struct pblk, rwb);
+       unsigned int init_entry = 0;
+       unsigned int alloc_order = power_size;
+       unsigned int max_order = MAX_ORDER - 1;
+       unsigned int order, iter;
+
+       down_write(&pblk_rb_lock);
+       rb->entries = rb_entry_base;
+       rb->seg_size = (1 << power_seg_sz);
+       rb->nr_entries = (1 << power_size);
+       rb->mem = rb->subm = rb->sync = rb->l2p_update = 0;
+       rb->sync_point = EMPTY_ENTRY;
+
+       spin_lock_init(&rb->w_lock);
+       spin_lock_init(&rb->s_lock);
+
+       INIT_LIST_HEAD(&rb->pages);
+
+       /* Split the buffer into several page sets when a single allocation
+        * would exceed the largest order handled here
+        */
+       if (alloc_order >= max_order) {
+               order = max_order;
+               iter = (1 << (alloc_order - max_order));
+       } else {
+               order = alloc_order;
+               iter = 1;
+       }
+
+       do {
+               struct pblk_rb_entry *entry;
+               struct pblk_rb_pages *page_set;
+               void *kaddr;
+               unsigned long set_size;
+               int i;
+
+               page_set = kmalloc(sizeof(struct pblk_rb_pages), GFP_KERNEL);
+               if (!page_set)
+                       goto fail_free_data;
+
+               page_set->order = order;
+               page_set->pages = alloc_pages(GFP_KERNEL, order);
+               if (!page_set->pages) {
+                       kfree(page_set);
+                       goto fail_free_data;
+               }
+               kaddr = page_address(page_set->pages);
+
+               /* Every entry, including the first of the set, gets its data
+                * segment, cacheline address, flags and an empty bio list
+                */
+               set_size = (1 << order);
+               for (i = 0; i < set_size; i++) {
+                       entry = &rb->entries[init_entry];
+                       entry->cacheline = pblk_cacheline_to_addr(init_entry++);
+                       entry->data = kaddr + (i * rb->seg_size);
+                       entry->w_ctx.flags = PBLK_WRITABLE_ENTRY;
+                       bio_list_init(&entry->w_ctx.bios);
+               }
+
+               list_add_tail(&page_set->list, &rb->pages);
+               iter--;
+       } while (iter > 0);
+       up_write(&pblk_rb_lock);
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_set(&rb->inflight_sync_point, 0);
+#endif
+
+       /*
+        * Initialize rate-limiter, which controls access to the write buffer
+        * but user and GC I/O
+        */
+       pblk_rl_init(&pblk->rl, rb->nr_entries);
+
+       return 0;
+
+fail_free_data:
+       /* pblk_rb_data_free() takes pblk_rb_lock itself; calling it with the
+        * lock still held would deadlock
+        */
+       up_write(&pblk_rb_lock);
+       pblk_rb_data_free(rb);
+       return -ENOMEM;
+}
+
+/*
+ * pblk_rb_calculate_size -- calculate the size of the write buffer
+ *
+ * Returns @nr_entries rounded up to a power of two, with a minimum of 128.
+ */
+unsigned int pblk_rb_calculate_size(unsigned int nr_entries)
+{
+       int order = get_count_order(nr_entries);
+
+       /* Alloc a write buffer that can at least fit 128 entries */
+       if (order < 7)
+               order = 7;
+
+       return (1 << order);
+}
+
+/* Return the entry array handed to pblk_rb_init() (rb->entries). */
+void *pblk_rb_entries_ref(struct pblk_rb *rb)
+{
+       return rb->entries;
+}
+
+/*
+ * Return a write context to the writable state. Spins until the context is
+ * flagged as submitted, then publishes PBLK_WRITABLE_ENTRY with release
+ * semantics and empties the stored ppa.
+ */
+static void clean_wctx(struct pblk_w_ctx *w_ctx)
+{
+       int flags;
+
+       /* Busy-wait: the entry must have been submitted before it is reused */
+       do {
+               flags = READ_ONCE(w_ctx->flags);
+       } while (!(flags & PBLK_SUBMITTED_ENTRY));
+
+       /* Release flags on context. Protect from writes and reads */
+       smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY);
+       pblk_ppa_set_empty(&w_ctx->ppa);
+}
+
+/*
+ * Thin wrappers over CIRC_CNT()/CIRC_SPACE() from <linux/circ_buf.h>. The rb
+ * argument of pblk_rb_ring_space() is not used by the expansion.
+ */
+#define pblk_rb_ring_count(head, tail, size) CIRC_CNT(head, tail, size)
+#define pblk_rb_ring_space(rb, head, tail, size) \
+                                       (CIRC_SPACE(head, tail, size))
+
+/*
+ * Free space in the buffer, measured from the write pointer (mem) to the sync
+ * back pointer, i.e. entries not yet synchronized to the media still count as
+ * occupied.
+ */
+static unsigned int pblk_rb_space(struct pblk_rb *rb)
+{
+       unsigned int head = READ_ONCE(rb->mem);
+       unsigned int tail = READ_ONCE(rb->sync);
+       unsigned int free_entries;
+
+       free_entries = pblk_rb_ring_space(rb, head, tail, rb->nr_entries);
+
+       return free_entries;
+}
+
+/*
+ * Number of entries written to the buffer but not yet submitted, i.e. the
+ * distance from the submission pointer (subm) to the write pointer (mem).
+ */
+unsigned int pblk_rb_read_count(struct pblk_rb *rb)
+{
+       unsigned int head = READ_ONCE(rb->mem);
+       unsigned int tail = READ_ONCE(rb->subm);
+       unsigned int pending;
+
+       pending = pblk_rb_ring_count(head, tail, rb->nr_entries);
+
+       return pending;
+}
+
+/*
+ * Advance the submission pointer by @nr_entries, wrapping at the buffer size
+ * (nr_entries of the ring is a power of two, hence the mask).
+ *
+ * Returns the submission pointer as it was before the update, i.e. the
+ * position of the first entry just committed.
+ */
+unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries)
+{
+       unsigned int subm;
+
+       subm = READ_ONCE(rb->subm);
+       /* Commit read means updating submission pointer */
+       smp_store_release(&rb->subm,
+                               (subm + nr_entries) & (rb->nr_entries - 1));
+
+       return subm;
+}
+
+/*
+ * Move the L2P mapping of @to_update entries, starting at *@l2p_upd, from the
+ * write buffer to the device.
+ *
+ * For each entry the map is updated with the entry's lba, ppa and cacheline,
+ * the reference held on the line the ppa belongs to is dropped, and the write
+ * context is reset through clean_wctx(). *@l2p_upd is advanced (with wrap)
+ * past every processed entry.
+ *
+ * Always returns 0.
+ */
+static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd,
+                               unsigned int to_update)
+{
+       struct pblk *pblk = container_of(rb, struct pblk, rwb);
+       struct pblk_line *line;
+       struct pblk_rb_entry *entry;
+       struct pblk_w_ctx *w_ctx;
+       unsigned int i;
+
+       for (i = 0; i < to_update; i++) {
+               entry = &rb->entries[*l2p_upd];
+               w_ctx = &entry->w_ctx;
+
+               pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa,
+                                                       entry->cacheline);
+
+               /* Look up the line before clean_wctx() empties w_ctx->ppa */
+               line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)];
+               kref_put(&line->ref, pblk_line_put);
+               clean_wctx(w_ctx);
+               *l2p_upd = (*l2p_upd + 1) & (rb->nr_entries - 1);
+       }
+
+       return 0;
+}
+
+/*
+ * Make room for @nr_entries new entries with respect to the l2p_update
+ * pointer. Entries that are about to be overwritten get their L2P mapping
+ * switched from the write buffer cacheline to the physical address; lookups
+ * resolve to the device from that moment on.
+ *
+ * Must be called with rb->w_lock held. Returns the result of
+ * __pblk_rb_update_l2p(), or 0 when nothing had to be updated.
+ */
+static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries,
+                             unsigned int mem, unsigned int sync)
+{
+       unsigned int free_entries;
+
+       lockdep_assert_held(&rb->w_lock);
+
+       /* Update l2p only as buffer entries are being overwritten */
+       free_entries = pblk_rb_ring_space(rb, mem, rb->l2p_update,
+                                                       rb->nr_entries);
+       if (free_entries > nr_entries)
+               return 0;
+
+       /* l2p_update used exclusively under rb->w_lock */
+       return __pblk_rb_update_l2p(rb, &rb->l2p_update,
+                                               nr_entries - free_entries);
+}
+
+/*
+ * Update the l2p entry for all sectors stored on the write buffer. This means
+ * that all future lookups to the l2p table will point to a device address, not
+ * to the cacheline in the write buffer.
+ *
+ * The entries processed are those between the l2p_update pointer and the sync
+ * pointer. Takes rb->w_lock for the duration of the update.
+ */
+void pblk_rb_sync_l2p(struct pblk_rb *rb)
+{
+       unsigned int sync;
+       unsigned int to_update;
+
+       spin_lock(&rb->w_lock);
+
+       /* Protect from reads and writes */
+       sync = smp_load_acquire(&rb->sync);
+
+       to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries);
+       __pblk_rb_update_l2p(rb, &rb->l2p_update, to_update);
+
+       spin_unlock(&rb->w_lock);
+}
+
+/*
+ * Fill one ring buffer entry: copy a single segment (rb->seg_size bytes) from
+ * @data into the entry's data area and store the lba and ppa of @w_ctx in the
+ * entry's write context.
+ *
+ * The entry's flags are not touched here; the callers publish them after the
+ * mapping has been updated. Space in the buffer is not checked either.
+ */
+static void __pblk_rb_write_entry(struct pblk_rb *rb, void *data,
+                                 struct pblk_w_ctx w_ctx,
+                                 struct pblk_rb_entry *entry)
+{
+       memcpy(entry->data, data, rb->seg_size);
+
+       entry->w_ctx.lba = w_ctx.lba;
+       entry->w_ctx.ppa = w_ctx.ppa;
+}
+
+/*
+ * Write one user sector into the ring buffer entry at @ring_pos.
+ *
+ * The data and write context are copied into the entry, the L2P map is made
+ * to point at the entry's cacheline for w_ctx.lba, and finally the entry's
+ * flags are published as (w_ctx.flags | PBLK_WRITTEN_DATA) with release
+ * semantics.
+ *
+ * The caller must guarantee that the entry is writable; this is only verified
+ * when CONFIG_NVM_DEBUG is set.
+ */
+void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
+                             struct pblk_w_ctx w_ctx, unsigned int ring_pos)
+{
+       struct pblk *pblk = container_of(rb, struct pblk, rwb);
+       struct pblk_rb_entry *entry;
+       int flags;
+
+       entry = &rb->entries[ring_pos];
+       flags = READ_ONCE(entry->w_ctx.flags);
+#ifdef CONFIG_NVM_DEBUG
+       /* Caller must guarantee that the entry is free */
+       BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
+#endif
+
+       __pblk_rb_write_entry(rb, data, w_ctx, entry);
+
+       pblk_update_map_cache(pblk, w_ctx.lba, entry->cacheline);
+       flags = w_ctx.flags | PBLK_WRITTEN_DATA;
+
+       /* Release flags on write context. Protect from writes */
+       smp_store_release(&entry->w_ctx.flags, flags);
+}
+
+/*
+ * Write one sector moved by garbage collection into the ring buffer entry at
+ * @ring_pos.
+ *
+ * Works like pblk_rb_write_entry_user(), except that the map update goes
+ * through pblk_update_map_gc() with @gc_line. When that update reports
+ * failure (returns 0), the entry's lba is set to ADDR_EMPTY.
+ * NOTE(review): presumably the update is refused when the lba no longer maps
+ * to @gc_line - confirm in pblk_update_map_gc().
+ *
+ * The caller must guarantee that the entry is writable; this is only verified
+ * when CONFIG_NVM_DEBUG is set.
+ */
+void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
+                           struct pblk_w_ctx w_ctx, struct pblk_line *gc_line,
+                           unsigned int ring_pos)
+{
+       struct pblk *pblk = container_of(rb, struct pblk, rwb);
+       struct pblk_rb_entry *entry;
+       int flags;
+
+       entry = &rb->entries[ring_pos];
+       flags = READ_ONCE(entry->w_ctx.flags);
+#ifdef CONFIG_NVM_DEBUG
+       /* Caller must guarantee that the entry is free */
+       BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
+#endif
+
+       __pblk_rb_write_entry(rb, data, w_ctx, entry);
+
+       if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, gc_line))
+               entry->w_ctx.lba = ADDR_EMPTY;
+
+       flags = w_ctx.flags | PBLK_WRITTEN_DATA;
+
+       /* Release flags on write context. Protect from writes */
+       smp_store_release(&entry->w_ctx.flags, flags);
+}
+
+static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio,
+                                 unsigned int pos)
+{
+       struct pblk_rb_entry *entry;
+       unsigned int subm, sync_point;
+       int flags;
+
+       subm = READ_ONCE(rb->subm);
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_inc(&rb->inflight_sync_point);
+#endif
+
+       if (pos == subm)
+               return 0;
+
+       sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
+       entry = &rb->entries[sync_point];
+
+       flags = READ_ONCE(entry->w_ctx.flags);
+       flags |= PBLK_FLUSH_ENTRY;
+
+       /* Release flags on context. Protect from writes */
+       smp_store_release(&entry->w_ctx.flags, flags);
+
+       /* Protect syncs */
+       smp_store_release(&rb->sync_point, sync_point);
+
+       spin_lock_irq(&rb->s_lock);
+       bio_list_add(&entry->w_ctx.bios, bio);
+       spin_unlock_irq(&rb->s_lock);
+
+       return 1;
+}
+
+/*
+ * Check whether @nr_entries can be written at the current write pointer.
+ *
+ * Returns 0 if the ring does not have enough space or if pblk_rb_update_l2p()
+ * reports a non-zero result; otherwise stores the current write pointer in
+ * @pos and returns 1. The write pointer itself is not advanced here.
+ */
+static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
+                              unsigned int *pos)
+{
+       unsigned int sync_pos = READ_ONCE(rb->sync);
+       unsigned int mem_pos = READ_ONCE(rb->mem);
+       unsigned int space;
+
+       space = pblk_rb_ring_space(rb, mem_pos, sync_pos, rb->nr_entries);
+
+       if (space < nr_entries ||
+           pblk_rb_update_l2p(rb, nr_entries, mem_pos, sync_pos))
+               return 0;
+
+       *pos = mem_pos;
+
+       return 1;
+}
+
+/*
+ * Reserve @nr_entries in the ring buffer. On success (return 1) @pos holds the
+ * first reserved position and the write pointer has been moved past the
+ * reservation; on failure (return 0) nothing is changed.
+ */
+static int pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
+                            unsigned int *pos)
+{
+       unsigned int next_mem;
+
+       if (!__pblk_rb_may_write(rb, nr_entries, pos))
+               return 0;
+
+       next_mem = (*pos + nr_entries) & (rb->nr_entries - 1);
+
+       /* Protect from read count */
+       smp_store_release(&rb->mem, next_mem);
+
+       return 1;
+}
+
+static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries,
+                                  unsigned int *pos, struct bio *bio,
+                                  int *io_ret)
+{
+       unsigned int mem;
+
+       if (!__pblk_rb_may_write(rb, nr_entries, pos))
+               return 0;
+
+       mem = (*pos + nr_entries) & (rb->nr_entries - 1);
+       *io_ret = NVM_IO_DONE;
+
+       if (bio->bi_opf & REQ_PREFLUSH) {
+               struct pblk *pblk = container_of(rb, struct pblk, rwb);
+
+#ifdef CONFIG_NVM_DEBUG
+               atomic_long_inc(&pblk->nr_flush);
+#endif
+               if (pblk_rb_sync_point_set(&pblk->rwb, bio, mem))
+                       *io_ret = NVM_IO_OK;
+       }
+
+       /* Protect from read count */
+       smp_store_release(&rb->mem, mem);
+       return 1;
+}
+
+/*
+ * Admission check for user writes, done atomically under the write lock:
+ * (i) the rate-limiter must accept @nr_entries user entries, and (ii) the
+ * write buffer must have room for them.
+ *
+ * Returns NVM_IO_REQUEUE if either check fails. Otherwise the entries are
+ * accounted in the rate-limiter and the I/O status chosen by
+ * pblk_rb_may_write_flush() is returned.
+ */
+int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio,
+                          unsigned int nr_entries, unsigned int *pos)
+{
+       struct pblk *pblk = container_of(rb, struct pblk, rwb);
+       int io_status;
+       int ret = NVM_IO_REQUEUE;
+
+       spin_lock(&rb->w_lock);
+
+       if (pblk_rl_user_may_insert(&pblk->rl, nr_entries) &&
+           pblk_rb_may_write_flush(rb, nr_entries, pos, bio, &io_status)) {
+               pblk_rl_user_in(&pblk->rl, nr_entries);
+               ret = io_status;
+       }
+
+       spin_unlock(&rb->w_lock);
+
+       return ret;
+}
+
+/*
+ * Admission check for GC writes, done atomically under the write lock: the
+ * rate-limiter must accept @nr_entries GC entries and the write buffer must
+ * have room for them.
+ *
+ * Returns 1 when the entries were reserved and accounted, 0 otherwise.
+ */
+int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries,
+                        unsigned int *pos)
+{
+       struct pblk *pblk = container_of(rb, struct pblk, rwb);
+       int admitted = 0;
+
+       spin_lock(&rb->w_lock);
+
+       if (pblk_rl_gc_may_insert(&pblk->rl, nr_entries) &&
+           pblk_rb_may_write(rb, nr_entries, pos)) {
+               pblk_rl_gc_in(&pblk->rl, nr_entries);
+               admitted = 1;
+       }
+
+       spin_unlock(&rb->w_lock);
+
+       return admitted;
+}
+
+/*
+ * Add the data page of every entry on @list to @bio, removing each entry from
+ * the list once its page has been added. Stops at the first failure and
+ * returns the number of entries that were added.
+ *
+ * The caller of this function must ensure that the backpointer will not
+ * overwrite the entries passed on the list.
+ */
+unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
+                                     struct list_head *list,
+                                     unsigned int max)
+{
+       struct pblk_rb_entry *cur, *next;
+       unsigned int nr_added = 0;
+
+       list_for_each_entry_safe(cur, next, list, index) {
+               struct page *page;
+
+               /* The guard trips only once more than @max were added */
+               if (nr_added > max) {
+                       pr_err("pblk: too many entries on list\n");
+                       break;
+               }
+
+               page = virt_to_page(cur->data);
+               if (!page) {
+                       pr_err("pblk: could not allocate write bio page\n");
+                       break;
+               }
+
+               if (bio_add_page(bio, page, rb->seg_size, 0) != rb->seg_size) {
+                       pr_err("pblk: could not add page to write bio\n");
+                       break;
+               }
+
+               list_del(&cur->index);
+               nr_added++;
+       }
+
+       return nr_added;
+}
+
+/*
+ * Read available entries on rb and add them to the given bio. To avoid a memory
+ * copy, the pages backing the write buffer entries are added to the bio
+ * directly.
+ *
+ * This function is used by the write thread to form the write bio that will
+ * persist data on the write buffer to the media.
+ *
+ * @pos is the first entry to read, @nr_entries the number of entries the
+ * request is sized for and @count the number of entries actually available.
+ * When @count is smaller than @nr_entries only @count entries are read and the
+ * difference is recorded in @c_ctx as padding.
+ *
+ * Returns the number of entries read, or 0 if a page could not be added to
+ * the bio.
+ */
+unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio,
+                                struct pblk_c_ctx *c_ctx,
+                                unsigned int pos,
+                                unsigned int nr_entries,
+                                unsigned int count)
+{
+       struct pblk *pblk = container_of(rb, struct pblk, rwb);
+       struct pblk_rb_entry *entry;
+       struct page *page;
+       unsigned int pad = 0, read = 0, to_read = nr_entries;
+       unsigned int user_io = 0, gc_io = 0;
+       unsigned int i;
+       int flags;
+       int ret;
+
+       /* Not enough entries to fill the request: the remainder is padding */
+       if (count < nr_entries) {
+               pad = nr_entries - count;
+               to_read = count;
+       }
+
+       /* Record where this request starts and how it is composed */
+       c_ctx->sentry = pos;
+       c_ctx->nr_valid = to_read;
+       c_ctx->nr_padded = pad;
+
+       for (i = 0; i < to_read; i++) {
+               entry = &rb->entries[pos];
+
+               /* A write has been allowed into the buffer, but data is still
+                * being copied to it. It is ok to busy wait.
+                */
+try:
+               flags = READ_ONCE(entry->w_ctx.flags);
+               if (!(flags & PBLK_WRITTEN_DATA))
+                       goto try;
+
+               /* Per-type counts are handed to the rate-limiter below */
+               if (flags & PBLK_IOTYPE_USER)
+                       user_io++;
+               else if (flags & PBLK_IOTYPE_GC)
+                       gc_io++;
+               else
+                       WARN(1, "pblk: unknown IO type\n");
+
+               /* NOTE(review): on both error exits below, entries handled in
+                * earlier iterations stay marked PBLK_SUBMITTED_ENTRY, 0 is
+                * returned and pblk_rl_out() is not called.
+                */
+               page = virt_to_page(entry->data);
+               if (!page) {
+                       pr_err("pblk: could not allocate write bio page\n");
+                       flags &= ~PBLK_WRITTEN_DATA;
+                       flags |= PBLK_SUBMITTED_ENTRY;
+                       /* Release flags on context. Protect from writes */
+                       smp_store_release(&entry->w_ctx.flags, flags);
+                       goto out;
+               }
+
+               ret = bio_add_page(bio, page, rb->seg_size, 0);
+               if (ret != rb->seg_size) {
+                       pr_err("pblk: could not add page to write bio\n");
+                       flags &= ~PBLK_WRITTEN_DATA;
+                       flags |= PBLK_SUBMITTED_ENTRY;
+                       /* Release flags on context. Protect from writes */
+                       smp_store_release(&entry->w_ctx.flags, flags);
+                       goto out;
+               }
+
+               /* This entry was flagged by pblk_rb_sync_point_set() */
+               if (flags & PBLK_FLUSH_ENTRY) {
+                       unsigned int sync_point;
+
+                       /* Clear the global sync point only if it is this
+                        * entry; a different value is left in place
+                        */
+                       sync_point = READ_ONCE(rb->sync_point);
+                       if (sync_point == pos) {
+                               /* Protect syncs */
+                               smp_store_release(&rb->sync_point, EMPTY_ENTRY);
+                       }
+
+                       flags &= ~PBLK_FLUSH_ENTRY;
+#ifdef CONFIG_NVM_DEBUG
+                       atomic_dec(&rb->inflight_sync_point);
+#endif
+               }
+
+               /* The entry moves from "written" to "submitted" */
+               flags &= ~PBLK_WRITTEN_DATA;
+               flags |= PBLK_SUBMITTED_ENTRY;
+
+               /* Release flags on context. Protect from writes */
+               smp_store_release(&entry->w_ctx.flags, flags);
+
+               pos = (pos + 1) & (rb->nr_entries - 1);
+       }
+
+       /* Only a fully processed request reports a non-zero count */
+       read = to_read;
+       pblk_rl_out(&pblk->rl, user_io, gc_io);
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_add(pad, &((struct pblk *)
+                       (container_of(rb, struct pblk, rwb)))->padded_writes);
+#endif
+out:
+       return read;
+}
+
+/*
+ * Copy to bio only if the lba matches the one on the given cache entry.
+ * Otherwise, it means that the entry has been overwritten, and the bio should
+ * be directed to disk.
+ */
+int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
+                       u64 pos, int bio_iter)
+{
+       struct pblk_rb_entry *entry;
+       struct pblk_w_ctx *w_ctx;
+       void *data;
+       int flags;
+       int ret = 1;
+
+       spin_lock(&rb->w_lock);
+
+#ifdef CONFIG_NVM_DEBUG
+       /* Caller must ensure that the access will not cause an overflow */
+       BUG_ON(pos >= rb->nr_entries);
+#endif
+       entry = &rb->entries[pos];
+       w_ctx = &entry->w_ctx;
+       flags = READ_ONCE(w_ctx->flags);
+
+       /* Check if the entry has been overwritten or is scheduled to be */
+       if (w_ctx->lba != lba || flags & PBLK_WRITABLE_ENTRY) {
+               ret = 0;
+               goto out;
+       }
+
+       /* Only advance the bio if it hasn't been advanced already. If advanced,
+        * this bio is at least a partial bio (i.e., it has partially been
+        * filled with data from the cache). If part of the data resides on the
+        * media, we will read later on
+        */
+       if (unlikely(!bio->bi_iter.bi_idx))
+               bio_advance(bio, bio_iter * PBLK_EXPOSED_PAGE_SIZE);
+
+       data = bio_data(bio);
+       memcpy(data, entry->data, rb->seg_size);
+
+out:
+       spin_unlock(&rb->w_lock);
+       return ret;
+}
+
+/* Return the write context of the entry at @pos, wrapped into the ring */
+struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos)
+{
+       struct pblk_rb_entry *entry;
+
+       entry = &rb->entries[pos & (rb->nr_entries - 1)];
+
+       return &entry->w_ctx;
+}
+
+/*
+ * Take the sync lock and return the current sync pointer. With a non-NULL
+ * @flags the interrupt state is saved in it; with NULL interrupts are simply
+ * disabled. Must be paired with pblk_rb_sync_end() using the same @flags.
+ */
+unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags)
+       __acquires(&rb->s_lock)
+{
+       if (!flags)
+               spin_lock_irq(&rb->s_lock);
+       else
+               spin_lock_irqsave(&rb->s_lock, *flags);
+
+       return rb->sync;
+}
+
+/*
+ * Release the sync lock taken by pblk_rb_sync_init(). With a non-NULL @flags
+ * the saved interrupt state is restored; with NULL interrupts are re-enabled.
+ */
+void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags)
+       __releases(&rb->s_lock)
+{
+       lockdep_assert_held(&rb->s_lock);
+
+       if (!flags)
+               spin_unlock_irq(&rb->s_lock);
+       else
+               spin_unlock_irqrestore(&rb->s_lock, *flags);
+}
+
+/*
+ * Advance the sync pointer by @nr_entries positions, wrapping around the ring,
+ * and return its new value. Must be called with the sync lock held.
+ *
+ * The pointer is moved with a single masked add, the same way the write
+ * pointer is moved in pblk_rb_may_write(). Like every other wrap in this
+ * file, the mask (nr_entries - 1) relies on the ring size being a power of
+ * two.
+ */
+unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries)
+{
+       unsigned int sync;
+
+       lockdep_assert_held(&rb->s_lock);
+
+       sync = READ_ONCE(rb->sync);
+       sync = (sync + nr_entries) & (rb->nr_entries - 1);
+
+       /* Protect from counts */
+       smp_store_release(&rb->sync, sync);
+
+       return sync;
+}
+
+/*
+ * Return the number of entries between the submission pointer and the current
+ * sync point, the sync point included, or 0 when no sync point is set.
+ */
+unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb)
+{
+       unsigned int sync_point, subm;
+
+       /* Protect syncs */
+       sync_point = smp_load_acquire(&rb->sync_point);
+       if (sync_point == EMPTY_ENTRY)
+               return 0;
+
+       subm = READ_ONCE(rb->subm);
+
+       /* The sync point itself counts as a sector to sync */
+       return pblk_rb_ring_count(sync_point, subm, rb->nr_entries) + 1;
+}
+
+/*
+ * Scan from the current position of the sync pointer to find the entry that
+ * corresponds to the given ppa. This is necessary since write requests can be
+ * completed out of order. The assumption is that the ppa is close to the sync
+ * pointer thus the search will not take long.
+ *
+ * The caller of this function must guarantee that the sync pointer will not
+ * reach the entry while it is using the metadata associated with it. With this
+ * assumption in mind, there is no need to take the sync lock.
+ *
+ * NOTE(review): as written, the loop only advances a local copy of the sync
+ * pointer; @ppa is never compared against any entry and the function always
+ * returns NULL.
+ */
+struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb,
+                                             struct ppa_addr *ppa)
+{
+       unsigned int sync, subm, count;
+       unsigned int i;
+
+       sync = READ_ONCE(rb->sync);
+       subm = READ_ONCE(rb->subm);
+       /* Number of entries lying between the sync and submission pointers */
+       count = pblk_rb_ring_count(subm, sync, rb->nr_entries);
+
+       /* Walks the range, but no entry is inspected */
+       for (i = 0; i < count; i++)
+               sync = (sync + 1) & (rb->nr_entries - 1);
+
+       return NULL;
+}
+
+/*
+ * Sanity check run with both the write and the sync lock held.
+ *
+ * Returns 0 when all ring pointers coincide and no sync point is set. When
+ * they do not, returns 1 if the entry array or the data buffer of any entry
+ * is missing, and 0 otherwise.
+ */
+int pblk_rb_tear_down_check(struct pblk_rb *rb)
+{
+       int failed = 0;
+       int idx;
+
+       spin_lock(&rb->w_lock);
+       spin_lock_irq(&rb->s_lock);
+
+       /* All pointers agree and there is no pending sync point */
+       if (rb->mem == rb->subm && rb->subm == rb->sync &&
+           rb->sync == rb->l2p_update && rb->sync_point == EMPTY_ENTRY)
+               goto unlock;
+
+       if (!rb->entries) {
+               failed = 1;
+               goto unlock;
+       }
+
+       for (idx = 0; idx < rb->nr_entries; idx++) {
+               if (!rb->entries[idx].data) {
+                       failed = 1;
+                       break;
+               }
+       }
+
+unlock:
+       spin_unlock(&rb->w_lock);
+       spin_unlock_irq(&rb->s_lock);
+
+       return failed;
+}
+
+/* Wrap @pos into the ring by masking it with (nr_entries - 1) */
+unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos)
+{
+       unsigned int mask = rb->nr_entries - 1;
+
+       return pos & mask;
+}
+
+/* Return non-zero when @pos lies outside [0, nr_entries) */
+int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos)
+{
+       return !(pos < rb->nr_entries);
+}
+
+/*
+ * Format a one-line snapshot of the ring buffer state into @buf (at most
+ * PAGE_SIZE bytes) and return the number of bytes written.
+ *
+ * The line holds, in order: nr_entries, mem, subm, sync, l2p_update, the
+ * number of in-flight sync points (always 0 without CONFIG_NVM_DEBUG), the
+ * sync point (or the literal "NULL" when none is set), then
+ * read count/space/sync point count, and finally the number of elements on
+ * pblk->compl_list.
+ */
+ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
+{
+       struct pblk *pblk = container_of(rb, struct pblk, rwb);
+       struct pblk_c_ctx *c;
+       ssize_t offset;
+       int queued_entries = 0;
+
+       /* Count the elements on the completion list under the sync lock */
+       spin_lock_irq(&rb->s_lock);
+       list_for_each_entry(c, &pblk->compl_list, list)
+               queued_entries++;
+       spin_unlock_irq(&rb->s_lock);
+
+       /* The ring pointers below are read without holding any lock */
+       if (rb->sync_point != EMPTY_ENTRY)
+               offset = scnprintf(buf, PAGE_SIZE,
+                       "%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n",
+                       rb->nr_entries,
+                       rb->mem,
+                       rb->subm,
+                       rb->sync,
+                       rb->l2p_update,
+#ifdef CONFIG_NVM_DEBUG
+                       atomic_read(&rb->inflight_sync_point),
+#else
+                       0,
+#endif
+                       rb->sync_point,
+                       pblk_rb_read_count(rb),
+                       pblk_rb_space(rb),
+                       pblk_rb_sync_point_count(rb),
+                       queued_entries);
+       else
+               /* Same line, with "NULL" in place of the sync point */
+               offset = scnprintf(buf, PAGE_SIZE,
+                       "%u\t%u\t%u\t%u\t%u\t%u\tNULL - %u/%u/%u - %d\n",
+                       rb->nr_entries,
+                       rb->mem,
+                       rb->subm,
+                       rb->sync,
+                       rb->l2p_update,
+#ifdef CONFIG_NVM_DEBUG
+                       atomic_read(&rb->inflight_sync_point),
+#else
+                       0,
+#endif
+                       pblk_rb_read_count(rb),
+                       pblk_rb_space(rb),
+                       pblk_rb_sync_point_count(rb),
+                       queued_entries);
+
+       return offset;
+}
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c

new file mode 100644 (file)

index 0000000..eff0982
--- /dev/null
+++ b/drivers/lightnvm/pblk-read.c
@@ -0,0 +1,529 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *                  Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * pblk-read.c - pblk's read path
+ */
+
+#include "pblk.h"
+
+/*
+ * There is no guarantee that the value read from cache has not been updated and
+ * resides at another location in the cache. We guarantee though that if the
+ * value is read from the cache, it belongs to the mapped lba. In order to
+ * guarantee and order between writes and reads are ordered, a flush must be
+ * issued.
+ */
+static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
+                               sector_t lba, struct ppa_addr ppa,
+                               int bio_iter)
+{
+#ifdef CONFIG_NVM_DEBUG
+       /* Callers must ensure that the ppa points to a cache address */
+       BUG_ON(pblk_ppa_empty(ppa));
+       BUG_ON(!pblk_addr_in_cache(ppa));
+#endif
+
+       return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba,
+                                       pblk_addr_to_cacheline(ppa), bio_iter);
+}
+
+static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
+                                unsigned long *read_bitmap)
+{
+       struct bio *bio = rqd->bio;
+       struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
+       sector_t blba = pblk_get_lba(bio);
+       int nr_secs = rqd->nr_ppas;
+       int advanced_bio = 0;
+       int i, j = 0;
+
+       /* logic error: lba out-of-bounds. Ignore read request */
+       if (!(blba + nr_secs < pblk->rl.nr_secs)) {
+               WARN_ON("pblk: read lbas out of bounds\n");
+               return;
+       }
+
+       pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs);
+
+       for (i = 0; i < nr_secs; i++) {
+               struct ppa_addr p = ppas[i];
+               sector_t lba = blba + i;
+
+retry:
+               if (pblk_ppa_empty(p)) {
+                       WARN_ON(test_and_set_bit(i, read_bitmap));
+                       continue;
+               }
+
+               /* Try to read from write buffer. The address is later checked
+                * on the write buffer to prevent retrieving overwritten data.
+                */
+               if (pblk_addr_in_cache(p)) {
+                       if (!pblk_read_from_cache(pblk, bio, lba, p, i)) {
+                               pblk_lookup_l2p_seq(pblk, &p, lba, 1);
+                               goto retry;
+                       }
+                       WARN_ON(test_and_set_bit(i, read_bitmap));
+                       advanced_bio = 1;
+               } else {
+                       /* Read from media non-cached sectors */
+                       rqd->ppa_list[j++] = p;
+               }
+
+               if (advanced_bio)
+                       bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
+       }
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_add(nr_secs, &pblk->inflight_reads);
+#endif
+}
+
+/*
+ * Set the read mode flags on @rqd and submit it. Any submission error is
+ * reported as NVM_IO_ERR; success is reported as NVM_IO_OK.
+ */
+static int pblk_submit_read_io(struct pblk *pblk, struct nvm_rq *rqd)
+{
+       rqd->flags = pblk_set_read_mode(pblk);
+
+       return pblk_submit_io(pblk, rqd) ? NVM_IO_ERR : NVM_IO_OK;
+}
+
+/*
+ * Completion handler for read requests.
+ *
+ * Logs the request error if there is one, frees the DMA ppa list when the
+ * request carries more than one ppa, drops a reference on rqd->bio, ends and
+ * releases the original bio when one is stored in the request context, and
+ * finally frees the request with pblk_free_rqd().
+ */
+static void pblk_end_io_read(struct nvm_rq *rqd)
+{
+       struct pblk *pblk = rqd->private;
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+       struct bio *bio = rqd->bio;
+
+       if (rqd->error)
+               pblk_log_read_err(pblk, rqd);
+#ifdef CONFIG_NVM_DEBUG
+       else
+               WARN_ONCE(bio->bi_error, "pblk: corrupted read error\n");
+#endif
+
+       /* A ppa list is only allocated for multi-sector requests */
+       if (rqd->nr_ppas > 1)
+               nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
+
+       bio_put(bio);
+       /* Set by pblk_submit_read() when rqd->bio is an internal clone */
+       if (r_ctx->orig_bio) {
+#ifdef CONFIG_NVM_DEBUG
+               WARN_ONCE(r_ctx->orig_bio->bi_error,
+                                               "pblk: corrupted read bio\n");
+#endif
+               bio_endio(r_ctx->orig_bio);
+               bio_put(r_ctx->orig_bio);
+       }
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_add(rqd->nr_ppas, &pblk->sync_reads);
+       atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads);
+#endif
+
+       pblk_free_rqd(pblk, rqd, READ);
+}
+
+static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
+                                     unsigned int bio_init_idx,
+                                     unsigned long *read_bitmap)
+{
+       struct bio *new_bio, *bio = rqd->bio;
+       struct bio_vec src_bv, dst_bv;
+       void *ppa_ptr = NULL;
+       void *src_p, *dst_p;
+       dma_addr_t dma_ppa_list = 0;
+       int nr_secs = rqd->nr_ppas;
+       int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
+       int i, ret, hole;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       new_bio = bio_alloc(GFP_KERNEL, nr_holes);
+       if (!new_bio) {
+               pr_err("pblk: could not alloc read bio\n");
+               return NVM_IO_ERR;
+       }
+
+       if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
+               goto err;
+
+       if (nr_holes != new_bio->bi_vcnt) {
+               pr_err("pblk: malformed bio\n");
+               goto err;
+       }
+
+       new_bio->bi_iter.bi_sector = 0; /* internal bio */
+       bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
+       new_bio->bi_private = &wait;
+       new_bio->bi_end_io = pblk_end_bio_sync;
+
+       rqd->bio = new_bio;
+       rqd->nr_ppas = nr_holes;
+       rqd->end_io = NULL;
+
+       if (unlikely(nr_secs > 1 && nr_holes == 1)) {
+               ppa_ptr = rqd->ppa_list;
+               dma_ppa_list = rqd->dma_ppa_list;
+               rqd->ppa_addr = rqd->ppa_list[0];
+       }
+
+       ret = pblk_submit_read_io(pblk, rqd);
+       if (ret) {
+               bio_put(rqd->bio);
+               pr_err("pblk: read IO submission failed\n");
+               goto err;
+       }
+
+       if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+               pr_err("pblk: partial read I/O timed out\n");
+       }
+
+       if (rqd->error) {
+               atomic_long_inc(&pblk->read_failed);
+#ifdef CONFIG_NVM_DEBUG
+               pblk_print_failed_rqd(pblk, rqd, rqd->error);
+#endif
+       }
+
+       if (unlikely(nr_secs > 1 && nr_holes == 1)) {
+               rqd->ppa_list = ppa_ptr;
+               rqd->dma_ppa_list = dma_ppa_list;
+       }
+
+       /* Fill the holes in the original bio */
+       i = 0;
+       hole = find_first_zero_bit(read_bitmap, nr_secs);
+       do {
+               src_bv = new_bio->bi_io_vec[i++];
+               dst_bv = bio->bi_io_vec[bio_init_idx + hole];
+
+               src_p = kmap_atomic(src_bv.bv_page);
+               dst_p = kmap_atomic(dst_bv.bv_page);
+
+               memcpy(dst_p + dst_bv.bv_offset,
+                       src_p + src_bv.bv_offset,
+                       PBLK_EXPOSED_PAGE_SIZE);
+
+               kunmap_atomic(src_p);
+               kunmap_atomic(dst_p);
+
+               mempool_free(src_bv.bv_page, pblk->page_pool);
+
+               hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1);
+       } while (hole < nr_secs);
+
+       bio_put(new_bio);
+
+       /* Complete the original bio and associated request */
+       rqd->bio = bio;
+       rqd->nr_ppas = nr_secs;
+       rqd->private = pblk;
+
+       bio_endio(bio);
+       pblk_end_io_read(rqd);
+       return NVM_IO_OK;
+
+err:
+       /* Free allocated pages in new bio */
+       pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt);
+       rqd->private = pblk;
+       pblk_end_io_read(rqd);
+       return NVM_IO_ERR;
+}
+
+static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd,
+                        unsigned long *read_bitmap)
+{
+       struct bio *bio = rqd->bio;
+       struct ppa_addr ppa;
+       sector_t lba = pblk_get_lba(bio);
+
+       /* logic error: lba out-of-bounds. Ignore read request */
+       if (!(lba < pblk->rl.nr_secs)) {
+               WARN_ON("pblk: read lba out of bounds\n");
+               return;
+       }
+
+       pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_inc(&pblk->inflight_reads);
+#endif
+
+retry:
+       if (pblk_ppa_empty(ppa)) {
+               WARN_ON(test_and_set_bit(0, read_bitmap));
+               return;
+       }
+
+       /* Try to read from write buffer. The address is later checked on the
+        * write buffer to prevent retrieving overwritten data.
+        */
+       if (pblk_addr_in_cache(ppa)) {
+               if (!pblk_read_from_cache(pblk, bio, lba, ppa, 0)) {
+                       pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
+                       goto retry;
+               }
+               WARN_ON(test_and_set_bit(0, read_bitmap));
+       } else {
+               rqd->ppa_addr = ppa;
+       }
+}
+
+/*
+ * Entry point of the read path for @bio.
+ *
+ * Sectors that are unmapped or still in the write buffer are resolved
+ * directly. If that covers the whole bio it is ended here. If nothing could
+ * be resolved, a clone of the bio is submitted to the device and the original
+ * bio is completed from pblk_end_io_read(). Otherwise the remaining sectors
+ * are read through pblk_fill_partial_read_bio().
+ *
+ * Returns NVM_IO_OK when the bio has been completed or submitted, and
+ * NVM_IO_ERR otherwise. On the error paths handled in this function the
+ * request, its ppa list and the extra bio reference are released before
+ * returning.
+ */
+int pblk_submit_read(struct pblk *pblk, struct bio *bio)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       int nr_secs = pblk_get_secs(bio);
+       struct nvm_rq *rqd;
+       /* One bit per sector; sized from the request limit so that it does
+        * not depend on the width of unsigned long
+        */
+       DECLARE_BITMAP(read_bitmap, PBLK_MAX_REQ_ADDRS);
+       unsigned int bio_init_idx;
+       int ret = NVM_IO_ERR;
+
+       if (nr_secs > PBLK_MAX_REQ_ADDRS)
+               return NVM_IO_ERR;
+
+       bitmap_zero(read_bitmap, nr_secs);
+
+       rqd = pblk_alloc_rqd(pblk, READ);
+       if (IS_ERR(rqd)) {
+               pr_err_ratelimited("pblk: not able to alloc rqd\n");
+               return NVM_IO_ERR;
+       }
+
+       rqd->opcode = NVM_OP_PREAD;
+       rqd->bio = bio;
+       rqd->nr_ppas = nr_secs;
+       rqd->private = pblk;
+       rqd->end_io = pblk_end_io_read;
+
+       /* Save the index for this bio's start. This is needed in case
+        * we need to fill a partial read.
+        */
+       bio_init_idx = pblk_get_bi_idx(bio);
+
+       if (nr_secs > 1) {
+               rqd->ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+                                               &rqd->dma_ppa_list);
+               if (!rqd->ppa_list) {
+                       pr_err("pblk: not able to allocate ppa list\n");
+                       goto fail_rqd_free;
+               }
+
+               pblk_read_ppalist_rq(pblk, rqd, read_bitmap);
+       } else {
+               pblk_read_rq(pblk, rqd, read_bitmap);
+       }
+
+       /* Extra reference, dropped by pblk_end_io_read() or on failure below */
+       bio_get(bio);
+       if (bitmap_full(read_bitmap, nr_secs)) {
+               bio_endio(bio);
+               pblk_end_io_read(rqd);
+               return NVM_IO_OK;
+       }
+
+       /* All sectors are to be read from the device */
+       if (bitmap_empty(read_bitmap, rqd->nr_ppas)) {
+               struct bio *int_bio = NULL;
+               struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+
+               /* Clone read bio to deal with read errors internally */
+               int_bio = bio_clone_bioset(bio, GFP_KERNEL, fs_bio_set);
+               if (!int_bio) {
+                       pr_err("pblk: could not clone read bio\n");
+                       ret = NVM_IO_ERR;
+                       goto fail_bio_put;
+               }
+
+               rqd->bio = int_bio;
+               r_ctx->orig_bio = bio;
+
+               ret = pblk_submit_read_io(pblk, rqd);
+               if (ret) {
+                       pr_err("pblk: read IO submission failed\n");
+                       bio_put(int_bio);
+                       goto fail_bio_put;
+               }
+
+               return NVM_IO_OK;
+       }
+
+       /* The read bio request could be partially filled by the write buffer,
+        * but there are some holes that need to be read from the drive.
+        * pblk_fill_partial_read_bio() releases the request on every path.
+        */
+       ret = pblk_fill_partial_read_bio(pblk, rqd, bio_init_idx, read_bitmap);
+       if (ret) {
+               pr_err("pblk: failed to perform partial read\n");
+               return ret;
+       }
+
+       return NVM_IO_OK;
+
+fail_bio_put:
+       /* Drop the reference taken by bio_get() above; the request was not
+        * handed to the device, so nothing else will release it
+        */
+       bio_put(bio);
+       if (nr_secs > 1)
+               nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
+fail_rqd_free:
+       pblk_free_rqd(pblk, rqd, READ);
+       return ret;
+}
+
+static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
+                             struct pblk_line *line, u64 *lba_list,
+                             unsigned int nr_secs)
+{
+       struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
+       int valid_secs = 0;
+       int i;
+
+       pblk_lookup_l2p_rand(pblk, ppas, lba_list, nr_secs);
+
+       for (i = 0; i < nr_secs; i++) {
+               if (pblk_addr_in_cache(ppas[i]) || ppas[i].g.blk != line->id ||
+                                               pblk_ppa_empty(ppas[i])) {
+                       lba_list[i] = ADDR_EMPTY;
+                       continue;
+               }
+
+               rqd->ppa_list[valid_secs++] = ppas[i];
+       }
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_add(valid_secs, &pblk->inflight_reads);
+#endif
+       return valid_secs;
+}
+
+static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
+                     struct pblk_line *line, sector_t lba)
+{
+       struct ppa_addr ppa;
+       int valid_secs = 0;
+
+       /* logic error: lba out-of-bounds */
+       if (!(lba < pblk->rl.nr_secs)) {
+               WARN_ON("pblk: read lba out of bounds\n");
+               goto out;
+       }
+
+       if (lba == ADDR_EMPTY)
+               goto out;
+
+       spin_lock(&pblk->trans_lock);
+       ppa = pblk_trans_map_get(pblk, lba);
+       spin_unlock(&pblk->trans_lock);
+
+       /* Ignore updated values until the moment */
+       if (pblk_addr_in_cache(ppa) || ppa.g.blk != line->id ||
+                                                       pblk_ppa_empty(ppa))
+               goto out;
+
+       rqd->ppa_addr = ppa;
+       valid_secs = 1;
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_inc(&pblk->inflight_reads);
+#endif
+
+out:
+       return valid_secs;
+}
+
+int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data,
+                       unsigned int nr_secs, unsigned int *secs_to_gc,
+                       struct pblk_line *line)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct request_queue *q = dev->q;
+       struct bio *bio;
+       struct nvm_rq rqd;
+       int ret, data_len;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       memset(&rqd, 0, sizeof(struct nvm_rq));
+
+       if (nr_secs > 1) {
+               rqd.ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+                                                       &rqd.dma_ppa_list);
+               if (!rqd.ppa_list)
+                       return NVM_IO_ERR;
+
+               *secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, line, lba_list,
+                                                               nr_secs);
+               if (*secs_to_gc == 1) {
+                       struct ppa_addr ppa;
+
+                       ppa = rqd.ppa_list[0];
+                       nvm_dev_dma_free(dev->parent, rqd.ppa_list,
+                                                       rqd.dma_ppa_list);
+                       rqd.ppa_addr = ppa;
+               }
+       } else {
+               *secs_to_gc = read_rq_gc(pblk, &rqd, line, lba_list[0]);
+       }
+
+       if (!(*secs_to_gc))
+               goto out;
+
+       data_len = (*secs_to_gc) * geo->sec_size;
+       bio = bio_map_kern(q, data, data_len, GFP_KERNEL);
+       if (IS_ERR(bio)) {
+               pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio));
+               goto err_free_dma;
+       }
+
+       bio->bi_iter.bi_sector = 0; /* internal bio */
+       bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+       rqd.opcode = NVM_OP_PREAD;
+       rqd.end_io = pblk_end_io_sync;
+       rqd.private = &wait;
+       rqd.nr_ppas = *secs_to_gc;
+       rqd.bio = bio;
+
+       ret = pblk_submit_read_io(pblk, &rqd);
+       if (ret) {
+               bio_endio(bio);
+               pr_err("pblk: GC read request failed\n");
+               goto err_free_dma;
+       }
+
+       if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+               pr_err("pblk: GC read I/O timed out\n");
+       }
+
+       if (rqd.error) {
+               atomic_long_inc(&pblk->read_failed_gc);
+#ifdef CONFIG_NVM_DEBUG
+               pblk_print_failed_rqd(pblk, &rqd, rqd.error);
+#endif
+       }
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_add(*secs_to_gc, &pblk->sync_reads);
+       atomic_long_add(*secs_to_gc, &pblk->recov_gc_reads);
+       atomic_long_sub(*secs_to_gc, &pblk->inflight_reads);
+#endif
+
+out:
+       if (rqd.nr_ppas > 1)
+               nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list);
+       return NVM_IO_OK;
+
+err_free_dma:
+       if (rqd.nr_ppas > 1)
+               nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list);
+       return NVM_IO_ERR;
+}
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c

new file mode 100644 (file)

index 0000000..0d50f41
--- /dev/null
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -0,0 +1,998 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial: Javier Gonzalez <javier@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * pblk-recovery.c - pblk's recovery path
+ */
+
+#include "pblk.h"
+
+void pblk_submit_rec(struct work_struct *work)
+{
+       struct pblk_rec_ctx *recovery =
+                       container_of(work, struct pblk_rec_ctx, ws_rec);
+       struct pblk *pblk = recovery->pblk;
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_rq *rqd = recovery->rqd;
+       struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+       int max_secs = nvm_max_phys_sects(dev);
+       struct bio *bio;
+       unsigned int nr_rec_secs;
+       unsigned int pgs_read;
+       int ret;
+
+       nr_rec_secs = bitmap_weight((unsigned long int *)&rqd->ppa_status,
+                                                               max_secs);
+
+       bio = bio_alloc(GFP_KERNEL, nr_rec_secs);
+       if (!bio) {
+               pr_err("pblk: not able to create recovery bio\n");
+               return;
+       }
+
+       bio->bi_iter.bi_sector = 0;
+       bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+       rqd->bio = bio;
+       rqd->nr_ppas = nr_rec_secs;
+
+       pgs_read = pblk_rb_read_to_bio_list(&pblk->rwb, bio, &recovery->failed,
+                                                               nr_rec_secs);
+       if (pgs_read != nr_rec_secs) {
+               pr_err("pblk: could not read recovery entries\n");
+               goto err;
+       }
+
+       if (pblk_setup_w_rec_rq(pblk, rqd, c_ctx)) {
+               pr_err("pblk: could not setup recovery request\n");
+               goto err;
+       }
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_add(nr_rec_secs, &pblk->recov_writes);
+#endif
+
+       ret = pblk_submit_io(pblk, rqd);
+       if (ret) {
+               pr_err("pblk: I/O submission failed: %d\n", ret);
+               goto err;
+       }
+
+       mempool_free(recovery, pblk->rec_pool);
+       return;
+
+err:
+       bio_put(bio);
+       pblk_free_rqd(pblk, rqd, WRITE);
+}
+
+/*
+ * Prepare a recovery write request for the entries of @c_ctx that did not
+ * complete.
+ *
+ * @comp_bits is the completion bitmap of the failed request and @comp the
+ * number of leading entries that completed. The bitmap, shifted right by
+ * @comp, is stored in the new request's ppa_status. The new request's context
+ * describes the entries still to be written, while @c_ctx is trimmed to the
+ * completed ones. The new request and @pblk are stored in @recovery.
+ *
+ * Returns 0 on success, -ENOMEM if the request cannot be allocated.
+ */
+int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
+                       struct pblk_rec_ctx *recovery, u64 *comp_bits,
+                       unsigned int comp)
+{
+       int max_secs = nvm_max_phys_sects(pblk->dev);
+       int total = c_ctx->nr_valid + c_ctx->nr_padded;
+       struct pblk_c_ctx *pending;
+       struct nvm_rq *rqd;
+
+       rqd = pblk_alloc_rqd(pblk, WRITE);
+       if (IS_ERR(rqd)) {
+               pr_err("pblk: could not create recovery req.\n");
+               return -ENOMEM;
+       }
+
+       pending = nvm_rq_to_pdu(rqd);
+
+       /* Drop the first @comp (completed) bits from the copied bitmap */
+       bitmap_shift_right((unsigned long int *)&rqd->ppa_status,
+                               (unsigned long int *)comp_bits,
+                               comp, max_secs);
+
+       /* The pending context starts right after the completed entries */
+       pending->sentry = pblk_rb_wrap_pos(&pblk->rwb, c_ctx->sentry + comp);
+
+       if (comp < c_ctx->nr_valid) {
+               /* Some valid entries, and all padding, are still pending */
+               pending->nr_valid = c_ctx->nr_valid - comp;
+               pending->nr_padded = c_ctx->nr_padded;
+
+               c_ctx->nr_valid = comp;
+               c_ctx->nr_padded = 0;
+       } else {
+               /* All valid entries completed; only padding is pending */
+               pending->nr_valid = 0;
+               pending->nr_padded = total - comp;
+
+               c_ctx->nr_padded = comp - c_ctx->nr_valid;
+       }
+
+       recovery->pblk = pblk;
+       recovery->rqd = rqd;
+
+       return 0;
+}
+
+/*
+ * Validate @emeta (CRC and magic identifier) and return its lba list.
+ *
+ * Returns NULL if either the stored CRC does not match the computed one or
+ * the header identifier is not PBLK_MAGIC.
+ */
+__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta)
+{
+       u32 expected = pblk_calc_emeta_crc(pblk, emeta);
+
+       if (le32_to_cpu(emeta->crc) != expected ||
+           le32_to_cpu(emeta->header.identifier) != PBLK_MAGIC)
+               return NULL;
+
+       return pblk_line_emeta_to_lbas(emeta);
+}
+
+/*
+ * Rebuild the L2P entries of @line from the lba list stored in its emeta.
+ *
+ * Sectors on bad blocks are skipped. Sectors whose lba is ADDR_EMPTY are
+ * marked in the line's invalid bitmap and subtracted from its valid sector
+ * count. Every other sector is mapped through pblk_update_map().
+ *
+ * Returns 0 when the emeta was usable (even if the recovered count does not
+ * match emeta's nr_valid_lbas, which is only logged), 1 when the emeta fails
+ * validation and the caller must fall back to another recovery method.
+ */
+static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct line_emeta *emeta = line->emeta;
+       __le64 *lba_list;
+       int data_start;
+       int nr_data_lbas, nr_valid_lbas, nr_lbas = 0;
+       int i;
+
+       lba_list = pblk_recov_get_lba_list(pblk, emeta);
+       if (!lba_list)
+               return 1;
+
+       data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec;
+       nr_data_lbas = lm->sec_per_line - lm->emeta_sec;
+       nr_valid_lbas = le64_to_cpu(emeta->nr_valid_lbas);
+
+       for (i = data_start; i < nr_data_lbas && nr_lbas < nr_valid_lbas; i++) {
+               struct ppa_addr ppa;
+               int pos;
+
+               ppa = addr_to_pblk_ppa(pblk, i, line->id);
+               pos = pblk_ppa_to_pos(geo, ppa);
+
+               /* Do not update bad blocks */
+               if (test_bit(pos, line->blk_bitmap))
+                       continue;
+
+               if (le64_to_cpu(lba_list[i]) == ADDR_EMPTY) {
+                       spin_lock(&line->lock);
+                       /* WARN_ONCE() so that the message is actually printed;
+                        * a string passed to WARN_ON_ONCE() is only evaluated
+                        * as an (always true) condition.
+                        */
+                       if (test_and_set_bit(i, line->invalid_bitmap))
+                               WARN_ONCE(1, "pblk: rec. double invalidate:\n");
+                       else
+                               line->vsc--;
+                       spin_unlock(&line->lock);
+
+                       continue;
+               }
+
+               pblk_update_map(pblk, le64_to_cpu(lba_list[i]), ppa);
+               nr_lbas++;
+       }
+
+       /* Report the on-media count in CPU byte order */
+       if (nr_valid_lbas != nr_lbas)
+               pr_err("pblk: line %d - inconsistent lba list(%llu/%d)\n",
+                               line->id, le64_to_cpu(emeta->nr_valid_lbas),
+                               nr_lbas);
+
+       line->left_msecs = 0;
+
+       return 0;
+}
+
+/*
+ * Number of sectors in @line left after subtracting the smeta and emeta
+ * sectors and all sectors that belong to blocks flagged in the line's bad
+ * block bitmap.
+ */
+static int pblk_calc_sec_in_line(struct pblk *pblk, struct pblk_line *line)
+{
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct nvm_geo *geo = &pblk->dev->geo;
+       int bad_blks = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
+       int bad_secs = bad_blks * geo->sec_per_blk;
+
+       return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec - bad_secs;
+}
+
+/*
+ * Resources allocated once by the caller and handed (by value) to the OOB
+ * recovery helpers, which reuse them for every request they issue.
+ */
+struct pblk_recov_alloc {
+       /* Request descriptor; re-initialized by the helpers before each I/O */
+       struct nvm_rq *rqd;
+       /* Kernel buffer mapped into the bio of each request */
+       void *data;
+       /* Per-sector metadata buffer and its DMA address */
+       struct pblk_sec_meta *meta_list;
+       dma_addr_t dma_meta_list;
+       /* Physical address list and its DMA address */
+       struct ppa_addr *ppa_list;
+       dma_addr_t dma_ppa_list;
+};
+
+/*
+ * Read the sectors of @line between @r_ptr and the line's current write
+ * pointer (line->cur_sec) and update the L2P table from the lbas found in the
+ * per-sector metadata.
+ *
+ * Requests are built in units of pblk->min_write_pgs sectors, skipping
+ * addresses that fall on blocks set in the line's bad block bitmap. Sectors
+ * whose lba is ADDR_EMPTY or larger than pblk->rl.nr_secs are ignored.
+ *
+ * Returns 0 on success (or when there is nothing to read), the error from
+ * bio_map_kern()/pblk_submit_io(), or -EINTR on timeout or device error.
+ */
+static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line,
+                              struct pblk_recov_alloc p, u64 r_ptr)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct ppa_addr *ppa_list;
+       struct pblk_sec_meta *meta_list;
+       struct nvm_rq *rqd;
+       struct bio *bio;
+       void *data;
+       dma_addr_t dma_ppa_list, dma_meta_list;
+       u64 r_ptr_int;
+       int left_ppas;
+       int rq_ppas, rq_len;
+       int i, j;
+       int ret = 0;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       ppa_list = p.ppa_list;
+       meta_list = p.meta_list;
+       rqd = p.rqd;
+       data = p.data;
+       dma_ppa_list = p.dma_ppa_list;
+       dma_meta_list = p.dma_meta_list;
+
+       left_ppas = line->cur_sec - r_ptr;
+       if (!left_ppas)
+               return 0;
+
+       r_ptr_int = r_ptr;
+
+next_read_rq:
+       memset(rqd, 0, pblk_r_rq_size);
+
+       rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+       if (!rq_ppas)
+               rq_ppas = pblk->min_write_pgs;
+       rq_len = rq_ppas * geo->sec_size;
+
+       bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
+       if (IS_ERR(bio))
+               return PTR_ERR(bio);
+
+       bio->bi_iter.bi_sector = 0; /* internal bio */
+       bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+       rqd->bio = bio;
+       rqd->opcode = NVM_OP_PREAD;
+       rqd->flags = pblk_set_read_mode(pblk);
+       rqd->meta_list = meta_list;
+       rqd->nr_ppas = rq_ppas;
+       rqd->ppa_list = ppa_list;
+       rqd->dma_ppa_list = dma_ppa_list;
+       rqd->dma_meta_list = dma_meta_list;
+       rqd->end_io = pblk_end_io_sync;
+       rqd->private = &wait;
+
+       for (i = 0; i < rqd->nr_ppas; ) {
+               struct ppa_addr ppa;
+               int pos;
+
+               ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
+               pos = pblk_dev_ppa_to_pos(geo, ppa);
+
+               /* Skip addresses that belong to bad blocks */
+               while (test_bit(pos, line->blk_bitmap)) {
+                       r_ptr_int += pblk->min_write_pgs;
+                       ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
+                       pos = pblk_dev_ppa_to_pos(geo, ppa);
+               }
+
+               for (j = 0; j < pblk->min_write_pgs; j++, i++, r_ptr_int++)
+                       rqd->ppa_list[i] =
+                               addr_to_gen_ppa(pblk, r_ptr_int, line->id);
+       }
+
+       /* If read fails, more padding is needed */
+       ret = pblk_submit_io(pblk, rqd);
+       if (ret) {
+               pr_err("pblk: I/O submission failed: %d\n", ret);
+               /* The bio was never submitted; drop it as scan_oob does */
+               bio_put(bio);
+               return ret;
+       }
+
+       if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+               pr_err("pblk: L2P recovery read timed out\n");
+               return -EINTR;
+       }
+
+       reinit_completion(&wait);
+
+       /* At this point, the read should not fail. If it does, it is a problem
+        * we cannot recover from here. Need FTL log.
+        */
+       if (rqd->error) {
+               pr_err("pblk: L2P recovery failed (%d)\n", rqd->error);
+               return -EINTR;
+       }
+
+       for (i = 0; i < rqd->nr_ppas; i++) {
+               u64 lba = le64_to_cpu(meta_list[i].lba);
+
+               if (lba == ADDR_EMPTY || lba > pblk->rl.nr_secs)
+                       continue;
+
+               pblk_update_map(pblk, lba, rqd->ppa_list[i]);
+       }
+
+       left_ppas -= rq_ppas;
+       if (left_ppas > 0)
+               goto next_read_rq;
+
+       return 0;
+}
+
+/*
+ * Pad @line by writing up to @left_ppas sectors whose lba is ADDR_EMPTY.
+ *
+ * Sectors are allocated from the line in units of pblk->min_write_pgs,
+ * skipping addresses on blocks set in the bad block bitmap. Each padded
+ * sector is invalidated and recorded as ADDR_EMPTY both in the request
+ * metadata and in the line's emeta lba list. Padding stops when @left_ppas
+ * is consumed or the line's remaining sector count reaches zero.
+ *
+ * Returns 0 on completion (a write timeout is only logged), or the error from
+ * bio_map_kern()/pblk_submit_io().
+ */
+static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line,
+                             struct pblk_recov_alloc p, int left_ppas)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct ppa_addr *ppa_list;
+       struct pblk_sec_meta *meta_list;
+       struct nvm_rq *rqd;
+       struct bio *bio;
+       void *data;
+       dma_addr_t dma_ppa_list, dma_meta_list;
+       __le64 *lba_list = pblk_line_emeta_to_lbas(line->emeta);
+       u64 w_ptr = line->cur_sec;
+       int left_line_ppas = line->left_msecs;
+       int rq_ppas, rq_len;
+       int i, j;
+       int ret = 0;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       ppa_list = p.ppa_list;
+       meta_list = p.meta_list;
+       rqd = p.rqd;
+       data = p.data;
+       dma_ppa_list = p.dma_ppa_list;
+       dma_meta_list = p.dma_meta_list;
+
+next_pad_rq:
+       rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+       if (!rq_ppas)
+               rq_ppas = pblk->min_write_pgs;
+       rq_len = rq_ppas * geo->sec_size;
+
+       bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
+       if (IS_ERR(bio))
+               return PTR_ERR(bio);
+
+       bio->bi_iter.bi_sector = 0; /* internal bio */
+       bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+       memset(rqd, 0, pblk_r_rq_size);
+
+       rqd->bio = bio;
+       rqd->opcode = NVM_OP_PWRITE;
+       rqd->flags = pblk_set_progr_mode(pblk, WRITE);
+       rqd->meta_list = meta_list;
+       rqd->nr_ppas = rq_ppas;
+       rqd->ppa_list = ppa_list;
+       rqd->dma_ppa_list = dma_ppa_list;
+       rqd->dma_meta_list = dma_meta_list;
+       rqd->end_io = pblk_end_io_sync;
+       rqd->private = &wait;
+
+       for (i = 0; i < rqd->nr_ppas; ) {
+               struct ppa_addr ppa;
+               int pos;
+
+               w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
+               ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id);
+               pos = pblk_ppa_to_pos(geo, ppa);
+
+               /* Skip addresses that belong to bad blocks */
+               while (test_bit(pos, line->blk_bitmap)) {
+                       w_ptr += pblk->min_write_pgs;
+                       ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id);
+                       pos = pblk_ppa_to_pos(geo, ppa);
+               }
+
+               for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) {
+                       struct ppa_addr dev_ppa;
+
+                       dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
+
+                       /* Padded sectors carry no user data */
+                       pblk_map_invalidate(pblk, dev_ppa);
+                       meta_list[i].lba = cpu_to_le64(ADDR_EMPTY);
+                       lba_list[w_ptr] = cpu_to_le64(ADDR_EMPTY);
+                       rqd->ppa_list[i] = dev_ppa;
+               }
+       }
+
+       ret = pblk_submit_io(pblk, rqd);
+       if (ret) {
+               pr_err("pblk: I/O submission failed: %d\n", ret);
+               /* The bio was never submitted; drop it as scan_oob does */
+               bio_put(bio);
+               return ret;
+       }
+
+       if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+               pr_err("pblk: L2P recovery write timed out\n");
+       }
+       reinit_completion(&wait);
+
+       left_line_ppas -= rq_ppas;
+       left_ppas -= rq_ppas;
+       if (left_ppas > 0 && left_line_ppas)
+               goto next_pad_rq;
+
+       return 0;
+}
+
+/* When this function is called, it means that not all upper pages have been
+ * written in a page that contains valid data. In order to recover this data, we
+ * first find the write pointer on the device, then we pad all necessary
+ * sectors, and finally attempt to read the valid data
+ *
+ * The line is read forward from its current write pointer. When a read
+ * reports NVM_RSP_ERR_EMPTYPAGE, the sectors from the first failed one onwards
+ * are rolled back in the line accounting, the line is padded and the data
+ * starting at the initial write pointer is read back to update the L2P table.
+ * Failures of the pad and read-back steps are logged but not returned.
+ *
+ * Returns 0, or the error from bio_map_kern()/pblk_submit_io().
+ */
+static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line,
+                                  struct pblk_recov_alloc p)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct ppa_addr *ppa_list;
+       struct pblk_sec_meta *meta_list;
+       struct nvm_rq *rqd;
+       struct bio *bio;
+       void *data;
+       dma_addr_t dma_ppa_list, dma_meta_list;
+       u64 w_ptr = 0, r_ptr;
+       int rq_ppas, rq_len;
+       int i, j;
+       int ret = 0;
+       int rec_round;
+       int left_ppas = pblk_calc_sec_in_line(pblk, line) - line->cur_sec;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       ppa_list = p.ppa_list;
+       meta_list = p.meta_list;
+       rqd = p.rqd;
+       data = p.data;
+       dma_ppa_list = p.dma_ppa_list;
+       dma_meta_list = p.dma_meta_list;
+
+       /* we could recover up until the line write pointer */
+       r_ptr = line->cur_sec;
+       rec_round = 0;
+
+next_rq:
+       memset(rqd, 0, pblk_r_rq_size);
+
+       rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+       if (!rq_ppas)
+               rq_ppas = pblk->min_write_pgs;
+       rq_len = rq_ppas * geo->sec_size;
+
+       bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
+       if (IS_ERR(bio))
+               return PTR_ERR(bio);
+
+       bio->bi_iter.bi_sector = 0; /* internal bio */
+       bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+       rqd->bio = bio;
+       rqd->opcode = NVM_OP_PREAD;
+       rqd->flags = pblk_set_read_mode(pblk);
+       rqd->meta_list = meta_list;
+       rqd->nr_ppas = rq_ppas;
+       rqd->ppa_list = ppa_list;
+       rqd->dma_ppa_list = dma_ppa_list;
+       rqd->dma_meta_list = dma_meta_list;
+       rqd->end_io = pblk_end_io_sync;
+       rqd->private = &wait;
+
+       for (i = 0; i < rqd->nr_ppas; ) {
+               struct ppa_addr ppa;
+               int pos;
+
+               w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
+               ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
+               pos = pblk_dev_ppa_to_pos(geo, ppa);
+
+               /* Skip addresses that belong to bad blocks */
+               while (test_bit(pos, line->blk_bitmap)) {
+                       w_ptr += pblk->min_write_pgs;
+                       ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
+                       pos = pblk_dev_ppa_to_pos(geo, ppa);
+               }
+
+               for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++)
+                       rqd->ppa_list[i] =
+                               addr_to_gen_ppa(pblk, w_ptr, line->id);
+       }
+
+       ret = pblk_submit_io(pblk, rqd);
+       if (ret) {
+               pr_err("pblk: I/O submission failed: %d\n", ret);
+               /* The bio was never submitted; drop it as scan_oob does */
+               bio_put(bio);
+               return ret;
+       }
+
+       if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+               pr_err("pblk: L2P recovery read timed out\n");
+       }
+       reinit_completion(&wait);
+
+       /* This should not happen since the read failed during normal recovery,
+        * but the media works funny sometimes...
+        */
+       if (!rec_round++ && !rqd->error) {
+               rec_round = 0;
+               for (i = 0; i < rqd->nr_ppas; i++, r_ptr++) {
+                       u64 lba = le64_to_cpu(meta_list[i].lba);
+
+                       if (lba == ADDR_EMPTY || lba > pblk->rl.nr_secs)
+                               continue;
+
+                       pblk_update_map(pblk, lba, rqd->ppa_list[i]);
+               }
+       }
+
+       /* Reached the end of the written line */
+       if (rqd->error == NVM_RSP_ERR_EMPTYPAGE) {
+               int pad_secs, nr_error_bits, bit;
+               /* Separate from the function-level ret on purpose: pad and
+                * read-back failures are best effort and only logged.
+                */
+               int err;
+
+               bit = find_first_bit((void *)&rqd->ppa_status, rqd->nr_ppas);
+               nr_error_bits = rqd->nr_ppas - bit;
+
+               /* Roll back failed sectors */
+               line->cur_sec -= nr_error_bits;
+               line->left_msecs += nr_error_bits;
+               bitmap_clear(line->map_bitmap, line->cur_sec, nr_error_bits);
+
+               pad_secs = pblk_pad_distance(pblk);
+               if (pad_secs > line->left_msecs)
+                       pad_secs = line->left_msecs;
+
+               err = pblk_recov_pad_oob(pblk, line, p, pad_secs);
+               if (err)
+                       pr_err("pblk: OOB padding failed (err:%d)\n", err);
+
+               err = pblk_recov_read_oob(pblk, line, p, r_ptr);
+               if (err)
+                       pr_err("pblk: OOB read failed (err:%d)\n", err);
+
+               line->left_ssecs = line->left_msecs;
+               left_ppas = 0;
+       }
+
+       left_ppas -= rq_ppas;
+       if (left_ppas > 0)
+               goto next_rq;
+
+       return ret;
+}
+
+/*
+ * Scan @line from its start, reading the per-sector metadata and updating the
+ * L2P table with every lba that is neither ADDR_EMPTY nor larger than
+ * pblk->rl.nr_secs.
+ *
+ * On the first request that completes with an error, the sectors from the
+ * first failed one onwards are rolled back in the line accounting, the
+ * sectors before it are still mapped, and the scan stops.
+ *
+ * @done is set to 1 on entry and cleared only if the scan stopped on an error
+ * other than NVM_RSP_ERR_EMPTYPAGE.
+ *
+ * Returns 0, or the error from bio_map_kern()/pblk_submit_io().
+ */
+static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
+                              struct pblk_recov_alloc p, int *done)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_sec_meta *meta_list = p.meta_list;
+       struct nvm_rq *rqd = p.rqd;
+       struct bio *bio;
+       u64 paddr;
+       int secs_left = pblk_calc_sec_in_line(pblk, line);
+       int nr_secs, len;
+       int idx, k;
+       int ret;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       *done = 1;
+
+       do {
+               memset(rqd, 0, pblk_r_rq_size);
+
+               nr_secs = pblk_calc_secs(pblk, secs_left, 0);
+               if (!nr_secs)
+                       nr_secs = pblk->min_write_pgs;
+               len = nr_secs * geo->sec_size;
+
+               bio = bio_map_kern(dev->q, p.data, len, GFP_KERNEL);
+               if (IS_ERR(bio))
+                       return PTR_ERR(bio);
+
+               bio->bi_iter.bi_sector = 0; /* internal bio */
+               bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+               rqd->bio = bio;
+               rqd->opcode = NVM_OP_PREAD;
+               rqd->flags = pblk_set_read_mode(pblk);
+               rqd->meta_list = meta_list;
+               rqd->nr_ppas = nr_secs;
+               rqd->ppa_list = p.ppa_list;
+               rqd->dma_ppa_list = p.dma_ppa_list;
+               rqd->dma_meta_list = p.dma_meta_list;
+               rqd->end_io = pblk_end_io_sync;
+               rqd->private = &wait;
+
+               idx = 0;
+               while (idx < rqd->nr_ppas) {
+                       struct ppa_addr ppa;
+                       int pos;
+
+                       paddr = pblk_alloc_page(pblk, line,
+                                                       pblk->min_write_pgs);
+
+                       /* Advance past addresses that sit on bad blocks */
+                       for (;;) {
+                               ppa = addr_to_gen_ppa(pblk, paddr, line->id);
+                               pos = pblk_dev_ppa_to_pos(geo, ppa);
+                               if (!test_bit(pos, line->blk_bitmap))
+                                       break;
+                               paddr += pblk->min_write_pgs;
+                       }
+
+                       for (k = 0; k < pblk->min_write_pgs; k++) {
+                               rqd->ppa_list[idx] =
+                                       addr_to_gen_ppa(pblk, paddr, line->id);
+                               idx++;
+                               paddr++;
+                       }
+               }
+
+               ret = pblk_submit_io(pblk, rqd);
+               if (ret) {
+                       pr_err("pblk: I/O submission failed: %d\n", ret);
+                       bio_put(bio);
+                       return ret;
+               }
+
+               if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+                       pr_err("pblk: L2P recovery read timed out\n");
+               }
+               reinit_completion(&wait);
+
+               /* Reached the end of the written line */
+               if (rqd->error) {
+                       int first_bad, nr_bad;
+
+                       first_bad = find_first_bit((void *)&rqd->ppa_status,
+                                                       rqd->nr_ppas);
+                       nr_bad = rqd->nr_ppas - first_bad;
+
+                       /* Roll back failed sectors */
+                       line->cur_sec -= nr_bad;
+                       line->left_msecs += nr_bad;
+                       line->left_ssecs = line->left_msecs;
+                       bitmap_clear(line->map_bitmap, line->cur_sec, nr_bad);
+
+                       /* Stop scanning; map only sectors before the error */
+                       secs_left = 0;
+                       rqd->nr_ppas = first_bad;
+
+                       if (rqd->error != NVM_RSP_ERR_EMPTYPAGE)
+                               *done = 0;
+               }
+
+               for (idx = 0; idx < rqd->nr_ppas; idx++) {
+                       u64 lba = le64_to_cpu(meta_list[idx].lba);
+
+                       if (lba != ADDR_EMPTY && !(lba > pblk->rl.nr_secs))
+                               pblk_update_map(pblk, lba, rqd->ppa_list[idx]);
+               }
+
+               secs_left -= nr_secs;
+       } while (secs_left > 0);
+
+       return 0;
+}
+
+/*
+ * Recover the L2P entries of @line from the lbas stored in the out-of-band
+ * area of its sectors.
+ *
+ * Allocates the request, metadata/ppa buffer and data buffer shared by the
+ * scan helpers, runs the sequential scan and, if that scan reports it is not
+ * done, the full scan. A line that ends up full is closed for recovery.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_recov_alloc p;
+       int done, ret = 0;
+
+       p.rqd = pblk_alloc_rqd(pblk, READ);
+       if (IS_ERR(p.rqd))
+               return PTR_ERR(p.rqd);
+
+       p.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+                                                       &p.dma_meta_list);
+       if (!p.meta_list) {
+               ret = -ENOMEM;
+               goto free_rqd;
+       }
+
+       /* The ppa list lives in the same DMA buffer, after the metadata */
+       p.ppa_list = (void *)(p.meta_list) + pblk_dma_meta_size;
+       p.dma_ppa_list = p.dma_meta_list + pblk_dma_meta_size;
+
+       p.data = kcalloc(pblk->max_write_pgs, geo->sec_size, GFP_KERNEL);
+       if (!p.data) {
+               ret = -ENOMEM;
+               goto free_meta_list;
+       }
+
+       ret = pblk_recov_scan_oob(pblk, line, p, &done);
+       if (!ret && !done)
+               ret = pblk_recov_scan_all_oob(pblk, line, p);
+       if (ret) {
+               pr_err("pblk: could not recover L2P from OOB\n");
+               goto out;
+       }
+
+       if (pblk_line_is_full(line))
+               pblk_line_recov_close(pblk, line);
+
+out:
+       kfree(p.data);
+free_meta_list:
+       nvm_dev_dma_free(dev->parent, p.meta_list, p.dma_meta_list);
+free_rqd:
+       pblk_free_rqd(pblk, p.rqd, READ);
+
+       return ret;
+}
+
+/* Insert lines ordered by sequence number (seq_num) on list
+ *
+ * @line is placed in front of the first entry whose seq_nr is greater than
+ * its own, or at the tail of @head when there is no such entry (including
+ * when the list is empty).
+ */
+static void pblk_recov_line_add_ordered(struct list_head *head,
+                                       struct pblk_line *line)
+{
+       /* Insertion point; defaults to the list head, i.e. append at tail */
+       struct list_head *pos = head;
+       struct pblk_line *t;
+
+       /* Track the insertion point as a list_head instead of using the
+        * iterator after the loop: when no entry matches, the iterator is not
+        * a valid pblk_line.
+        */
+       list_for_each_entry(t, head, list) {
+               if (t->seq_nr > line->seq_nr) {
+                       pos = &t->list;
+                       break;
+               }
+       }
+
+       list_add_tail(&line->list, pos);
+}
+
+/*
+ * Scan-based recovery of the L2P table.
+ *
+ * Phase 1 reads the smeta of every line; lines with a valid CRC, magic,
+ * version and matching instance uuid are collected on a list ordered by
+ * sequence number. Phase 2 walks that list and recovers each line's mappings
+ * from its emeta, falling back to the out-of-band metadata when the emeta
+ * cannot be read or fails validation. Full lines are closed and moved to a GC
+ * list; a line that is not full is kept as the open data line.
+ *
+ * Returns the open data line found during recovery, NULL if none was found
+ * (or recovery stopped early), or ERR_PTR(-EINVAL) when a line with an
+ * incompatible on-media version is encountered.
+ */
+struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_line *line, *tline, *data_line = NULL;
+       struct line_smeta *smeta;
+       struct line_emeta *emeta;
+       int found_lines = 0, recovered_lines = 0, open_lines = 0;
+       int is_next = 0;
+       int meta_line;
+       int i, valid_uuid = 0;
+       LIST_HEAD(recov_list);
+
+       /* TODO: Implement FTL snapshot */
+
+       /* Scan recovery - takes place when FTL snapshot fails */
+       spin_lock(&l_mg->free_lock);
+       meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+       set_bit(meta_line, &l_mg->meta_bitmap);
+       smeta = l_mg->sline_meta[meta_line].meta;
+       emeta = l_mg->eline_meta[meta_line].meta;
+       spin_unlock(&l_mg->free_lock);
+
+       /* Order data lines using their sequence number */
+       for (i = 0; i < l_mg->nr_lines; i++) {
+               u32 crc;
+
+               line = &pblk->lines[i];
+
+               /* The same smeta buffer is reused for every line scanned */
+               memset(smeta, 0, lm->smeta_len);
+               line->smeta = smeta;
+               line->lun_bitmap = ((void *)(smeta)) +
+                                               sizeof(struct line_smeta);
+
+               /* Lines that cannot be read are assumed as not written here */
+               if (pblk_line_read_smeta(pblk, line))
+                       continue;
+
+               crc = pblk_calc_smeta_crc(pblk, smeta);
+               if (le32_to_cpu(smeta->crc) != crc)
+                       continue;
+
+               if (le32_to_cpu(smeta->header.identifier) != PBLK_MAGIC)
+                       continue;
+
+               /* NOTE(review): this early return leaves meta_line set in
+                * l_mg->meta_bitmap - confirm the caller's teardown copes.
+                */
+               if (le16_to_cpu(smeta->header.version) != 1) {
+                       pr_err("pblk: found incompatible line version %u\n",
+                                       le16_to_cpu(smeta->header.version));
+                       return ERR_PTR(-EINVAL);
+               }
+
+               /* The first valid instance uuid is used for initialization */
+               if (!valid_uuid) {
+                       memcpy(pblk->instance_uuid, smeta->header.uuid, 16);
+                       valid_uuid = 1;
+               }
+
+               if (memcmp(pblk->instance_uuid, smeta->header.uuid, 16)) {
+                       pr_debug("pblk: ignore line %u due to uuid mismatch\n",
+                                       i);
+                       continue;
+               }
+
+               /* Update line metadata */
+               spin_lock(&line->lock);
+               line->id = le32_to_cpu(line->smeta->header.id);
+               line->type = le16_to_cpu(line->smeta->header.type);
+               line->seq_nr = le64_to_cpu(line->smeta->seq_nr);
+               spin_unlock(&line->lock);
+
+               /* Update general metadata */
+               spin_lock(&l_mg->free_lock);
+               if (line->seq_nr >= l_mg->d_seq_nr)
+                       l_mg->d_seq_nr = line->seq_nr + 1;
+               l_mg->nr_free_lines--;
+               spin_unlock(&l_mg->free_lock);
+
+               if (pblk_line_recov_alloc(pblk, line))
+                       goto out;
+
+               pblk_recov_line_add_ordered(&recov_list, line);
+               found_lines++;
+               /* Print the sequence number in CPU byte order */
+               pr_debug("pblk: recovering data line %d, seq:%llu\n",
+                               line->id, le64_to_cpu(smeta->seq_nr));
+       }
+
+       if (!found_lines) {
+               pblk_setup_uuid(pblk);
+
+               spin_lock(&l_mg->free_lock);
+               WARN_ON_ONCE(!test_and_clear_bit(meta_line,
+                                                       &l_mg->meta_bitmap));
+               spin_unlock(&l_mg->free_lock);
+
+               goto out;
+       }
+
+       /* Verify closed blocks and recover this portion of L2P table*/
+       list_for_each_entry_safe(line, tline, &recov_list, list) {
+               int off, nr_bb;
+
+               recovered_lines++;
+               /* Calculate where emeta starts based on the line bb */
+               off = lm->sec_per_line - lm->emeta_sec;
+               nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
+               off -= nr_bb * geo->sec_per_pl;
+
+               memset(emeta, 0, lm->emeta_len);
+               line->emeta = emeta;
+               line->emeta_ssec = off;
+
+               if (pblk_line_read_emeta(pblk, line)) {
+                       pblk_recov_l2p_from_oob(pblk, line);
+                       goto next;
+               }
+
+               if (pblk_recov_l2p_from_emeta(pblk, line))
+                       pblk_recov_l2p_from_oob(pblk, line);
+
+next:
+               if (pblk_line_is_full(line)) {
+                       struct list_head *move_list;
+
+                       spin_lock(&line->lock);
+                       line->state = PBLK_LINESTATE_CLOSED;
+                       move_list = pblk_line_gc_list(pblk, line);
+                       spin_unlock(&line->lock);
+
+                       spin_lock(&l_mg->gc_lock);
+                       list_move_tail(&line->list, move_list);
+                       spin_unlock(&l_mg->gc_lock);
+
+                       mempool_free(line->map_bitmap, pblk->line_meta_pool);
+                       line->map_bitmap = NULL;
+                       line->smeta = NULL;
+                       line->emeta = NULL;
+               } else {
+                       if (open_lines > 1)
+                               pr_err("pblk: failed to recover L2P\n");
+
+                       open_lines++;
+                       line->meta_line = meta_line;
+                       data_line = line;
+               }
+       }
+
+       spin_lock(&l_mg->free_lock);
+       if (!open_lines) {
+               /* No open line kept the metadata buffers; release them */
+               WARN_ON_ONCE(!test_and_clear_bit(meta_line,
+                                                       &l_mg->meta_bitmap));
+               pblk_line_replace_data(pblk);
+       } else {
+               /* Allocate next line for preparation */
+               l_mg->data_next = pblk_line_get(pblk);
+               if (l_mg->data_next) {
+                       l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
+                       l_mg->data_next->type = PBLK_LINETYPE_DATA;
+                       is_next = 1;
+               }
+       }
+       spin_unlock(&l_mg->free_lock);
+
+       if (is_next) {
+               pblk_line_erase(pblk, l_mg->data_next);
+               pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
+       }
+
+out:
+       if (found_lines != recovered_lines)
+               pr_err("pblk: failed to recover all found lines %d/%d\n",
+                                               found_lines, recovered_lines);
+
+       return data_line;
+}
+
+/*
+ * Pad until smeta can be read on current data line
+ *
+ * Pads all sectors still left on the current data line and, if padding
+ * succeeds, closes the line. Allocation failures make the function return
+ * silently; a padding failure is logged.
+ */
+void pblk_recov_pad(struct pblk *pblk)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_recov_alloc p;
+       struct pblk_line *line;
+
+       spin_lock(&l_mg->free_lock);
+       line = l_mg->data_line;
+       spin_unlock(&l_mg->free_lock);
+
+       p.rqd = pblk_alloc_rqd(pblk, READ);
+       if (IS_ERR(p.rqd))
+               return;
+
+       p.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+                                                       &p.dma_meta_list);
+       if (!p.meta_list)
+               goto free_rqd;
+
+       /* The ppa list lives in the same DMA buffer, after the metadata */
+       p.ppa_list = (void *)(p.meta_list) + pblk_dma_meta_size;
+       p.dma_ppa_list = p.dma_meta_list + pblk_dma_meta_size;
+
+       p.data = kcalloc(pblk->max_write_pgs, geo->sec_size, GFP_KERNEL);
+       if (!p.data)
+               goto free_meta_list;
+
+       if (pblk_recov_pad_oob(pblk, line, p, line->left_msecs))
+               pr_err("pblk: Tear down padding failed\n");
+       else
+               pblk_line_close(pblk, line);
+
+       kfree(p.data);
+free_meta_list:
+       nvm_dev_dma_free(dev->parent, p.meta_list, p.dma_meta_list);
+free_rqd:
+       pblk_free_rqd(pblk, p.rqd, READ);
+}
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c

new file mode 100644 (file)

index 0000000..4042162
--- /dev/null
+++ b/drivers/lightnvm/pblk-rl.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *                  Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * pblk-rl.c - pblk's rate limiter for user I/O
+ *
+ */
+
+#include "pblk.h"
+
+/* (Re)arm the user I/O activity timer to expire 5000 ms from now. */
+static void pblk_rl_kick_u_timer(struct pblk_rl *rl)
+{
+       unsigned long expires = jiffies + msecs_to_jiffies(5000);
+
+       mod_timer(&rl->u_timer, expires);
+}
+
+/*
+ * Return non-zero when nr_entries more user entries still fit under the
+ * current user quota (rb_user_max), zero otherwise.
+ */
+int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries)
+{
+       int in_use = atomic_read(&rl->rb_user_cnt);
+
+       return in_use + nr_entries <= rl->rb_user_max;
+}
+
+/*
+ * Return non-zero when GC may insert nr_entries more entries: either they fit
+ * under the GC quota (rb_gc_max), or no user I/O is currently flagged active.
+ */
+int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries)
+{
+       int in_use = atomic_read(&rl->rb_gc_cnt);
+       int user_active = READ_ONCE(rl->rb_user_active);
+
+       /* If there is no user I/O let GC take over space on the write buffer */
+       if (!user_active)
+               return 1;
+
+       return in_use + nr_entries <= rl->rb_gc_max;
+}
+
+/*
+ * Account nr_entries new user entries, flag user I/O as active and re-arm the
+ * activity timer.
+ */
+void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries)
+{
+       atomic_add(nr_entries, &rl->rb_user_cnt);
+
+       /* Release user I/O state. Protect from GC */
+       smp_store_release(&rl->rb_user_active, 1);
+       pblk_rl_kick_u_timer(rl);
+}
+
+/* Account nr_entries new GC entries against the GC counter. */
+void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries)
+{
+       atomic_add(nr_entries, &rl->rb_gc_cnt);
+}
+
+/* Subtract nr_user user entries and nr_gc GC entries from their counters. */
+void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc)
+{
+       atomic_sub(nr_user, &rl->rb_user_cnt);
+       atomic_sub(nr_gc, &rl->rb_gc_cnt);
+}
+
+/*
+ * Return the current free block count. The counter is read as an atomic int
+ * and widened to unsigned long on return.
+ */
+unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl)
+{
+       return atomic_read(&rl->free_blocks);
+}
+
+/*
+ * Recompute the user and GC write buffer quotas from the total number of free
+ * blocks in the pblk instance and return the resulting rate-limiter state.
+ *
+ * At or above the high threshold the user gets the whole budget minus the GC
+ * reserve. Below it the user quota is derived from the number of free blocks
+ * and GC receives the rest of the budget, but never less than its reserve.
+ */
+static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long budget)
+{
+       unsigned long free_blocks = pblk_rl_nr_free_blks(rl);
+       int shift, user_windows, gc_max;
+
+       if (free_blocks >= rl->high) {
+               rl->rb_user_max = budget - rl->rb_gc_rsv;
+               rl->rb_gc_max = rl->rb_gc_rsv;
+               rl->rb_state = PBLK_RL_HIGH;
+               return rl->rb_state;
+       }
+
+       /* Scale the free block count down to a number of user windows */
+       shift = rl->high_pw - rl->rb_windows_pw;
+       user_windows = free_blocks >> shift;
+       rl->rb_user_max = user_windows << PBLK_MAX_REQ_ADDRS_PW;
+
+       gc_max = budget - rl->rb_user_max;
+       rl->rb_gc_max = max(gc_max, rl->rb_gc_rsv);
+
+       rl->rb_state = (free_blocks > rl->low) ? PBLK_RL_MID : PBLK_RL_LOW;
+
+       return rl->rb_state;
+}
+
+/* Set both the GC quota and the GC reserve to rsv. */
+void pblk_rl_set_gc_rsc(struct pblk_rl *rl, int rsv)
+{
+       rl->rb_gc_max = rsv;
+       rl->rb_gc_rsv = rl->rb_gc_max;
+}
+
+/*
+ * Return a line's blocks to the free block count, refresh the rate-limiter
+ * quotas and start or stop GC according to the new state.
+ *
+ * The state returned by pblk_rl_update_rates() is exactly one of
+ * PBLK_RL_HIGH, PBLK_RL_MID or PBLK_RL_LOW, so it has to be compared against
+ * MID and LOW individually; comparing it against (MID | LOW) does not express
+ * "MID or LOW".
+ */
+void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line)
+{
+       struct pblk *pblk = container_of(rl, struct pblk, rl);
+       int ret;
+
+       atomic_add(line->blk_in_line, &rl->free_blocks);
+       /* Rates will not change that often - no need to lock update */
+       ret = pblk_rl_update_rates(rl, rl->rb_budget);
+
+       if (ret == PBLK_RL_MID || ret == PBLK_RL_LOW)
+               pblk_gc_should_start(pblk);
+       else
+               pblk_gc_should_stop(pblk);
+}
+
+/*
+ * Remove a line's blocks from the free block count, refresh the rate-limiter
+ * quotas and start or stop GC according to the new state.
+ *
+ * The state returned by pblk_rl_update_rates() is exactly one of
+ * PBLK_RL_HIGH, PBLK_RL_MID or PBLK_RL_LOW, so it has to be compared against
+ * MID and LOW individually; comparing it against (MID | LOW) does not express
+ * "MID or LOW".
+ */
+void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line)
+{
+       struct pblk *pblk = container_of(rl, struct pblk, rl);
+       int ret;
+
+       atomic_sub(line->blk_in_line, &rl->free_blocks);
+
+       /* Rates will not change that often - no need to lock update */
+       ret = pblk_rl_update_rates(rl, rl->rb_budget);
+       if (ret == PBLK_RL_MID || ret == PBLK_RL_LOW)
+               pblk_gc_should_start(pblk);
+       else
+               pblk_gc_should_stop(pblk);
+}
+
+/* Return the rate limiter's high free-block threshold (rl->high). */
+int pblk_rl_gc_thrs(struct pblk_rl *rl)
+{
+       return rl->high;
+}
+
+/* Return the current user quota on the write buffer (rb_user_max). */
+int pblk_rl_sysfs_rate_show(struct pblk_rl *rl)
+{
+       return rl->rb_user_max;
+}
+
+/*
+ * Timer callback: clears the user-active flag. The timer is re-armed by
+ * pblk_rl_user_in(), so this only fires once no user entries have been
+ * accounted for the length of the timeout.
+ */
+static void pblk_rl_u_timer(unsigned long data)
+{
+       struct pblk_rl *rl = (struct pblk_rl *)data;
+
+       /* Release user I/O state. Protect from GC */
+       smp_store_release(&rl->rb_user_active, 0);
+}
+
+/*
+ * Tear down the rate limiter by deactivating the user activity timer.
+ *
+ * NOTE(review): del_timer() does not wait for a handler that is already
+ * running; if rl can be freed right after this call, del_timer_sync() may be
+ * the safer choice - confirm the calling context first.
+ */
+void pblk_rl_free(struct pblk_rl *rl)
+{
+       del_timer(&rl->u_timer);
+}
+
+/*
+ * Initialise the rate limiter for a write buffer of budget entries.
+ *
+ * Reads rl->total_blocks, so the caller must have set it beforehand. The GC
+ * reserve (rb_gc_rsv) is not touched here; it is set through
+ * pblk_rl_set_gc_rsc().
+ */
+void pblk_rl_init(struct pblk_rl *rl, int budget)
+{
+       /* This will always be a power-of-2 */
+       unsigned int nr_windows = budget / PBLK_MAX_REQ_ADDRS;
+
+       /* Free block thresholds and their orders */
+       rl->high = rl->total_blocks / PBLK_USER_HIGH_THRS;
+       rl->low = rl->total_blocks / PBLK_USER_LOW_THRS;
+       rl->high_pw = get_count_order(rl->high);
+       rl->rb_windows_pw = get_count_order(nr_windows) + 1;
+
+       /* To start with, all buffer is available to user I/O writers */
+       rl->rb_budget = budget;
+       rl->rb_user_max = budget;
+       rl->rb_gc_max = 0;
+       rl->rb_state = PBLK_RL_HIGH;
+       atomic_set(&rl->rb_user_cnt, 0);
+       atomic_set(&rl->rb_gc_cnt, 0);
+
+       /* User I/O starts out flagged as inactive */
+       setup_timer(&rl->u_timer, pblk_rl_u_timer, (unsigned long)rl);
+       rl->rb_user_active = 0;
+}
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c

new file mode 100644 (file)

index 0000000..f0af1d1
--- /dev/null
+++ b/drivers/lightnvm/pblk-sysfs.c
@@ -0,0 +1,507 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *                  Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Implementation of a physical block-device target for Open-channel SSDs.
+ *
+ * pblk-sysfs.c - pblk's sysfs
+ *
+ */
+
+#include "pblk.h"
+
+/*
+ * sysfs "write_luns": print one line per LUN with its position, channel, LUN
+ * id and a flag that is 1 when the LUN's write semaphore could not be taken
+ * (i.e. it is currently held) and 0 otherwise.
+ *
+ * scnprintf() is used so that sz can never exceed the page: snprintf()
+ * returns the untruncated length, so on a device with many LUNs sz could grow
+ * past PAGE_SIZE and the next "PAGE_SIZE - sz" would wrap around.
+ */
+static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_lun *rlun;
+       ssize_t sz = 0;
+       int i;
+
+       for (i = 0; i < geo->nr_luns; i++) {
+               int active = 1;
+
+               rlun = &pblk->luns[i];
+               /* down_trylock() returns 0 when the semaphore was acquired */
+               if (!down_trylock(&rlun->wr_sem)) {
+                       active = 0;
+                       up(&rlun->wr_sem);
+               }
+               sz += scnprintf(page + sz, PAGE_SIZE - sz,
+                               "pblk: pos:%d, ch:%d, lun:%d - %d\n",
+                                       i,
+                                       rlun->bppa.g.ch,
+                                       rlun->bppa.g.lun,
+                                       active);
+       }
+
+       return sz;
+}
+
+/*
+ * sysfs "rate_limiter": one-line snapshot of the rate limiter. In order:
+ * user count/quota, GC count/quota/reserve, (state/budget), the low and high
+ * free-block thresholds, free/total blocks and the user-active flag.
+ */
+static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page)
+{
+       struct pblk_rl *rl = &pblk->rl;
+       struct nvm_geo *geo = &pblk->dev->geo;
+       int free_blocks = atomic_read(&rl->free_blocks);
+       int rb_user_max = rl->rb_user_max;
+       int rb_user_cnt = atomic_read(&rl->rb_user_cnt);
+       int rb_gc_max = rl->rb_gc_max;
+       int rb_gc_rsv = rl->rb_gc_rsv;
+       int rb_gc_cnt = atomic_read(&rl->rb_gc_cnt);
+       int rb_budget = rl->rb_budget;
+       int rb_state = rl->rb_state;
+       int total_blocks = geo->blks_per_lun * geo->nr_luns;
+
+       return snprintf(page, PAGE_SIZE,
+               "u:%u/%u,gc:%u/%u/%u(%u/%u)(stop:<%u,full:>%u,free:%d/%d)-%d\n",
+                               rb_user_cnt, rb_user_max,
+                               rb_gc_cnt, rb_gc_max, rb_gc_rsv,
+                               rb_state, rb_budget,
+                               rl->low, rl->high,
+                               free_blocks, total_blocks,
+                               READ_ONCE(rl->rb_user_active));
+}
+
+/*
+ * sysfs "gc_state": print the two GC flags filled in by
+ * pblk_gc_sysfs_state_show().
+ */
+static ssize_t pblk_sysfs_gc_state_show(struct pblk *pblk, char *page)
+{
+       int gc_enabled, gc_active;
+
+       pblk_gc_sysfs_state_show(pblk, &gc_enabled, &gc_active);
+       return snprintf(page, PAGE_SIZE, "gc_enabled=%d, gc_active=%d\n",
+                                       gc_enabled, gc_active);
+}
+
+static ssize_t pblk_sysfs_stats(struct pblk *pblk, char *page)
+{
+       ssize_t sz;
+
+       sz = snprintf(page, PAGE_SIZE,
+                       "read_failed=%lu, read_high_ecc=%lu, read_empty=%lu, read_failed_gc=%lu, write_failed=%lu, erase_failed=%lu\n",
+                       atomic_long_read(&pblk->read_failed),
+                       atomic_long_read(&pblk->read_high_ecc),
+                       atomic_long_read(&pblk->read_empty),
+                       atomic_long_read(&pblk->read_failed_gc),
+                       atomic_long_read(&pblk->write_failed),
+                       atomic_long_read(&pblk->erase_failed));
+
+       return sz;
+}
+
+/* sysfs "write_buffer": delegate to the write buffer's own sysfs dump. */
+static ssize_t pblk_sysfs_write_buffer(struct pblk *pblk, char *page)
+{
+       return pblk_rb_sysfs(&pblk->rwb, page);
+}
+
+/*
+ * sysfs "ppa_format": print the address format as offset/length pairs, first
+ * the one held by pblk ("g:", with its bit size) and then the one reported in
+ * the device geometry ("d:").
+ *
+ * scnprintf() keeps sz equal to the number of bytes actually written, so the
+ * "page + sz" / "PAGE_SIZE - sz" accumulation stays within the page.
+ */
+static ssize_t pblk_sysfs_ppaf(struct pblk *pblk, char *page)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       ssize_t sz;
+
+       sz = scnprintf(page, PAGE_SIZE,
+               "g:(b:%d)blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n",
+               pblk->ppaf_bitsize,
+               pblk->ppaf.blk_offset, geo->ppaf.blk_len,
+               pblk->ppaf.pg_offset, geo->ppaf.pg_len,
+               pblk->ppaf.lun_offset, geo->ppaf.lun_len,
+               pblk->ppaf.ch_offset, geo->ppaf.ch_len,
+               pblk->ppaf.pln_offset, geo->ppaf.pln_len,
+               pblk->ppaf.sec_offset, geo->ppaf.sect_len);
+
+       sz += scnprintf(page + sz, PAGE_SIZE - sz,
+               "d:blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n",
+               geo->ppaf.blk_offset, geo->ppaf.blk_len,
+               geo->ppaf.pg_offset, geo->ppaf.pg_len,
+               geo->ppaf.lun_offset, geo->ppaf.lun_len,
+               geo->ppaf.ch_offset, geo->ppaf.ch_len,
+               geo->ppaf.pln_offset, geo->ppaf.pln_len,
+               geo->ppaf.sect_offset, geo->ppaf.sect_len);
+
+       return sz;
+}
+
+/*
+ * Count the lines on one GC list, tallying data and log lines separately.
+ * Returns the number of lines on the list; *nr_data and *nr_log are
+ * incremented, not reset.
+ */
+static int pblk_sysfs_count_lines(struct list_head *head, int *nr_data,
+                                 int *nr_log)
+{
+       struct pblk_line *line;
+       int nr_lines = 0;
+
+       list_for_each_entry(line, head, list) {
+               if (line->type == PBLK_LINETYPE_DATA)
+                       (*nr_data)++;
+               else if (line->type == PBLK_LINETYPE_LOG)
+                       (*nr_log)++;
+               nr_lines++;
+       }
+
+       return nr_lines;
+}
+
+/*
+ * sysfs "lines": print line geometry, per-list line counts, GC list
+ * occupancy and the state of the current data line.
+ *
+ * Everything read from the free list and from the current data line is
+ * sampled in a single free_lock section, so the free list is never walked
+ * under a different lock and the data line fields belong to the same line as
+ * the printed id. The GC lists, bad list and corrupt list are counted under
+ * gc_lock.
+ */
+static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_line *line;
+       ssize_t sz;
+       int nr_free_lines;
+       int cur_data, cur_log;
+       int free_line_cnt = 0, closed_line_cnt;
+       int d_line_cnt = 0, l_line_cnt = 0;
+       int gc_full, gc_high, gc_mid, gc_low, gc_empty;
+       int bad = 0, cor = 0;
+       int msecs = 0, ssecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0;
+       int map_weight = 0, meta_weight = 0;
+
+       spin_lock(&l_mg->free_lock);
+       cur_data = (l_mg->data_line) ? l_mg->data_line->id : -1;
+       cur_log = (l_mg->log_line) ? l_mg->log_line->id : -1;
+       nr_free_lines = l_mg->nr_free_lines;
+
+       list_for_each_entry(line, &l_mg->free_list, list)
+               free_line_cnt++;
+
+       if (l_mg->data_line) {
+               cur_sec = l_mg->data_line->cur_sec;
+               msecs = l_mg->data_line->left_msecs;
+               ssecs = l_mg->data_line->left_ssecs;
+               vsc = l_mg->data_line->vsc;
+               sec_in_line = l_mg->data_line->sec_in_line;
+               meta_weight = bitmap_weight(&l_mg->meta_bitmap,
+                                                       PBLK_DATA_LINES);
+               map_weight = bitmap_weight(l_mg->data_line->map_bitmap,
+                                                       lm->sec_per_line);
+       }
+       spin_unlock(&l_mg->free_lock);
+
+       spin_lock(&l_mg->gc_lock);
+       gc_full = pblk_sysfs_count_lines(&l_mg->gc_full_list,
+                                               &d_line_cnt, &l_line_cnt);
+       gc_high = pblk_sysfs_count_lines(&l_mg->gc_high_list,
+                                               &d_line_cnt, &l_line_cnt);
+       gc_mid = pblk_sysfs_count_lines(&l_mg->gc_mid_list,
+                                               &d_line_cnt, &l_line_cnt);
+       gc_low = pblk_sysfs_count_lines(&l_mg->gc_low_list,
+                                               &d_line_cnt, &l_line_cnt);
+       gc_empty = pblk_sysfs_count_lines(&l_mg->gc_empty_list,
+                                               &d_line_cnt, &l_line_cnt);
+       /* Every line on a GC list counts as closed */
+       closed_line_cnt = gc_full + gc_high + gc_mid + gc_low + gc_empty;
+
+       list_for_each_entry(line, &l_mg->bad_list, list)
+               bad++;
+       list_for_each_entry(line, &l_mg->corrupt_list, list)
+               cor++;
+       spin_unlock(&l_mg->gc_lock);
+
+       if (nr_free_lines != free_line_cnt)
+               pr_err("pblk: corrupted free line list\n");
+
+       sz = scnprintf(page, PAGE_SIZE,
+               "line: nluns:%d, nblks:%d, nsecs:%d\n",
+               geo->nr_luns, lm->blk_per_line, lm->sec_per_line);
+
+       sz += scnprintf(page + sz, PAGE_SIZE - sz,
+               "lines:d:%d,l:%d-f:%d(%d),b:%d,co:%d,c:%d(d:%d,l:%d)t:%d\n",
+                                       cur_data, cur_log,
+                                       free_line_cnt, nr_free_lines, bad, cor,
+                                       closed_line_cnt,
+                                       d_line_cnt, l_line_cnt,
+                                       l_mg->nr_lines);
+
+       sz += scnprintf(page + sz, PAGE_SIZE - sz,
+               "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, queue:%d\n",
+                       gc_full, gc_high, gc_mid, gc_low, gc_empty,
+                       atomic_read(&pblk->gc.inflight_gc));
+
+       sz += scnprintf(page + sz, PAGE_SIZE - sz,
+               "data (%d) cur:%d, left:%d/%d, vsc:%d, s:%d, map:%d/%d (%d)\n",
+                       cur_data, cur_sec, msecs, ssecs, vsc, sec_in_line,
+                       map_weight, lm->sec_per_line, meta_weight);
+
+       return sz;
+}
+
+/*
+ * sysfs "lines_info": print the static line metadata layout - smeta and
+ * emeta sizes, bitmap lengths and blocks/sectors per line.
+ *
+ * scnprintf() keeps sz equal to the number of bytes actually written, so the
+ * "page + sz" / "PAGE_SIZE - sz" accumulation stays within the page.
+ */
+static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line_meta *lm = &pblk->lm;
+       ssize_t sz;
+
+       sz = scnprintf(page, PAGE_SIZE,
+                               "smeta - len:%d, secs:%d\n",
+                                       lm->smeta_len, lm->smeta_sec);
+       sz += scnprintf(page + sz, PAGE_SIZE - sz,
+                               "emeta - len:%d, sec:%d, bb_start:%d\n",
+                                       lm->emeta_len, lm->emeta_sec,
+                                       lm->emeta_bb);
+       sz += scnprintf(page + sz, PAGE_SIZE - sz,
+                               "bitmap lengths: sec:%d, blk:%d, lun:%d\n",
+                                       lm->sec_bitmap_len,
+                                       lm->blk_bitmap_len,
+                                       lm->lun_bitmap_len);
+       sz += scnprintf(page + sz, PAGE_SIZE - sz,
+                               "blk_line:%d, sec_line:%d, sec_blk:%d\n",
+                                       lm->blk_per_line,
+                                       lm->sec_per_line,
+                                       geo->sec_per_blk);
+
+       return sz;
+}
+
+#ifdef CONFIG_NVM_DEBUG
+/*
+ * sysfs "stats" (debug builds only): print the debug counters as a single
+ * tab-separated line, in this order: inflight_writes, inflight_reads,
+ * req_writes, nr_flush, padded_writes, padded_wb, sub_writes, sync_writes,
+ * compl_writes, recov_writes, recov_gc_writes, recov_gc_reads, sync_reads.
+ */
+static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page)
+{
+       return snprintf(page, PAGE_SIZE,
+               "%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\n",
+                       atomic_long_read(&pblk->inflight_writes),
+                       atomic_long_read(&pblk->inflight_reads),
+                       atomic_long_read(&pblk->req_writes),
+                       atomic_long_read(&pblk->nr_flush),
+                       atomic_long_read(&pblk->padded_writes),
+                       atomic_long_read(&pblk->padded_wb),
+                       atomic_long_read(&pblk->sub_writes),
+                       atomic_long_read(&pblk->sync_writes),
+                       atomic_long_read(&pblk->compl_writes),
+                       atomic_long_read(&pblk->recov_writes),
+                       atomic_long_read(&pblk->recov_gc_writes),
+                       atomic_long_read(&pblk->recov_gc_reads),
+                       atomic_long_read(&pblk->sync_reads));
+}
+#endif
+
+/*
+ * sysfs "gc_rl_max" store: parse an unsigned number (newline-terminated
+ * within len) and install it as the GC reserve and quota.
+ *
+ * The value is parsed into an unsigned int, which is what kstrtouint()
+ * expects, and rejected when it does not fit the int taken by
+ * pblk_rl_set_gc_rsc().
+ *
+ * Returns len on success, -EINVAL on malformed or out-of-range input.
+ */
+static ssize_t pblk_sysfs_rate_store(struct pblk *pblk, const char *page,
+                                    size_t len)
+{
+       struct pblk_gc *gc = &pblk->gc;
+       size_t c_len;
+       unsigned int value;
+
+       /* Require a newline inside the written buffer */
+       c_len = strcspn(page, "\n");
+       if (c_len >= len)
+               return -EINVAL;
+
+       if (kstrtouint(page, 0, &value))
+               return -EINVAL;
+
+       if (value > INT_MAX)
+               return -EINVAL;
+
+       spin_lock(&gc->lock);
+       pblk_rl_set_gc_rsc(&pblk->rl, value);
+       spin_unlock(&gc->lock);
+
+       return len;
+}
+
+/*
+ * sysfs "gc_force" store: parse 0 or 1 (newline-terminated within len) and
+ * pass it to pblk_gc_sysfs_force().
+ *
+ * The value is parsed into an unsigned int, which is what kstrtouint()
+ * expects, so only the upper bound needs checking.
+ *
+ * Returns len on success, -EINVAL on malformed or out-of-range input.
+ */
+static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page,
+                                  size_t len)
+{
+       size_t c_len;
+       unsigned int force;
+
+       /* Require a newline inside the written buffer */
+       c_len = strcspn(page, "\n");
+       if (c_len >= len)
+               return -EINVAL;
+
+       if (kstrtouint(page, 0, &force))
+               return -EINVAL;
+
+       if (force > 1)
+               return -EINVAL;
+
+       pblk_gc_sysfs_force(pblk, force);
+
+       return len;
+}
+
+/*
+ * sysfs attributes exported by pblk. Attributes with mode 0444 are read-only
+ * and are served by pblk_sysfs_show(); attributes with mode 0200 are
+ * write-only and are served by pblk_sysfs_store(). Both dispatch on .name, so
+ * the names here must match the strings compared there.
+ */
+static struct attribute sys_write_luns = {
+       .name = "write_luns",
+       .mode = 0444,
+};
+
+static struct attribute sys_rate_limiter_attr = {
+       .name = "rate_limiter",
+       .mode = 0444,
+};
+
+static struct attribute sys_gc_state = {
+       .name = "gc_state",
+       .mode = 0444,
+};
+
+static struct attribute sys_errors_attr = {
+       .name = "errors",
+       .mode = 0444,
+};
+
+static struct attribute sys_rb_attr = {
+       .name = "write_buffer",
+       .mode = 0444,
+};
+
+static struct attribute sys_stats_ppaf_attr = {
+       .name = "ppa_format",
+       .mode = 0444,
+};
+
+static struct attribute sys_lines_attr = {
+       .name = "lines",
+       .mode = 0444,
+};
+
+static struct attribute sys_lines_info_attr = {
+       .name = "lines_info",
+       .mode = 0444,
+};
+
+static struct attribute sys_gc_force = {
+       .name = "gc_force",
+       .mode = 0200,
+};
+
+static struct attribute sys_gc_rl_max = {
+       .name = "gc_rl_max",
+       .mode = 0200,
+};
+
+#ifdef CONFIG_NVM_DEBUG
+/* Only exported on debug builds */
+static struct attribute sys_stats_debug_attr = {
+       .name = "stats",
+       .mode = 0444,
+};
+#endif
+
+/* NULL-terminated list of all attributes, used as the ktype's defaults */
+static struct attribute *pblk_attrs[] = {
+       &sys_write_luns,
+       &sys_rate_limiter_attr,
+       &sys_errors_attr,
+       &sys_gc_state,
+       &sys_gc_force,
+       &sys_gc_rl_max,
+       &sys_rb_attr,
+       &sys_stats_ppaf_attr,
+       &sys_lines_attr,
+       &sys_lines_info_attr,
+#ifdef CONFIG_NVM_DEBUG
+       &sys_stats_debug_attr,
+#endif
+       NULL,
+};
+
+static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr,
+                              char *buf)
+{
+       struct pblk *pblk = container_of(kobj, struct pblk, kobj);
+
+       if (strcmp(attr->name, "rate_limiter") == 0)
+               return pblk_sysfs_rate_limiter(pblk, buf);
+       else if (strcmp(attr->name, "write_luns") == 0)
+               return pblk_sysfs_luns_show(pblk, buf);
+       else if (strcmp(attr->name, "gc_state") == 0)
+               return pblk_sysfs_gc_state_show(pblk, buf);
+       else if (strcmp(attr->name, "errors") == 0)
+               return pblk_sysfs_stats(pblk, buf);
+       else if (strcmp(attr->name, "write_buffer") == 0)
+               return pblk_sysfs_write_buffer(pblk, buf);
+       else if (strcmp(attr->name, "ppa_format") == 0)
+               return pblk_sysfs_ppaf(pblk, buf);
+       else if (strcmp(attr->name, "lines") == 0)
+               return pblk_sysfs_lines(pblk, buf);
+       else if (strcmp(attr->name, "lines_info") == 0)
+               return pblk_sysfs_lines_info(pblk, buf);
+#ifdef CONFIG_NVM_DEBUG
+       else if (strcmp(attr->name, "stats") == 0)
+               return pblk_sysfs_stats_debug(pblk, buf);
+#endif
+       return 0;
+}
+
+/*
+ * sysfs store entry point: dispatch on the attribute name to the matching
+ * parser. Returns the parser's result, or 0 for an unknown name.
+ */
+static ssize_t pblk_sysfs_store(struct kobject *kobj, struct attribute *attr,
+                               const char *buf, size_t len)
+{
+       struct pblk *pblk = container_of(kobj, struct pblk, kobj);
+       const char *name = attr->name;
+
+       if (!strcmp(name, "gc_rl_max"))
+               return pblk_sysfs_rate_store(pblk, buf, len);
+       if (!strcmp(name, "gc_force"))
+               return pblk_sysfs_gc_force(pblk, buf, len);
+
+       return 0;
+}
+
+/* Route all attribute reads and writes through the two dispatchers */
+static const struct sysfs_ops pblk_sysfs_ops = {
+       .show = pblk_sysfs_show,
+       .store = pblk_sysfs_store,
+};
+
+/*
+ * kobject type for the "pblk" sysfs directory.
+ * NOTE(review): no .release callback is set - presumably because the kobject
+ * is embedded in struct pblk and freed with it; confirm.
+ */
+static struct kobj_type pblk_ktype = {
+       .sysfs_ops      = &pblk_sysfs_ops,
+       .default_attrs  = pblk_attrs,
+};
+
+/*
+ * Register the "pblk" kobject below the target disk's device and announce it
+ * to user space.
+ *
+ * When kobject_init_and_add() fails the kobject has still been initialised,
+ * so the reference is dropped with kobject_put() before returning, as the
+ * kobject API requires.
+ *
+ * NOTE(review): the explicit kobject_get() on the parent has no matching put
+ * in this file - confirm whether that reference is released elsewhere.
+ *
+ * Returns 0 on success or the error from kobject_init_and_add().
+ */
+int pblk_sysfs_init(struct gendisk *tdisk)
+{
+       struct pblk *pblk = tdisk->private_data;
+       struct device *parent_dev = disk_to_dev(pblk->disk);
+       int ret;
+
+       ret = kobject_init_and_add(&pblk->kobj, &pblk_ktype,
+                                       kobject_get(&parent_dev->kobj),
+                                       "%s", "pblk");
+       if (ret) {
+               pr_err("pblk: could not register %s/pblk\n",
+                                               tdisk->disk_name);
+               kobject_put(&pblk->kobj);
+               return ret;
+       }
+
+       kobject_uevent(&pblk->kobj, KOBJ_ADD);
+       return 0;
+}
+
+/*
+ * Announce removal of the "pblk" kobject, unlink it from sysfs and drop the
+ * reference on it.
+ */
+void pblk_sysfs_exit(struct gendisk *tdisk)
+{
+       struct pblk *pblk = tdisk->private_data;
+
+       kobject_uevent(&pblk->kobj, KOBJ_REMOVE);
+       kobject_del(&pblk->kobj);
+       kobject_put(&pblk->kobj);
+}
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c

new file mode 100644 (file)

index 0000000..ee57db9
--- /dev/null
+++ b/drivers/lightnvm/pblk-write.c
@@ -0,0 +1,411 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *                  Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * pblk-write.c - pblk's write path from write buffer to media
+ */
+
+#include "pblk.h"
+
+/*
+ * Account one synced sector on the line; once no sectors are left to sync,
+ * schedule the line to be closed from a work item.
+ */
+static void pblk_sync_line(struct pblk *pblk, struct pblk_line *line)
+{
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_inc(&pblk->sync_writes);
+#endif
+
+       /* Counter protected by rb sync lock */
+       if (--line->left_ssecs == 0)
+               pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws);
+}
+
+static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
+                                   struct pblk_c_ctx *c_ctx)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct bio *original_bio;
+       unsigned long ret;
+       int i;
+
+       for (i = 0; i < c_ctx->nr_valid; i++) {
+               struct pblk_w_ctx *w_ctx;
+               struct ppa_addr p;
+               struct pblk_line *line;
+
+               w_ctx = pblk_rb_w_ctx(&pblk->rwb, c_ctx->sentry + i);
+
+               p = rqd->ppa_list[i];
+               line = &pblk->lines[pblk_dev_ppa_to_line(p)];
+               pblk_sync_line(pblk, line);
+
+               while ((original_bio = bio_list_pop(&w_ctx->bios)))
+                       bio_endio(original_bio);
+       }
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_add(c_ctx->nr_valid, &pblk->compl_writes);
+#endif
+
+       ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid);
+
+       if (rqd->meta_list)
+               nvm_dev_dma_free(dev->parent, rqd->meta_list,
+                                                       rqd->dma_meta_list);
+
+       bio_put(rqd->bio);
+       pblk_free_rqd(pblk, rqd, WRITE);
+
+       return ret;
+}
+
+/*
+ * Finish a write whose completion had been parked on a list: unlink its
+ * context, then complete it like any other write.
+ */
+static unsigned long pblk_end_queued_w_bio(struct pblk *pblk,
+                                          struct nvm_rq *rqd,
+                                          struct pblk_c_ctx *c_ctx)
+{
+       list_del(&c_ctx->list);
+       return pblk_end_w_bio(pblk, rqd, c_ctx);
+}
+
+/*
+ * Complete a write request in write buffer order.
+ *
+ * If the request starts at the position returned by pblk_rb_sync_init(), it
+ * is finished right away, and so is every parked completion on compl_list
+ * that starts where the previous one ended. Otherwise the request is parked
+ * on compl_list until the requests in front of it have completed.
+ */
+static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd,
+                               struct pblk_c_ctx *c_ctx)
+{
+       struct pblk_c_ctx *c, *r;
+       unsigned long flags;
+       unsigned long pos;
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes);
+#endif
+
+       pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap);
+
+       pos = pblk_rb_sync_init(&pblk->rwb, &flags);
+       if (pos == c_ctx->sentry) {
+               pos = pblk_end_w_bio(pblk, rqd, c_ctx);
+
+               /* Rescan from the head each time: pos moved and an entry
+                * was removed from the list.
+                */
+retry:
+               list_for_each_entry_safe(c, r, &pblk->compl_list, list) {
+                       rqd = nvm_rq_from_c_ctx(c);
+                       if (c->sentry == pos) {
+                               pos = pblk_end_queued_w_bio(pblk, rqd, c);
+                               goto retry;
+                       }
+               }
+       } else {
+               /* Out of order: park it until its predecessors are done */
+               WARN_ON(nvm_rq_from_c_ctx(c_ctx) != rqd);
+               list_add_tail(&c_ctx->list, &pblk->compl_list);
+       }
+       pblk_rb_sync_end(&pblk->rwb, &flags);
+}
+
+/* When a write fails, we are not sure whether the block has grown bad or a page
+ * range is more susceptible to write errors. If a high number of pages fail, we
+ * assume that the block is bad and we mark it accordingly. In all cases, we
+ * remap and resubmit the failed entries as fast as possible; if a flush is
+ * waiting on a completion, the whole stack would stall otherwise.
+ *
+ * Every sector flagged in rqd->ppa_status is looked up in the write buffer
+ * and collected on a recovery context, which is then handed to a work item
+ * for resubmission. Unless the recovery context cannot be allocated, the
+ * original request is passed to pblk_complete_write() at the end.
+ *
+ * NOTE(review): the error paths that jump to "out" do not hand the recovery
+ * context back to rec_pool - confirm whether it is released elsewhere.
+ */
+static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
+{
+       void *comp_bits = &rqd->ppa_status;
+       struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+       struct pblk_rec_ctx *recovery;
+       struct ppa_addr *ppa_list = rqd->ppa_list;
+       int nr_ppas = rqd->nr_ppas;
+       unsigned int c_entries;
+       int bit, ret;
+
+       /* A single-sector request carries its address inline */
+       if (unlikely(nr_ppas == 1))
+               ppa_list = &rqd->ppa_addr;
+
+       recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);
+       if (!recovery) {
+               pr_err("pblk: could not allocate recovery context\n");
+               return;
+       }
+       INIT_LIST_HEAD(&recovery->failed);
+
+       bit = -1;
+       while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) {
+               struct pblk_rb_entry *entry;
+               struct ppa_addr ppa;
+
+               /* Logic error */
+               if (bit > c_ctx->nr_valid) {
+                       /* WARN_ONCE() takes the condition first; passing
+                        * only the string would use it as the condition and
+                        * never print it.
+                        */
+                       WARN_ONCE(1, "pblk: corrupted write request\n");
+                       goto out;
+               }
+
+               ppa = ppa_list[bit];
+               entry = pblk_rb_sync_scan_entry(&pblk->rwb, &ppa);
+               if (!entry) {
+                       pr_err("pblk: could not scan entry on write failure\n");
+                       goto out;
+               }
+
+               /* The list is filled first and emptied afterwards. No need for
+                * protecting it with a lock
+                */
+               list_add_tail(&entry->index, &recovery->failed);
+       }
+
+       /* Index of the first failed sector */
+       c_entries = find_first_bit(comp_bits, nr_ppas);
+       ret = pblk_recov_setup_rq(pblk, c_ctx, recovery, comp_bits, c_entries);
+       if (ret) {
+               pr_err("pblk: could not recover from write failure\n");
+               goto out;
+       }
+
+       INIT_WORK(&recovery->ws_rec, pblk_submit_rec);
+       queue_work(pblk->kw_wq, &recovery->ws_rec);
+
+out:
+       pblk_complete_write(pblk, rqd, c_ctx);
+}
+
+static void pblk_end_io_write(struct nvm_rq *rqd)
+{
+       struct pblk *pblk = rqd->private;
+       struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+
+       if (rqd->error) {
+               pblk_log_write_err(pblk, rqd);
+               return pblk_end_w_fail(pblk, rqd);
+       }
+#ifdef CONFIG_NVM_DEBUG
+       else
+               WARN_ONCE(rqd->bio->bi_error, "pblk: corrupted write error\n");
+#endif
+
+       pblk_complete_write(pblk, rqd, c_ctx);
+}
+
+/*
+ * Fill in a request descriptor for a write of nr_secs sectors and allocate
+ * its DMA metadata region. For multi-sector requests the ppa list is placed
+ * pblk_dma_meta_size bytes into that same region.
+ *
+ * Returns 0 on success, -ENOMEM if the DMA region cannot be allocated.
+ */
+static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
+                          unsigned int nr_secs)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+
+       /* Setup write request */
+       rqd->opcode = NVM_OP_PWRITE;
+       rqd->nr_ppas = nr_secs;
+       rqd->flags = pblk_set_progr_mode(pblk, WRITE);
+       rqd->private = pblk;
+       rqd->end_io = pblk_end_io_write;
+
+       rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+                                                       &rqd->dma_meta_list);
+       if (!rqd->meta_list)
+               return -ENOMEM;
+
+       /* A single-sector request does not use a ppa list */
+       if (likely(nr_secs != 1)) {
+               rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
+               rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
+       }
+
+       return 0;
+}
+
+/*
+ * Prepare a write request for the entries described by c_ctx: allocate the
+ * LUN bitmap and the request's DMA region, map the entries to physical
+ * addresses and, when the next data line still has blocks to erase, submit
+ * the erase that the mapping step selected.
+ *
+ * Allocation failures return immediately. They must not reach the erase
+ * handling below, which reads erase_ppa; that variable is only initialised
+ * once both allocations have succeeded. On failure c_ctx->lun_bitmap is
+ * reset so it never points at freed memory.
+ *
+ * Returns 0 on success, -ENOMEM or the error from pblk_alloc_w_rq().
+ */
+static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
+                          struct pblk_c_ctx *c_ctx)
+{
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line *e_line = pblk_line_get_data_next(pblk);
+       struct ppa_addr erase_ppa;
+       unsigned int valid = c_ctx->nr_valid;
+       unsigned int padded = c_ctx->nr_padded;
+       unsigned int nr_secs = valid + padded;
+       unsigned long *lun_bitmap;
+       int ret;
+
+       lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
+       if (!lun_bitmap)
+               return -ENOMEM;
+       c_ctx->lun_bitmap = lun_bitmap;
+
+       ret = pblk_alloc_w_rq(pblk, rqd, nr_secs);
+       if (ret) {
+               kfree(lun_bitmap);
+               c_ctx->lun_bitmap = NULL;
+               return ret;
+       }
+
+       ppa_set_empty(&erase_ppa);
+       if (likely(!e_line || !e_line->left_eblks))
+               pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, 0);
+       else
+               pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
+                                                       valid, &erase_ppa);
+
+       if (unlikely(e_line && !ppa_empty(erase_ppa))) {
+               if (pblk_blk_erase_async(pblk, erase_ppa)) {
+                       struct nvm_tgt_dev *dev = pblk->dev;
+                       struct nvm_geo *geo = &dev->geo;
+                       int bit;
+
+                       /* The erase was not submitted: undo its accounting */
+                       e_line->left_eblks++;
+                       bit = erase_ppa.g.lun * geo->nr_chnls + erase_ppa.g.ch;
+                       WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap));
+                       up(&pblk->erase_sem);
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Prepare a write request on the recovery path. The number of sectors is
+ * taken from rqd->nr_ppas and the mapping always goes through pblk_map_rq().
+ *
+ * Returns 0 on success or a negative errno if an allocation fails. On
+ * failure the LUN bitmap is freed and c_ctx->lun_bitmap is reset to NULL.
+ */
+int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
+                       struct pblk_c_ctx *c_ctx)
+{
+       struct pblk_line_meta *lm = &pblk->lm;
+       unsigned long *lun_bitmap;
+       int ret;
+
+       lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
+       if (!lun_bitmap)
+               return -ENOMEM;
+
+       c_ctx->lun_bitmap = lun_bitmap;
+
+       ret = pblk_alloc_w_rq(pblk, rqd, rqd->nr_ppas);
+       if (ret) {
+               /* Release the bitmap here, as pblk_setup_w_rq() does, and do
+                * not leave a dangling pointer behind in the context.
+                */
+               c_ctx->lun_bitmap = NULL;
+               kfree(lun_bitmap);
+               return ret;
+       }
+
+       pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0);
+
+       rqd->ppa_status = (u64)0;
+       rqd->flags = pblk_set_progr_mode(pblk, WRITE);
+
+       return ret;
+}
+
+static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
+                                 unsigned int secs_to_flush)
+{
+       int secs_to_sync;
+
+       secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush);
+
+#ifdef CONFIG_NVM_DEBUG
+       if ((!secs_to_sync && secs_to_flush)
+                       || (secs_to_sync < 0)
+                       || (secs_to_sync > secs_avail && !secs_to_flush)) {
+               pr_err("pblk: bad sector calculation (a:%d,s:%d,f:%d)\n",
+                               secs_avail, secs_to_sync, secs_to_flush);
+       }
+#endif
+
+       return secs_to_sync;
+}
+/*
+ * Build one write request from the entries buffered in pblk->rwb and submit
+ * it to the device.
+ *
+ * Returns 0 when a request was submitted. Returns 1 when nothing was
+ * submitted: the buffer is empty, fewer than pblk->min_write_pgs entries are
+ * available while no flush is pending, or an allocation, setup or submission
+ * step failed. pblk_write_ts() retries immediately on 0 and sleeps on 1.
+ *
+ * NOTE(review): the failure paths taken after pblk_rb_read_commit() do not
+ * visibly undo the commit - confirm how those entries are recovered.
+ * NOTE(review): fail_free_bio passes secs_to_sync as 'off' to
+ * pblk_bio_free_pages(); presumably the padded pages start at
+ * c_ctx->nr_valid within the bio - verify against pblk_bio_free_pages().
+ */
+static int pblk_submit_write(struct pblk *pblk)
+{
+       struct bio *bio;
+       struct nvm_rq *rqd;
+       struct pblk_c_ctx *c_ctx;
+       unsigned int pgs_read;
+       unsigned int secs_avail, secs_to_sync, secs_to_com;
+       unsigned int secs_to_flush;
+       unsigned long pos;
+       int err;
+
+       /* If there are no sectors in the cache, flushes (bios without data)
+        * will be cleared on the cache threads
+        */
+       secs_avail = pblk_rb_read_count(&pblk->rwb);
+       if (!secs_avail)
+               return 1;
+
+       secs_to_flush = pblk_rb_sync_point_count(&pblk->rwb);
+       if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
+               return 1;
+
+       rqd = pblk_alloc_rqd(pblk, WRITE);
+       if (IS_ERR(rqd)) {
+               pr_err("pblk: cannot allocate write req.\n");
+               return 1;
+       }
+       c_ctx = nvm_rq_to_pdu(rqd);
+
+       bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs);
+       if (!bio) {
+               pr_err("pblk: cannot allocate write bio\n");
+               goto fail_free_rqd;
+       }
+       bio->bi_iter.bi_sector = 0; /* internal bio */
+       bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+       rqd->bio = bio;
+
+       secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
+       if (secs_to_sync > pblk->max_write_pgs) {
+               pr_err("pblk: bad buffer sync calculation\n");
+               goto fail_put_bio;
+       }
+       /* Never commit more entries than the buffer currently holds */
+       secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync;
+       pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
+
+       pgs_read = pblk_rb_read_to_bio(&pblk->rwb, bio, c_ctx, pos,
+                                               secs_to_sync, secs_avail);
+       if (!pgs_read) {
+               pr_err("pblk: corrupted write bio\n");
+               goto fail_put_bio;
+       }
+       /* Padded sectors need pages of their own added to the bio */
+       if (c_ctx->nr_padded)
+               if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, c_ctx->nr_padded))
+                       goto fail_put_bio;
+
+       /* Assign lbas to ppas and populate request structure */
+       err = pblk_setup_w_rq(pblk, rqd, c_ctx);
+       if (err) {
+               pr_err("pblk: could not setup write request\n");
+               goto fail_free_bio;
+       }
+
+       err = pblk_submit_io(pblk, rqd);
+       if (err) {
+               pr_err("pblk: I/O submission failed: %d\n", err);
+               goto fail_free_bio;
+       }
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_add(secs_to_sync, &pblk->sub_writes);
+#endif
+
+       return 0;
+
+fail_free_bio:
+       if (c_ctx->nr_padded)
+               pblk_bio_free_pages(pblk, bio, secs_to_sync, c_ctx->nr_padded);
+fail_put_bio:
+       bio_put(bio);
+fail_free_rqd:
+       pblk_free_rqd(pblk, rqd, WRITE);
+
+       return 1;
+}
+
+/*
+ * Main loop of the write thread. Keeps calling pblk_submit_write() until
+ * the thread is asked to stop; whenever nothing was submitted (non-zero
+ * return) the thread goes to interruptible sleep until it is woken up.
+ *
+ * data is the struct pblk instance. Always returns 0.
+ */
+int pblk_write_ts(void *data)
+{
+       struct pblk *pblk = data;
+
+       while (!kthread_should_stop()) {
+               if (pblk_submit_write(pblk)) {
+                       set_current_state(TASK_INTERRUPTIBLE);
+                       io_schedule();
+               }
+       }
+
+       return 0;
+}
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h

new file mode 100644 (file)

index 0000000..c82120c
--- /dev/null
+++ b/drivers/lightnvm/pblk.h
@@ -0,0 +1,1121 @@
+/*
+ * Copyright (C) 2015 IT University of Copenhagen (rrpc.h)
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Matias Bjorling <matias@cnexlabs.com>
+ * Write buffering: Javier Gonzalez <javier@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Implementation of a Physical Block-device target for Open-channel SSDs.
+ *
+ */
+
+#ifndef PBLK_H_
+#define PBLK_H_
+
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/bio.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/vmalloc.h>
+#include <linux/crc32.h>
+#include <linux/uuid.h>
+
+#include <linux/lightnvm.h>
+
+/* Run only GC if less than 1/X blocks are free */
+#define GC_LIMIT_INVERSE 5
+#define GC_TIME_MSECS 1000
+
+#define PBLK_SECTOR (512)
+#define PBLK_EXPOSED_PAGE_SIZE (4096)
+#define PBLK_MAX_REQ_ADDRS (64)
+#define PBLK_MAX_REQ_ADDRS_PW (6)
+
+#define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16)
+
+#define PBLK_COMMAND_TIMEOUT_MS 30000
+
+/* Max 512 LUNs per device */
+#define PBLK_MAX_LUNS_BITMAP (4)
+
+#define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR)
+/* Iterate over all LUNs of a pblk instance: i runs from 0 to
+ * (pblk)->nr_luns - 1 while rlun points at (pblk)->luns[i]. rlun is
+ * re-pointed after every increment of i, so on loop exit it holds the
+ * address of luns[nr_luns]; the macro itself never dereferences it.
+ */
+#define pblk_for_each_lun(pblk, rlun, i) \
+               for ((i) = 0, rlun = &(pblk)->luns[0]; \
+                       (i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)])
+
+#define ERASE 2 /* READ = 0, WRITE = 1 */
+/* pblk flag bits. The I/O types and the write buffer flags occupy distinct
+ * bits (0-5) of the same enum.
+ */
+enum {
+       /* IO Types */
+       PBLK_IOTYPE_USER        = 1 << 0,
+       PBLK_IOTYPE_GC          = 1 << 1,
+
+       /* Write buffer flags */
+       PBLK_FLUSH_ENTRY        = 1 << 2,
+       PBLK_WRITTEN_DATA       = 1 << 3,
+       PBLK_SUBMITTED_ENTRY    = 1 << 4,
+       PBLK_WRITABLE_ENTRY     = 1 << 5,
+};
+/* Block state values */
+enum {
+       PBLK_BLK_ST_OPEN =      0x1,
+       PBLK_BLK_ST_CLOSED =    0x2,
+};
+
+/* The number of GC lists and the rate-limiter states go together. This way the
+ * rate-limiter can dictate how much GC is needed based on resource utilization.
+ */
+#define PBLK_NR_GC_LISTS 3
+#define PBLK_MAX_GC_JOBS 32
+/* Rate-limiter states; presumably the values kept in struct pblk_rl's
+ * rb_state field - TODO confirm.
+ */
+enum {
+       PBLK_RL_HIGH = 1,
+       PBLK_RL_MID = 2,
+       PBLK_RL_LOW = 3,
+};
+/* Per-sector metadata entry. pblk_dma_meta_size below covers
+ * PBLK_MAX_REQ_ADDRS such entries; pblk_alloc_w_rq() uses it as the offset
+ * of a request's ppa list from its meta_list within one DMA allocation.
+ */
+struct pblk_sec_meta {
+       u64 reserved;
+       __le64 lba;
+};
+
+#define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS)
+
+/* Write completion context. It is obtained from a write request with
+ * nvm_rq_to_pdu() and is accounted for in pblk_w_rq_size.
+ */
+struct pblk_c_ctx {
+       struct list_head list;          /* Head for out-of-order completion */
+
+       unsigned long *lun_bitmap;      /* Luns used on current request */
+       unsigned int sentry;            /* Entry handed to pblk_map_rq() */
+       unsigned int nr_valid;          /* Valid sectors in the request */
+       unsigned int nr_padded;         /* Padded sectors in the request */
+};
+
+/* Read context. Accounted for, next to struct nvm_rq, in pblk_r_rq_size. */
+struct pblk_r_ctx {
+       struct bio *orig_bio;
+};
+
+/* Recovery context. ws_rec is a work item; presumably it is the one
+ * processed by pblk_submit_rec() - TODO confirm.
+ */
+struct pblk_rec_ctx {
+       struct pblk *pblk;
+       struct nvm_rq *rqd;
+       struct list_head failed;
+       struct work_struct ws_rec;
+};
+
+/* Write context. One is embedded in every struct pblk_rb_entry (w_ctx). */
+struct pblk_w_ctx {
+       struct bio_list bios;           /* Original bios - used for completion
+                                        * in REQ_FUA, REQ_FLUSH case
+                                        */
+       sector_t lba;                   /* Logic addr. associated with entry */
+       struct ppa_addr ppa;            /* Physic addr. associated with entry */
+       int flags;                      /* Write context flags */
+};
+/* Write buffer entry; struct pblk_rb's entries field points to these. */
+struct pblk_rb_entry {
+       struct ppa_addr cacheline;      /* Cacheline for this entry */
+       void *data;                     /* Pointer to data on this entry */
+       struct pblk_w_ctx w_ctx;        /* Context for this entry */
+       struct list_head index;         /* List head to enable indexes */
+};
+
+#define EMPTY_ENTRY (~0U)
+/* Set of pages with its allocation order; presumably linked through 'list'
+ * into struct pblk_rb's list of data pages - TODO confirm.
+ */
+struct pblk_rb_pages {
+       struct page *pages;
+       int order;
+       struct list_head list;
+};
+/* Write (ring) buffer. mem, subm, sync, sync_point and l2p_update are
+ * positions within the entries array; it is embedded in struct pblk as rwb.
+ */
+struct pblk_rb {
+       struct pblk_rb_entry *entries;  /* Ring buffer entries */
+       unsigned int mem;               /* Write offset - points to next
+                                        * writable entry in memory
+                                        */
+       unsigned int subm;              /* Read offset - points to last entry
+                                        * that has been submitted to the media
+                                        * to be persisted
+                                        */
+       unsigned int sync;              /* Synced - backpointer that signals
+                                        * the last submitted entry that has
+                                        * been successfully persisted to media
+                                        */
+       unsigned int sync_point;        /* Sync point - last entry that must be
+                                        * flushed to the media. Used with
+                                        * REQ_FLUSH and REQ_FUA
+                                        */
+       unsigned int l2p_update;        /* l2p update point - next entry for
+                                        * which l2p mapping will be updated to
+                                        * contain a device ppa address (instead
+                                        * of a cacheline)
+                                        */
+       unsigned int nr_entries;        /* Number of entries in write buffer -
+                                        * must be a power of two
+                                        */
+       unsigned int seg_size;          /* Size of the data segments being
+                                        * stored on each entry. Typically this
+                                        * will be 4KB
+                                        */
+
+       struct list_head pages;         /* List of data pages */
+
+       spinlock_t w_lock;              /* Write lock */
+       spinlock_t s_lock;              /* Sync lock */
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_t inflight_sync_point;   /* Not served REQ_FLUSH | REQ_FUA */
+#endif
+};
+
+#define PBLK_RECOVERY_SECTORS 16
+/* Per-LUN state; struct pblk's luns field points to an array of these. */
+struct pblk_lun {
+       struct ppa_addr bppa;
+
+       u8 *bb_list;                    /* Bad block list for LUN. Only used on
+                                        * bring up. Bad blocks are managed
+                                        * within lines on run-time.
+                                        */
+
+       struct semaphore wr_sem;
+};
+/* GC request. Its fields mirror the arguments of pblk_submit_read_gc()
+ * (line, data, lba_list, nr_secs, secs_to_gc).
+ */
+struct pblk_gc_rq {
+       struct pblk_line *line;
+       void *data;
+       u64 *lba_list;
+       int nr_secs;
+       int secs_to_gc;
+       struct list_head list;
+};
+/* Garbage collection state; embedded in struct pblk as gc. */
+struct pblk_gc {
+       int gc_active;
+       int gc_enabled;
+       int gc_forced;
+       int gc_jobs_active;
+       atomic_t inflight_gc;
+
+       struct task_struct *gc_ts;
+       struct task_struct *gc_writer_ts;
+       struct workqueue_struct *gc_reader_wq;
+       struct timer_list gc_timer;
+
+       int w_entries;
+       struct list_head w_list;
+
+       spinlock_t lock;
+       spinlock_t w_lock;
+};
+/* Rate limiter state; embedded in struct pblk as rl. */
+struct pblk_rl {
+       unsigned int high;      /* Upper threshold for rate limiter (free run -
+                                * user I/O rate limiter)
+                                */
+       unsigned int low;       /* Lower threshold for rate limiter (user I/O
+                                * rate limiter - stall)
+                                */
+       unsigned int high_pw;   /* High rounded up as a power of 2 */
+
+#define PBLK_USER_HIGH_THRS 2  /* Begin write limit at 50 percent
+                                * available blks
+                                */
+#define PBLK_USER_LOW_THRS 20  /* Aggressive GC at 5% available blocks */
+
+       int rb_windows_pw;      /* Number of rate windows in the write buffer
+                                * given as a power-of-2. This guarantees that
+                                * when user I/O is being rate limited, there
+                                * will be reserved enough space for the GC to
+                                * place its payload. A window is of
+                                * pblk->max_write_pgs size, which in NVMe is
+                                * 64, i.e., 256KB.
+                                */
+       int rb_budget;          /* Total number of entries available for I/O */
+       int rb_user_max;        /* Max buffer entries available for user I/O */
+       atomic_t rb_user_cnt;   /* User I/O buffer counter */
+       int rb_gc_max;          /* Max buffer entries available for GC I/O */
+       int rb_gc_rsv;          /* Reserved buffer entries for GC I/O */
+       int rb_state;           /* Rate-limiter current state */
+       atomic_t rb_gc_cnt;     /* GC I/O buffer counter */
+
+       int rb_user_active;
+       struct timer_list u_timer;
+
+       unsigned long long nr_secs;
+       unsigned long total_blocks;
+       atomic_t free_blocks;
+};
+
+#define PBLK_LINE_NR_LUN_BITMAP 2
+#define PBLK_LINE_NR_SEC_BITMAP 2
+#define PBLK_LINE_EMPTY (~0U)
+/* Values for the type, state and gc_group fields of struct pblk_line. The
+ * three groups use disjoint numeric ranges (0-2, 10-15 and 20-25).
+ */
+enum {
+       /* Line Types */
+       PBLK_LINETYPE_FREE = 0,
+       PBLK_LINETYPE_LOG = 1,
+       PBLK_LINETYPE_DATA = 2,
+
+       /* Line state */
+       PBLK_LINESTATE_FREE = 10,
+       PBLK_LINESTATE_OPEN = 11,
+       PBLK_LINESTATE_CLOSED = 12,
+       PBLK_LINESTATE_GC = 13,
+       PBLK_LINESTATE_BAD = 14,
+       PBLK_LINESTATE_CORRUPT = 15,
+
+       /* GC group */
+       PBLK_LINEGC_NONE = 20,
+       PBLK_LINEGC_EMPTY = 21,
+       PBLK_LINEGC_LOW = 22,
+       PBLK_LINEGC_MID = 23,
+       PBLK_LINEGC_HIGH = 24,
+       PBLK_LINEGC_FULL = 25,
+};
+
+#define PBLK_MAGIC 0x70626c6b /*pblk*/
+/* Header placed at the start of both struct line_smeta and struct
+ * line_emeta.
+ */
+struct line_header {
+       __le32 crc;
+       __le32 identifier;      /* pblk identifier */
+       __u8 uuid[16];          /* instance uuid */
+       __le16 type;            /* line type */
+       __le16 version;         /* type version */
+       __le32 id;              /* line id for current line */
+};
+/* Start-of-line metadata; referenced by struct pblk_line's smeta field. */
+struct line_smeta {
+       struct line_header header;
+
+       __le32 crc;             /* Full structure including struct crc */
+       /* Previous line metadata */
+       __le32 prev_id;         /* Line id for previous line */
+
+       /* Current line metadata */
+       __le64 seq_nr;          /* Sequence number for current line */
+
+       /* Active writers */
+       __le32 window_wr_lun;   /* Number of parallel LUNs to write */
+
+       __le32 rsvd[2];
+};
+
+/*
+ * End-of-line metadata; referenced by struct pblk_line's emeta field.
+ *
+ * Metadata Layout:
+ *     1. struct line_emeta
+ *     2. nr_lbas u64 forming lba list
+ *     3. nr_lines (all) u32 valid sector count (vsc) (~0U: non-alloc line)
+ *     4. nr_luns bits (u64 format) forming line bad block bitmap
+ *
+ *     3. and 4. will be part of FTL log
+ */
+struct line_emeta {
+       struct line_header header;
+
+       __le32 crc;             /* Full structure including struct crc */
+
+       /* Previous line metadata */
+       __le32 prev_id;         /* Line id for prev line */
+
+       /* Current line metadata */
+       __le64 seq_nr;          /* Sequence number for current line */
+
+       /* Active writers */
+       __le32 window_wr_lun;   /* Number of parallel LUNs to write */
+
+       /* Bookkeeping for recovery */
+       __le32 next_id;         /* Line id for next line */
+       __le64 nr_lbas;         /* Number of lbas mapped in line */
+       __le64 nr_valid_lbas;   /* Number of valid lbas mapped in line */
+};
+/* In-memory state of a line. struct pblk's lines field is the array of all
+ * lines; state, type and gc_group take the PBLK_LINE* enum values.
+ */
+struct pblk_line {
+       struct pblk *pblk;
+       unsigned int id;                /* Line number corresponds to the
+                                        * block line
+                                        */
+       unsigned int seq_nr;            /* Unique line sequence number */
+
+       int state;                      /* PBLK_LINESTATE_X */
+       int type;                       /* PBLK_LINETYPE_X */
+       int gc_group;                   /* PBLK_LINEGC_X */
+       struct list_head list;          /* Free, GC lists */
+
+       unsigned long *lun_bitmap;      /* Bitmap for LUNs mapped in line */
+
+       struct line_smeta *smeta;       /* Start metadata */
+       struct line_emeta *emeta;       /* End metadata */
+       int meta_line;                  /* Metadata line id */
+       u64 smeta_ssec;                 /* Sector where smeta starts */
+       u64 emeta_ssec;                 /* Sector where emeta starts */
+
+       unsigned int sec_in_line;       /* Number of usable secs in line */
+
+       unsigned int blk_in_line;       /* Number of good blocks in line */
+       unsigned long *blk_bitmap;      /* Bitmap for valid/invalid blocks */
+       unsigned long *erase_bitmap;    /* Bitmap for erased blocks */
+
+       unsigned long *map_bitmap;      /* Bitmap for mapped sectors in line */
+       unsigned long *invalid_bitmap;  /* Bitmap for invalid sectors in line */
+
+       int left_eblks;                 /* Blocks left for erasing */
+       atomic_t left_seblks;           /* Blocks left for sync erasing */
+
+       int left_msecs;                 /* Sectors left for mapping */
+       int left_ssecs;                 /* Sectors left to sync */
+       unsigned int cur_sec;           /* Sector map pointer */
+       unsigned int vsc;               /* Valid sector count in line */
+
+       struct kref ref;                /* Write buffer L2P references */
+
+       spinlock_t lock;                /* Necessary for invalid_bitmap only */
+};
+
+#define PBLK_DATA_LINES 2
+/* Metadata allocation types; see smeta_alloc_type and emeta_alloc_type in
+ * struct pblk_line_mgmt.
+ */
+enum{
+       PBLK_KMALLOC_META = 1,
+       PBLK_VMALLOC_META = 2,
+};
+/* Slot for one pre-allocated metadata buffer; used for the sline_meta and
+ * eline_meta arrays of struct pblk_line_mgmt.
+ */
+struct pblk_line_metadata {
+       void *meta;
+};
+/* Line management state; embedded in struct pblk as l_mg. */
+struct pblk_line_mgmt {
+       int nr_lines;                   /* Total number of full lines */
+       int nr_free_lines;              /* Number of full lines in free list */
+
+       /* Free lists - use free_lock */
+       struct list_head free_list;     /* Full lines ready to use */
+       struct list_head corrupt_list;  /* Full lines corrupted */
+       struct list_head bad_list;      /* Full lines bad */
+
+       /* GC lists - use gc_lock */
+       struct list_head *gc_lists[PBLK_NR_GC_LISTS];
+       struct list_head gc_high_list;  /* Full lines ready to GC, high isc */
+       struct list_head gc_mid_list;   /* Full lines ready to GC, mid isc */
+       struct list_head gc_low_list;   /* Full lines ready to GC, low isc */
+
+       struct list_head gc_full_list;  /* Full lines ready to GC, no valid */
+       struct list_head gc_empty_list; /* Full lines close, all valid */
+
+       struct pblk_line *log_line;     /* Current FTL log line */
+       struct pblk_line *data_line;    /* Current data line */
+       struct pblk_line *log_next;     /* Next FTL log line */
+       struct pblk_line *data_next;    /* Next data line */
+
+       /* Metadata allocation type: VMALLOC | KMALLOC */
+       int smeta_alloc_type;
+       int emeta_alloc_type;
+
+       /* Pre-allocated metadata for data lines */
+       struct pblk_line_metadata sline_meta[PBLK_DATA_LINES];
+       struct pblk_line_metadata eline_meta[PBLK_DATA_LINES];
+       unsigned long meta_bitmap;
+
+       /* Helpers for fast bitmap calculations */
+       unsigned long *bb_template;
+       unsigned long *bb_aux;
+
+       unsigned long d_seq_nr;         /* Data line unique sequence number */
+       unsigned long l_seq_nr;         /* Log line unique sequence number */
+
+       spinlock_t free_lock;
+       spinlock_t gc_lock;
+};
+/* Sizes and thresholds describing a line; embedded in struct pblk as lm.
+ * lun_bitmap_len is the size the write path passes to kzalloc() when it
+ * allocates a request's LUN bitmap.
+ */
+struct pblk_line_meta {
+       unsigned int smeta_len;         /* Total length for smeta */
+       unsigned int smeta_sec;         /* Sectors needed for smeta */
+       unsigned int emeta_len;         /* Total length for emeta */
+       unsigned int emeta_sec;         /* Sectors needed for emeta */
+       unsigned int emeta_bb;          /* Boundary for bb that affects emeta */
+       unsigned int sec_bitmap_len;    /* Length for sector bitmap in line */
+       unsigned int blk_bitmap_len;    /* Length for block bitmap in line */
+       unsigned int lun_bitmap_len;    /* Length for lun bitmap in line */
+
+       unsigned int blk_per_line;      /* Number of blocks in a full line */
+       unsigned int sec_per_line;      /* Number of sectors in a line */
+       unsigned int min_blk_line;      /* Min. number of good blocks in line */
+
+       unsigned int mid_thrs;          /* Threshold for GC mid list */
+       unsigned int high_thrs;         /* Threshold for GC high list */
+};
+/* One mask and one bit offset per address field (channel, LUN, plane, block,
+ * page, sector); embedded in struct pblk as ppaf. Presumably describes how
+ * a physical address is packed into a single value - TODO confirm.
+ */
+struct pblk_addr_format {
+       u64     ch_mask;
+       u64     lun_mask;
+       u64     pln_mask;
+       u64     blk_mask;
+       u64     pg_mask;
+       u64     sec_mask;
+       u8      ch_offset;
+       u8      lun_offset;
+       u8      pln_offset;
+       u8      blk_offset;
+       u8      pg_offset;
+       u8      sec_offset;
+};
+/* Top-level pblk state. The write path stores a pointer to it in
+ * rqd->private, and the write thread receives it as its data argument.
+ */
+struct pblk {
+       struct nvm_tgt_dev *dev;
+       struct gendisk *disk;
+
+       struct kobject kobj;
+
+       struct pblk_lun *luns;
+
+       struct pblk_line *lines;                /* Line array */
+       struct pblk_line_mgmt l_mg;             /* Line management */
+       struct pblk_line_meta lm;               /* Line metadata */
+
+       int ppaf_bitsize;
+       struct pblk_addr_format ppaf;
+
+       struct pblk_rb rwb;
+
+       int min_write_pgs; /* Minimum amount of pages required by controller */
+       int max_write_pgs; /* Maximum amount of pages supported by controller */
+       int pgs_in_buffer; /* Number of pages that need to be held in buffer to
+                           * guarantee successful reads.
+                           */
+
+       sector_t capacity; /* Device capacity when bad blocks are subtracted */
+       int over_pct;      /* Percentage of device used for over-provisioning */
+
+       /* pblk provisioning values. Used by rate limiter */
+       struct pblk_rl rl;
+
+       struct semaphore erase_sem;
+
+       unsigned char instance_uuid[16];
+#ifdef CONFIG_NVM_DEBUG
+       /* All debug counters apply to 4kb sector I/Os */
+       atomic_long_t inflight_writes;  /* Inflight writes (user and gc) */
+       atomic_long_t padded_writes;    /* Sectors padded due to flush/fua */
+       atomic_long_t padded_wb;        /* Sectors padded in write buffer */
+       atomic_long_t nr_flush;         /* Number of flush/fua I/O */
+       atomic_long_t req_writes;       /* Sectors stored on write buffer */
+       atomic_long_t sub_writes;       /* Sectors submitted from buffer */
+       atomic_long_t sync_writes;      /* Sectors synced to media */
+       atomic_long_t compl_writes;     /* Sectors completed in write bio */
+       atomic_long_t inflight_reads;   /* Inflight sector read requests */
+       atomic_long_t sync_reads;       /* Completed sector read requests */
+       atomic_long_t recov_writes;     /* Sectors submitted from recovery */
+       atomic_long_t recov_gc_writes;  /* Sectors submitted from write GC */
+       atomic_long_t recov_gc_reads;   /* Sectors submitted from read GC */
+#endif
+
+       spinlock_t lock;
+
+       atomic_long_t read_failed;
+       atomic_long_t read_empty;
+       atomic_long_t read_high_ecc;
+       atomic_long_t read_failed_gc;
+       atomic_long_t write_failed;
+       atomic_long_t erase_failed;
+
+       struct task_struct *writer_ts;
+
+       /* Simple translation map of logical addresses to physical addresses.
+        * The logical addresses are known by the host system, while the
+        * physical addresses are used when writing to the disk block device.
+        */
+       unsigned char *trans_map;
+       spinlock_t trans_lock;
+
+       struct list_head compl_list;
+
+       mempool_t *page_pool;
+       mempool_t *line_ws_pool;
+       mempool_t *rec_pool;
+       mempool_t *r_rq_pool;
+       mempool_t *w_rq_pool;
+       mempool_t *line_meta_pool;
+
+       struct workqueue_struct *kw_wq;
+       struct timer_list wtimer;
+
+       struct pblk_gc gc;
+};
+/* Context for a work item that operates on a line; presumably what
+ * pblk_line_run_ws() queues - TODO confirm.
+ */
+struct pblk_line_ws {
+       struct pblk *pblk;
+       struct pblk_line *line;
+       void *priv;
+       struct work_struct ws;
+};
+
+#define pblk_r_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_r_ctx))
+#define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx))
+
+/*
+ * pblk ring buffer operations
+ */
+int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
+                unsigned int power_size, unsigned int power_seg_sz);
+unsigned int pblk_rb_calculate_size(unsigned int nr_entries);
+void *pblk_rb_entries_ref(struct pblk_rb *rb);
+int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio,
+                          unsigned int nr_entries, unsigned int *pos);
+int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries,
+                        unsigned int *pos);
+void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
+                             struct pblk_w_ctx w_ctx, unsigned int pos);
+void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
+                           struct pblk_w_ctx w_ctx, struct pblk_line *gc_line,
+                           unsigned int pos);
+struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos);
+
+void pblk_rb_sync_l2p(struct pblk_rb *rb);
+unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio,
+                                struct pblk_c_ctx *c_ctx,
+                                unsigned int pos,
+                                unsigned int nr_entries,
+                                unsigned int count);
+unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
+                                     struct list_head *list,
+                                     unsigned int max);
+int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
+                       u64 pos, int bio_iter);
+unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries);
+
+unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags);
+unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries);
+struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb,
+                                             struct ppa_addr *ppa);
+void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags);
+unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb);
+
+unsigned int pblk_rb_read_count(struct pblk_rb *rb);
+unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos);
+
+int pblk_rb_tear_down_check(struct pblk_rb *rb);
+int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos);
+void pblk_rb_data_free(struct pblk_rb *rb);
+ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf);
+
+/*
+ * pblk core
+ */
+struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw);
+int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
+                       struct pblk_c_ctx *c_ctx);
+void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw);
+void pblk_flush_writer(struct pblk *pblk);
+struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba);
+void pblk_discard(struct pblk *pblk, struct bio *bio);
+void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd);
+void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd);
+int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd);
+struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
+                             unsigned int nr_secs, unsigned int len,
+                             gfp_t gfp_mask);
+struct pblk_line *pblk_line_get(struct pblk *pblk);
+struct pblk_line *pblk_line_get_first_data(struct pblk *pblk);
+struct pblk_line *pblk_line_replace_data(struct pblk *pblk);
+int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line);
+void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line);
+struct pblk_line *pblk_line_get_data(struct pblk *pblk);
+struct pblk_line *pblk_line_get_data_next(struct pblk *pblk);
+int pblk_line_erase(struct pblk *pblk, struct pblk_line *line);
+int pblk_line_is_full(struct pblk_line *line);
+void pblk_line_free(struct pblk *pblk, struct pblk_line *line);
+void pblk_line_close_ws(struct work_struct *work);
+void pblk_line_close(struct pblk *pblk, struct pblk_line *line);
+void pblk_line_mark_bb(struct work_struct *work);
+/* @work has the same work_struct callback signature as pblk_line_mark_bb() */
+void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
+                     void (*work)(struct work_struct *));
+u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line);
+int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line);
+int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line);
+int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa);
+/*
+ * Takes a struct kref; NOTE(review): presumably the release callback handed
+ * to kref_put() -- confirm at the call sites.
+ */
+void pblk_line_put(struct kref *ref);
+struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line);
+u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
+int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
+                  unsigned long secs_to_flush);
+/*
+ * NOTE(review): down/up look like an acquire/release pair over the addresses
+ * in @ppa_list, tracked through @lun_bitmap -- confirm in the definitions.
+ */
+void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+                 unsigned long *lun_bitmap);
+void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+               unsigned long *lun_bitmap);
+void pblk_end_bio_sync(struct bio *bio);
+void pblk_end_io_sync(struct nvm_rq *rqd);
+int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
+                      int nr_pages);
+void pblk_map_pad_invalidate(struct pblk *pblk, struct pblk_line *line,
+                            u64 paddr);
+void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
+                        int nr_pages);
+void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa);
+/*
+ * Map update and lookup entry points, keyed by logical block address.
+ * NOTE(review): presumably these operate on the table accessed through
+ * pblk_trans_map_get()/pblk_trans_map_set() further down -- confirm.
+ */
+void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa);
+void pblk_update_map_cache(struct pblk *pblk, sector_t lba,
+                          struct ppa_addr ppa);
+void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
+                        struct ppa_addr ppa, struct ppa_addr entry_line);
+int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
+                      struct pblk_line *gc_line);
+void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
+                         u64 *lba_list, int nr_secs);
+void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
+                        sector_t blba, int nr_secs);
+
+/*
+ * pblk user I/O write path
+ *
+ * Two entry points, both returning int: one takes a bio, the other a raw
+ * @data buffer with an LBA list and the line it belongs to (@gc_line).
+ * NOTE(review): the meaning of the return values is not visible in this
+ * header -- check the definitions before relying on it.
+ */
+int pblk_write_to_cache(struct pblk *pblk, struct bio *bio,
+                       unsigned long flags);
+int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list,
+                          unsigned int nr_entries, unsigned int nr_rec_entries,
+                          struct pblk_line *gc_line, unsigned long flags);
+
+/*
+ * pblk map
+ *
+ * Both functions take the request (@rqd), a starting entry (@sentry), a LUN
+ * bitmap and a valid-sector count. They differ in the last parameter:
+ * pblk_map_erase_rq() takes a pointer to an erase address, pblk_map_rq()
+ * takes an offset.
+ */
+void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
+                      unsigned int sentry, unsigned long *lun_bitmap,
+                      unsigned int valid_secs, struct ppa_addr *erase_ppa);
+void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
+                unsigned long *lun_bitmap, unsigned int valid_secs,
+                unsigned int off);
+
+/*
+ * pblk write thread
+ *
+ * NOTE(review): pblk_write_ts() has the int (*)(void *) shape of a kthread
+ * function and pblk_write_timer_fn() the void (*)(unsigned long) shape of a
+ * timer callback; presumably they are registered as such -- confirm at the
+ * registration site.
+ */
+int pblk_write_ts(void *data);
+void pblk_write_timer_fn(unsigned long data);
+void pblk_write_should_kick(struct pblk *pblk);
+
+/*
+ * pblk read path
+ *
+ * pblk_submit_read() is driven by a bio. pblk_submit_read_gc() is driven by
+ * an LBA list and a @data buffer for a given line, and reports a count
+ * through @secs_to_gc (NOTE(review): exact meaning not visible here).
+ */
+int pblk_submit_read(struct pblk *pblk, struct bio *bio);
+int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data,
+                       unsigned int nr_secs, unsigned int *secs_to_gc,
+                       struct pblk_line *line);
+/*
+ * pblk recovery
+ *
+ * pblk_submit_rec() uses the work_struct callback signature.
+ * pblk_recov_get_lba_list() returns little-endian 64-bit values (__le64),
+ * so callers must convert before using them as host integers.
+ */
+void pblk_submit_rec(struct work_struct *work);
+struct pblk_line *pblk_recov_l2p(struct pblk *pblk);
+void pblk_recov_pad(struct pblk *pblk);
+__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta);
+int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
+                       struct pblk_rec_ctx *recovery, u64 *comp_bits,
+                       unsigned int comp);
+
+/*
+ * pblk gc
+ *
+ * PBLK_GC_TRIES is a fixed count of 3; NOTE(review): its user is not visible
+ * in this header, presumably a retry limit -- confirm in the GC code.
+ */
+#define PBLK_GC_TRIES 3
+
+int pblk_gc_init(struct pblk *pblk);
+void pblk_gc_exit(struct pblk *pblk);
+void pblk_gc_should_start(struct pblk *pblk);
+void pblk_gc_should_stop(struct pblk *pblk);
+int pblk_gc_status(struct pblk *pblk);
+/* Reports two values through the @gc_enabled and @gc_active out-pointers */
+void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
+                             int *gc_active);
+void pblk_gc_sysfs_force(struct pblk *pblk, int force);
+
+/*
+ * pblk rate limiter
+ *
+ * All functions operate on a struct pblk_rl. User and GC traffic have
+ * separate *_may_insert()/*_in() pairs, while pblk_rl_out() takes both a
+ * user and a GC count.
+ * NOTE(review): presumably may_insert() is the admission check and in()/out()
+ * do the accounting -- confirm in the definitions.
+ */
+void pblk_rl_init(struct pblk_rl *rl, int budget);
+void pblk_rl_free(struct pblk_rl *rl);
+int pblk_rl_gc_thrs(struct pblk_rl *rl);
+unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl);
+int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries);
+void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries);
+int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries);
+void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries);
+void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc);
+void pblk_rl_set_gc_rsc(struct pblk_rl *rl, int rsv);
+int pblk_rl_sysfs_rate_show(struct pblk_rl *rl);
+void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line);
+void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line);
+
+/*
+ * pblk sysfs
+ *
+ * Init/exit pair keyed on a gendisk. pblk_sysfs_init() returns int
+ * (NOTE(review): presumably 0 or a negative errno -- confirm).
+ */
+int pblk_sysfs_init(struct gendisk *tdisk);
+void pblk_sysfs_exit(struct gendisk *tdisk);
+
+/*
+ * Allocate @size bytes. A PBLK_KMALLOC_META request is served by kmalloc()
+ * with @flags; any other @type is served by vmalloc(), in which case @flags
+ * is not used. Release the memory with pblk_mfree() and the same @type.
+ */
+static inline void *pblk_malloc(size_t size, int type, gfp_t flags)
+{
+       return (type == PBLK_KMALLOC_META) ? kmalloc(size, flags) :
+                                            vmalloc(size);
+}
+
+/*
+ * Free @ptr with the deallocator that matches @type: kfree() for
+ * PBLK_KMALLOC_META, vfree() for everything else.
+ */
+static inline void pblk_mfree(void *ptr, int type)
+{
+       if (type != PBLK_KMALLOC_META) {
+               vfree(ptr);
+               return;
+       }
+
+       kfree(ptr);
+}
+
+/*
+ * Step back sizeof(struct nvm_rq) bytes from @c_ctx and return that address
+ * as a request pointer.
+ * NOTE(review): this presumes the context is stored directly behind the
+ * request it belongs to -- confirm against the allocation site.
+ */
+static inline struct nvm_rq *nvm_rq_from_c_ctx(void *c_ctx)
+{
+       char *ctx_bytes = c_ctx;
+
+       return (struct nvm_rq *)(ctx_bytes - sizeof(struct nvm_rq));
+}
+
+/* Return the address of the first byte past the struct line_emeta at @emeta */
+static inline void *pblk_line_emeta_to_lbas(struct line_emeta *emeta)
+{
+       return &emeta[1];
+}
+
+#define NVM_MEM_PAGE_WRITE (8)
+
+/*
+ * Padding distance for this instance: NVM_MEM_PAGE_WRITE multiplied by the
+ * number of LUNs and by the sectors per plane of the target geometry.
+ */
+static inline int pblk_pad_distance(struct pblk *pblk)
+{
+       struct nvm_geo *geo = &pblk->dev->geo;
+
+       return NVM_MEM_PAGE_WRITE * geo->nr_luns * geo->sec_per_pl;
+}
+
+/* The line id of a device address is its block field */
+static inline int pblk_dev_ppa_to_line(struct ppa_addr p)
+{
+       int line_id = p.g.blk;
+
+       return line_id;
+}
+
+/* The line id of a target address is its block field */
+static inline int pblk_tgt_ppa_to_line(struct ppa_addr p)
+{
+       int line_id = p.g.blk;
+
+       return line_id;
+}
+
+/* Flatten (lun, ch) into one index: lun * nr_chnls + ch */
+static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
+{
+       return p.g.ch + p.g.lun * geo->nr_chnls;
+}
+
+/*
+ * A block within a line corresponds to the lun: the position is the
+ * channel-major index lun * nr_chnls + ch.
+ */
+static inline int pblk_dev_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
+{
+       return p.g.ch + p.g.lun * geo->nr_chnls;
+}
+
+/*
+ * Expand a 32-bit map entry into a struct ppa_addr.
+ *
+ *  - all-ones becomes ADDR_EMPTY
+ *  - bit 31 set: the low 31 bits are stored as the cache line and is_cached
+ *    is set
+ *  - otherwise each device coordinate is extracted with its pblk->ppaf
+ *    mask/offset pair
+ */
+static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32)
+{
+       struct ppa_addr ppa64;
+
+       ppa64.ppa = 0;
+
+       if (ppa32 == ~0U) {
+               ppa64.ppa = ADDR_EMPTY;
+               return ppa64;
+       }
+
+       if (ppa32 & (1U << 31)) {
+               ppa64.c.is_cached = 1;
+               ppa64.c.line = ppa32 & ((~0U) >> 1);
+               return ppa64;
+       }
+
+       ppa64.g.ch = (ppa32 & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset;
+       ppa64.g.lun = (ppa32 & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset;
+       ppa64.g.pl = (ppa32 & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset;
+       ppa64.g.blk = (ppa32 & pblk->ppaf.blk_mask) >> pblk->ppaf.blk_offset;
+       ppa64.g.pg = (ppa32 & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset;
+       ppa64.g.sec = (ppa32 & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset;
+
+       return ppa64;
+}
+
+/*
+ * Read the map entry for @lba. When the address format needs fewer than 32
+ * bits the table holds u32 entries that are expanded on the way out;
+ * otherwise it holds struct ppa_addr entries that are returned as is.
+ */
+static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk,
+                                                               sector_t lba)
+{
+       if (pblk->ppaf_bitsize >= 32) {
+               struct ppa_addr *map64 = (struct ppa_addr *)pblk->trans_map;
+
+               return map64[lba];
+       }
+
+       return pblk_ppa32_to_ppa64(pblk, ((u32 *)pblk->trans_map)[lba]);
+}
+
+/*
+ * Pack a struct ppa_addr into a 32-bit map entry.
+ *
+ *  - ADDR_EMPTY becomes all-ones
+ *  - a cached address becomes its cache line with bit 31 set
+ *  - a device address has each coordinate shifted to its pblk->ppaf offset
+ *    and OR-ed together
+ */
+static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64)
+{
+       u32 packed;
+
+       if (ppa64.ppa == ADDR_EMPTY)
+               return ~0U;
+
+       if (ppa64.c.is_cached) {
+               packed = 1U << 31;
+               packed |= ppa64.c.line;
+               return packed;
+       }
+
+       packed = 0;
+       packed |= ppa64.g.ch << pblk->ppaf.ch_offset;
+       packed |= ppa64.g.lun << pblk->ppaf.lun_offset;
+       packed |= ppa64.g.pl << pblk->ppaf.pln_offset;
+       packed |= ppa64.g.blk << pblk->ppaf.blk_offset;
+       packed |= ppa64.g.pg << pblk->ppaf.pg_offset;
+       packed |= ppa64.g.sec << pblk->ppaf.sec_offset;
+
+       return packed;
+}
+
+/*
+ * Store @ppa as the map entry for @lba. Entries are packed to u32 when the
+ * address format needs fewer than 32 bits; otherwise the raw 64-bit value is
+ * written.
+ */
+static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba,
+                                               struct ppa_addr ppa)
+{
+       if (pblk->ppaf_bitsize >= 32) {
+               u64 *map64 = (u64 *)pblk->trans_map;
+
+               map64[lba] = ppa.ppa;
+               return;
+       }
+
+       ((u32 *)pblk->trans_map)[lba] = pblk_ppa64_to_ppa32(pblk, ppa);
+}
+
+/*
+ * Build the in-line address of @p: every coordinate except the block is
+ * shifted to its pblk->ppaf offset and OR-ed into one u64.
+ */
+static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk,
+                                                       struct ppa_addr p)
+{
+       return ((u64)p.g.pg << pblk->ppaf.pg_offset) |
+              ((u64)p.g.lun << pblk->ppaf.lun_offset) |
+              ((u64)p.g.ch << pblk->ppaf.ch_offset) |
+              ((u64)p.g.pl << pblk->ppaf.pln_offset) |
+              ((u64)p.g.sec << pblk->ppaf.sec_offset);
+}
+
+/* Return 1 if @ppa_addr holds the ADDR_EMPTY marker, 0 otherwise */
+static inline int pblk_ppa_empty(struct ppa_addr ppa_addr)
+{
+       if (ppa_addr.ppa == ADDR_EMPTY)
+               return 1;
+
+       return 0;
+}
+
+/* Overwrite the raw value of @ppa_addr with the ADDR_EMPTY marker */
+static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr)
+{
+       ppa_addr->ppa = ADDR_EMPTY;
+}
+
+/*
+ * Return 1 if @ppa is not ADDR_EMPTY and has its is_cached bit set,
+ * 0 otherwise. The empty check comes first, before is_cached is read.
+ */
+static inline int pblk_addr_in_cache(struct ppa_addr ppa)
+{
+       if (ppa.ppa == ADDR_EMPTY)
+               return 0;
+
+       return ppa.c.is_cached ? 1 : 0;
+}
+
+/* Return the cache line field of @ppa, converted to int */
+static inline int pblk_addr_to_cacheline(struct ppa_addr ppa)
+{
+       int cacheline = ppa.c.line;
+
+       return cacheline;
+}
+
+/*
+ * Build a cached address for cache line @addr: the line field is set to
+ * @addr and is_cached is set to 1.
+ *
+ * The raw value is zeroed first, as addr_to_gen_ppa() and
+ * pblk_ppa32_to_ppa64() do, so that no bit of the returned value is left
+ * uninitialised whatever the layout of the bit-fields. Callers compare the
+ * raw .ppa word (see pblk_ppa_empty()), so every bit must be defined.
+ */
+static inline struct ppa_addr pblk_cacheline_to_addr(int addr)
+{
+       struct ppa_addr p;
+
+       p.ppa = 0;
+       p.c.line = addr;
+       p.c.is_cached = 1;
+
+       return p;
+}
+
+/*
+ * Build a device address from an in-line address and a line id: the block
+ * field is @line_id, and the remaining coordinates are extracted from @paddr
+ * with their pblk->ppaf mask/offset pairs. All other bits are zero.
+ */
+static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr,
+                                             u64 line_id)
+{
+       struct ppa_addr gen;
+
+       gen.ppa = 0;
+       gen.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset;
+       gen.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset;
+       gen.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset;
+       gen.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset;
+       gen.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset;
+       gen.g.blk = line_id;
+
+       return gen;
+}
+
+/* Same result as addr_to_gen_ppa(); this is a plain forwarding wrapper */
+static inline struct ppa_addr addr_to_pblk_ppa(struct pblk *pblk, u64 paddr,
+                                        u64 line_id)
+{
+       return addr_to_gen_ppa(pblk, paddr, line_id);
+}
+
+/*
+ * CRC32 (little-endian, seeded with all-ones) over the line header at the
+ * start of @smeta, skipping its first sizeof(u32) bytes.
+ * NOTE(review): the skipped word is presumably where this CRC is stored --
+ * confirm against struct line_header. @pblk is unused.
+ */
+static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk,
+                                           struct line_smeta *smeta)
+{
+       unsigned char *start = (unsigned char *)smeta + sizeof(u32);
+       size_t len = sizeof(struct line_header) - sizeof(u32);
+
+       return crc32_le(~(u32)0, start, len);
+}
+
+/*
+ * CRC32 (little-endian, seeded with all-ones) over the smeta area, starting
+ * sizeof(struct line_header) + sizeof(u32) bytes into @smeta and running to
+ * the end of the lm->smeta_len bytes.
+ */
+static inline u32 pblk_calc_smeta_crc(struct pblk *pblk,
+                                     struct line_smeta *smeta)
+{
+       size_t skip = sizeof(struct line_header) + sizeof(u32);
+       unsigned char *start = (unsigned char *)smeta + skip;
+
+       return crc32_le(~(u32)0, start, pblk->lm.smeta_len - skip);
+}
+
+/*
+ * CRC32 (little-endian, seeded with all-ones) over the emeta area, starting
+ * sizeof(struct line_header) + sizeof(u32) bytes into @emeta and running to
+ * the end of the lm->emeta_len bytes.
+ */
+static inline u32 pblk_calc_emeta_crc(struct pblk *pblk,
+                                     struct line_emeta *emeta)
+{
+       size_t skip = sizeof(struct line_header) + sizeof(u32);
+       unsigned char *start = (unsigned char *)emeta + skip;
+
+       return crc32_le(~(u32)0, start, pblk->lm.emeta_len - skip);
+}
+
+/*
+ * Compute request flags from the geometry: the plane mode shifted right by
+ * one, with NVM_IO_SCRAMBLE_ENABLE added when @type is WRITE.
+ */
+static inline int pblk_set_progr_mode(struct pblk *pblk, int type)
+{
+       int mode = pblk->dev->geo.plane_mode >> 1;
+
+       return (type == WRITE) ? (mode | NVM_IO_SCRAMBLE_ENABLE) : mode;
+}
+
+/*
+ * Read flags are fixed: single access, suspend and scrambling enabled.
+ * @pblk is unused.
+ */
+static inline int pblk_set_read_mode(struct pblk *pblk)
+{
+       int flags = NVM_IO_SNGL_ACCESS;
+
+       flags |= NVM_IO_SUSPEND;
+       flags |= NVM_IO_SCRAMBLE_ENABLE;
+
+       return flags;
+}
+
+#ifdef CONFIG_NVM_DEBUG
+/*
+ * Log @p at error level, tagged with @msg and @error (printed in hex).
+ * A cached address prints its cache line; a device address prints each
+ * coordinate.
+ */
+static inline void print_ppa(struct ppa_addr *p, char *msg, int error)
+{
+       if (!p->c.is_cached) {
+               pr_err("ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n",
+                       msg, error,
+                       p->g.ch, p->g.lun, p->g.blk,
+                       p->g.pg, p->g.pl, p->g.sec);
+               return;
+       }
+
+       pr_err("ppa: (%s: %x) cache line: %llu\n",
+                       msg, error, (u64)p->c.line);
+}
+
+/*
+ * Log the addresses of a failed request. A single-address request prints
+ * rqd->ppa_addr and returns. Otherwise every ppa_list entry whose bit is set
+ * in rqd->ppa_status is printed, followed by a summary line with @error and
+ * the status word.
+ */
+static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd,
+                                        int error)
+{
+       int bit;
+
+       if (rqd->nr_ppas == 1) {
+               print_ppa(&rqd->ppa_addr, "rqd", error);
+               return;
+       }
+
+       for (bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas, 0);
+            bit < rqd->nr_ppas;
+            bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas,
+                                               bit + 1))
+               print_ppa(&rqd->ppa_list[bit], "rqd", error);
+
+       pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status);
+}
+#endif
+
+static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
+                                      struct ppa_addr *ppas, int nr_ppas)
+{
+       struct nvm_geo *geo = &tgt_dev->geo;
+       struct ppa_addr *ppa;
+       int i;
+
+       for (i = 0; i < nr_ppas; i++) {
+               ppa = &ppas[i];
+
+               if (!ppa->c.is_cached &&
+                               ppa->g.ch < geo->nr_chnls &&
+                               ppa->g.lun < geo->luns_per_chnl &&
+                               ppa->g.pl < geo->nr_planes &&
+                               ppa->g.blk < geo->blks_per_lun &&
+                               ppa->g.pg < geo->pgs_per_blk &&
+                               ppa->g.sec < geo->sec_per_pg)
+                       continue;
+
+#ifdef CONFIG_NVM_DEBUG
+               print_ppa(ppa, "boundary", i);
+#endif
+               return 1;
+       }
+       return 0;
+}
+
+/*
+ * Return 1 when @paddr is greater than the number of sectors per line,
+ * 0 otherwise. A @paddr equal to sec_per_line is accepted.
+ */
+static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr)
+{
+       return (paddr > pblk->lm.sec_per_line) ? 1 : 0;
+}
+
+/* Return the bi_idx field of the bio's iterator */
+static inline unsigned int pblk_get_bi_idx(struct bio *bio)
+{
+       unsigned int idx = bio->bi_iter.bi_idx;
+
+       return idx;
+}
+
+/* Convert the bio's start sector into an LBA by dividing by NR_PHY_IN_LOG */
+static inline sector_t pblk_get_lba(struct bio *bio)
+{
+       sector_t lba = bio->bi_iter.bi_sector / NR_PHY_IN_LOG;
+
+       return lba;
+}
+
+/*
+ * Number of exposed pages covered by the bio: its remaining byte count
+ * divided by PBLK_EXPOSED_PAGE_SIZE (integer division).
+ */
+static inline unsigned int pblk_get_secs(struct bio *bio)
+{
+       unsigned int nr_secs = bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE;
+
+       return nr_secs;
+}
+
+/* Inverse of the division in pblk_get_lba(): multiply by NR_PHY_IN_LOG */
+static inline sector_t pblk_get_sector(sector_t lba)
+{
+       sector_t sector = lba * NR_PHY_IN_LOG;
+
+       return sector;
+}
+
+/*
+ * Generate a fresh little-endian UUID and copy its 16 bytes into
+ * pblk->instance_uuid.
+ */
+static inline void pblk_setup_uuid(struct pblk *pblk)
+{
+       uuid_le fresh;
+
+       uuid_le_gen(&fresh);
+       memcpy(pblk->instance_uuid, fresh.b, 16);
+}
+#endif /* PBLK_H_ */
author	Javier González <jg@lightnvm.io>
	Sat, 15 Apr 2017 18:55:50 +0000 (20:55 +0200)
committer	Jens Axboe <axboe@fb.com>
	Sun, 16 Apr 2017 16:06:33 +0000 (10:06 -0600)
Documentation/lightnvm/pblk.txt	[new file with mode: 0644]	patch \| blob
drivers/lightnvm/Kconfig		patch \| blob \| history
drivers/lightnvm/Makefile		patch \| blob \| history
drivers/lightnvm/pblk-cache.c	[new file with mode: 0644]	patch \| blob
drivers/lightnvm/pblk-core.c	[new file with mode: 0644]	patch \| blob
drivers/lightnvm/pblk-gc.c	[new file with mode: 0644]	patch \| blob
drivers/lightnvm/pblk-init.c	[new file with mode: 0644]	patch \| blob
drivers/lightnvm/pblk-map.c	[new file with mode: 0644]	patch \| blob
drivers/lightnvm/pblk-rb.c	[new file with mode: 0644]	patch \| blob
drivers/lightnvm/pblk-read.c	[new file with mode: 0644]	patch \| blob
drivers/lightnvm/pblk-recovery.c	[new file with mode: 0644]	patch \| blob
drivers/lightnvm/pblk-rl.c	[new file with mode: 0644]	patch \| blob
drivers/lightnvm/pblk-sysfs.c	[new file with mode: 0644]	patch \| blob
drivers/lightnvm/pblk-write.c	[new file with mode: 0644]	patch \| blob
drivers/lightnvm/pblk.h	[new file with mode: 0644]	patch \| blob