Merge branch 'for-4.11/block' into for-4.11/linus-merge
author     Jens Axboe <axboe@fb.com>
           Fri, 17 Feb 2017 21:06:45 +0000 (14:06 -0700)
committer  Jens Axboe <axboe@fb.com>
           Fri, 17 Feb 2017 21:06:45 +0000 (14:06 -0700)
Signed-off-by: Jens Axboe <axboe@fb.com>
68 files changed:
Documentation/cdrom/cdrom-standard.tex
MAINTAINERS
block/Kconfig
block/Kconfig.iosched
block/Makefile
block/bio.c
block/blk-cgroup.c
block/blk-core.c
block/blk-exec.c
block/blk-flush.c
block/blk-ioc.c
block/blk-merge.c
block/blk-mq-debugfs.c [new file with mode: 0644]
block/blk-mq-sched.c [new file with mode: 0644]
block/blk-mq-sched.h [new file with mode: 0644]
block/blk-mq-sysfs.c
block/blk-mq-tag.c
block/blk-mq-tag.h
block/blk-mq.c
block/blk-mq.h
block/blk-tag.c
block/blk-throttle.c
block/blk.h
block/cfq-iosched.c
block/deadline-iosched.c
block/elevator.c
block/mq-deadline.c [new file with mode: 0644]
block/noop-iosched.c
block/opal_proto.h [new file with mode: 0644]
block/partitions/efi.c
block/sed-opal.c [new file with mode: 0644]
drivers/block/cciss.c
drivers/block/cciss.h
drivers/block/floppy.c
drivers/block/loop.c
drivers/block/null_blk.c
drivers/block/paride/pcd.c
drivers/cdrom/cdrom.c
drivers/cdrom/gdrom.c
drivers/ide/ide-cd.c
drivers/lightnvm/Kconfig
drivers/lightnvm/Makefile
drivers/lightnvm/core.c
drivers/lightnvm/gennvm.c [deleted file]
drivers/lightnvm/gennvm.h [deleted file]
drivers/lightnvm/rrpc.c
drivers/lightnvm/rrpc.h
drivers/lightnvm/sysblk.c [deleted file]
drivers/md/bcache/request.c
drivers/md/dm-cache-target.c
drivers/md/dm-thin.c
drivers/nvme/host/core.c
drivers/nvme/host/lightnvm.c
drivers/nvme/host/nvme.h
drivers/nvme/host/pci.c
drivers/scsi/sr.c
include/linux/blk-mq.h
include/linux/blk_types.h
include/linux/blkdev.h
include/linux/cdrom.h
include/linux/elevator.h
include/linux/lightnvm.h
include/linux/nvme.h
include/linux/sbitmap.h
include/linux/sed-opal.h [new file with mode: 0644]
include/uapi/linux/lightnvm.h
include/uapi/linux/sed-opal.h [new file with mode: 0644]
lib/sbitmap.c

index c06233f..8f85b0e 100644 (file)
@@ -249,7 +249,6 @@ struct& cdrom_device_ops\ \{ \hidewidth\cr
         unsigned\ long);\cr
 \noalign{\medskip}
   &const\ int& capability;& capability flags \cr
-  &int& n_minors;& number of active minor devices \cr
 \};\cr
 }
 $$
@@ -258,13 +257,7 @@ it should add a function pointer to this $struct$. When a particular
 function is not implemented, however, this $struct$ should contain a
 NULL instead. The $capability$ flags specify the capabilities of the
 \cdrom\ hardware and/or low-level \cdrom\ driver when a \cdrom\ drive
-is registered with the \UCD. The value $n_minors$ should be a positive
-value indicating the number of minor devices that are supported by
-the low-level device driver, normally~1. Although these two variables
-are `informative' rather than `operational,' they are included in
-$cdrom_device_ops$ because they describe the capability of the {\em
-driver\/} rather than the {\em drive}. Nomenclature has always been
-difficult in computer programming.
+is registered with the \UCD.
 
 Note that most functions have fewer parameters than their
 $blkdev_fops$ counterparts. This is because very little of the
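
For illustration (not part of the patch): after this change a low-level driver simply omits n_minors when filling in its cdrom_device_ops. A minimal sketch with hypothetical mydrv_* callbacks:

    #include <linux/cdrom.h>

    /* Hypothetical callbacks, shown only to illustrate the structure. */
    static int mydrv_open(struct cdrom_device_info *cdi, int purpose)
    {
            return 0;
    }

    static void mydrv_release(struct cdrom_device_info *cdi)
    {
    }

    static struct cdrom_device_ops mydrv_dops = {
            .open           = mydrv_open,
            .release        = mydrv_release,
            /* operations that are not implemented simply stay NULL */
            .capability     = CDC_OPEN_TRAY | CDC_DRIVE_STATUS,
            /* .n_minors no longer exists after this series */
    };
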
index 527d137..864e1fd 100644 (file)
@@ -8612,10 +8612,10 @@ S:      Maintained
 F:     drivers/net/ethernet/netronome/
 
 NETWORK BLOCK DEVICE (NBD)
-M:     Markus Pargmann <mpa@pengutronix.de>
+M:     Josef Bacik <jbacik@fb.com>
 S:     Maintained
+L:     linux-block@vger.kernel.org
 L:     nbd-general@lists.sourceforge.net
-T:     git git://git.pengutronix.de/git/mpa/linux-nbd.git
 F:     Documentation/blockdev/nbd.txt
 F:     drivers/block/nbd.c
 F:     include/uapi/linux/nbd.h
@@ -11089,6 +11089,17 @@ L:     linux-mmc@vger.kernel.org
 S:     Maintained
 F:     drivers/mmc/host/sdhci-spear.c
 
+SECURE ENCRYPTING DEVICE (SED) OPAL DRIVER
+M:     Scott Bauer <scott.bauer@intel.com>
+M:     Jonathan Derrick <jonathan.derrick@intel.com>
+M:     Rafael Antognolli <rafael.antognolli@intel.com>
+L:     linux-block@vger.kernel.org
+S:     Supported
+F:     block/sed*
+F:     block/opal_proto.h
+F:     include/linux/sed*
+F:     include/uapi/linux/sed*
+
 SECURITY SUBSYSTEM
 M:     James Morris <james.l.morris@oracle.com>
 M:     "Serge E. Hallyn" <serge@hallyn.com>
index 8bf114a..1aef809 100644 (file)
@@ -147,6 +147,25 @@ config BLK_WBT_MQ
        Multiqueue currently doesn't have support for IO scheduling,
        enabling this option is recommended.
 
+config BLK_DEBUG_FS
+       bool "Block layer debugging information in debugfs"
+       default y
+       depends on DEBUG_FS
+       ---help---
+       Include block layer debugging information in debugfs. This information
+       is mostly useful for kernel developers, but it doesn't incur any cost
+       at runtime.
+
+       Unless you are building a kernel for a tiny system, you should
+       say Y here.
+
+config BLK_SED_OPAL
+       bool "Logic for interfacing with Opal enabled SEDs"
+       ---help---
+       Builds logic for interfacing with Opal-enabled controllers.
+       Enabling this option allows users to set up, lock, and unlock
+       locking ranges on SED devices using the Opal protocol.
+
 menu "Partition Types"
 
 source "block/partitions/Kconfig"
index 421bef9..0715ce9 100644 (file)
@@ -63,6 +63,56 @@ config DEFAULT_IOSCHED
        default "cfq" if DEFAULT_CFQ
        default "noop" if DEFAULT_NOOP
 
+config MQ_IOSCHED_DEADLINE
+       tristate "MQ deadline I/O scheduler"
+       default y
+       ---help---
+         MQ version of the deadline IO scheduler.
+
+config MQ_IOSCHED_NONE
+       bool
+       default y
+
+choice
+       prompt "Default single-queue blk-mq I/O scheduler"
+       default DEFAULT_SQ_NONE
+       help
+         Select the I/O scheduler which will be used by default for blk-mq
+         managed block devices with a single queue.
+
+       config DEFAULT_SQ_DEADLINE
+               bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y
+
+       config DEFAULT_SQ_NONE
+               bool "None"
+
+endchoice
+
+config DEFAULT_SQ_IOSCHED
+       string
+       default "mq-deadline" if DEFAULT_SQ_DEADLINE
+       default "none" if DEFAULT_SQ_NONE
+
+choice
+       prompt "Default multi-queue blk-mq I/O scheduler"
+       default DEFAULT_MQ_NONE
+       help
+         Select the I/O scheduler which will be used by default for blk-mq
+         managed block devices with multiple queues.
+
+       config DEFAULT_MQ_DEADLINE
+               bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y
+
+       config DEFAULT_MQ_NONE
+               bool "None"
+
+endchoice
+
+config DEFAULT_MQ_IOSCHED
+       string
+       default "mq-deadline" if DEFAULT_MQ_DEADLINE
+       default "none" if DEFAULT_MQ_NONE
+
 endmenu
 
 endif
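
For illustration: the DEFAULT_SQ_IOSCHED and DEFAULT_MQ_IOSCHED strings above are consumed when a queue initializes its elevator. The elevator.c hunk is not included in this excerpt; the selection it implies is roughly the following sketch (the helper name is illustrative):

    /* Sketch of the default-scheduler selection implied by the Kconfig
     * symbols above; something of this shape runs from elevator_init()
     * when no scheduler name is given explicitly. */
    static struct elevator_type *default_elevator(struct request_queue *q)
    {
            if (q->mq_ops && q->nr_hw_queues == 1)
                    return elevator_get(CONFIG_DEFAULT_SQ_IOSCHED, false);
            if (q->mq_ops)
                    return elevator_get(CONFIG_DEFAULT_MQ_IOSCHED, false);
            return elevator_get(CONFIG_DEFAULT_IOSCHED, false);
    }
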
index a827f98..6ba1b1b 100644 (file)
@@ -6,7 +6,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
                        blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
                        blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
                        blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
-                       blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \
+                       blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
                        genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
                        badblocks.o partitions/
 
@@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)      += blk-throttle.o
 obj-$(CONFIG_IOSCHED_NOOP)     += noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)      += cfq-iosched.o
+obj-$(CONFIG_MQ_IOSCHED_DEADLINE)      += mq-deadline.o
 
 obj-$(CONFIG_BLOCK_COMPAT)     += compat_ioctl.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)       += cmdline-parser.o
@@ -25,3 +26,5 @@ obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o
 obj-$(CONFIG_BLK_MQ_PCI)       += blk-mq-pci.o
 obj-$(CONFIG_BLK_DEV_ZONED)    += blk-zoned.o
 obj-$(CONFIG_BLK_WBT)          += blk-wbt.o
+obj-$(CONFIG_BLK_DEBUG_FS)     += blk-mq-debugfs.o
+obj-$(CONFIG_BLK_SED_OPAL)     += sed-opal.o
index 2b37502..d3c26d1 100644 (file)
@@ -1403,7 +1403,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
        bio_set_flag(bio, BIO_USER_MAPPED);
 
        /*
-        * subtle -- if __bio_map_user() ended up bouncing a bio,
+        * subtle -- if bio_map_user_iov() ended up bouncing a bio,
         * it would normally disappear when its bi_end_io is run.
         * however, we need it for the unmap, so grab an extra
         * reference to it
@@ -1445,8 +1445,8 @@ static void __bio_unmap_user(struct bio *bio)
  *     bio_unmap_user  -       unmap a bio
  *     @bio:           the bio being unmapped
  *
- *     Unmap a bio previously mapped by bio_map_user(). Must be called with
- *     process context.
+ *     Unmap a bio previously mapped by bio_map_user_iov(). Must be called from
+ *     process context.
  *
  *     bio_unmap_user() may sleep.
  */
index 8ba0af7..fb59a3e 100644 (file)
@@ -1223,7 +1223,10 @@ int blkcg_activate_policy(struct request_queue *q,
        if (blkcg_policy_enabled(q, pol))
                return 0;
 
-       blk_queue_bypass_start(q);
+       if (q->mq_ops)
+               blk_mq_freeze_queue(q);
+       else
+               blk_queue_bypass_start(q);
 pd_prealloc:
        if (!pd_prealloc) {
                pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
@@ -1261,7 +1264,10 @@ pd_prealloc:
 
        spin_unlock_irq(q->queue_lock);
 out_bypass_end:
-       blk_queue_bypass_end(q);
+       if (q->mq_ops)
+               blk_mq_unfreeze_queue(q);
+       else
+               blk_queue_bypass_end(q);
        if (pd_prealloc)
                pol->pd_free_fn(pd_prealloc);
        return ret;
@@ -1284,7 +1290,11 @@ void blkcg_deactivate_policy(struct request_queue *q,
        if (!blkcg_policy_enabled(q, pol))
                return;
 
-       blk_queue_bypass_start(q);
+       if (q->mq_ops)
+               blk_mq_freeze_queue(q);
+       else
+               blk_queue_bypass_start(q);
+
        spin_lock_irq(q->queue_lock);
 
        __clear_bit(pol->plid, q->blkcg_pols);
@@ -1304,7 +1314,11 @@ void blkcg_deactivate_policy(struct request_queue *q,
        }
 
        spin_unlock_irq(q->queue_lock);
-       blk_queue_bypass_end(q);
+
+       if (q->mq_ops)
+               blk_mq_unfreeze_queue(q);
+       else
+               blk_queue_bypass_end(q);
 }
 EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
 
index 61ba08c..b2df55a 100644 (file)
@@ -39,6 +39,7 @@
 
 #include "blk.h"
 #include "blk-mq.h"
+#include "blk-mq-sched.h"
 #include "blk-wbt.h"
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
@@ -134,6 +135,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
        rq->cmd = rq->__cmd;
        rq->cmd_len = BLK_MAX_CDB;
        rq->tag = -1;
+       rq->internal_tag = -1;
        rq->start_time = jiffies;
        set_start_time_ns(rq);
        rq->part = NULL;
@@ -525,12 +527,14 @@ void blk_set_queue_dying(struct request_queue *q)
        else {
                struct request_list *rl;
 
+               spin_lock_irq(q->queue_lock);
                blk_queue_for_each_rl(rl, q) {
                        if (rl->rq_pool) {
                                wake_up(&rl->wait[BLK_RW_SYNC]);
                                wake_up(&rl->wait[BLK_RW_ASYNC]);
                        }
                }
+               spin_unlock_irq(q->queue_lock);
        }
 }
 EXPORT_SYMBOL_GPL(blk_set_queue_dying);
@@ -1033,29 +1037,13 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
         * Flush requests do not use the elevator so skip initialization.
         * This allows a request to share the flush and elevator data.
         */
-       if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA))
+       if (op_is_flush(bio->bi_opf))
                return false;
 
        return true;
 }
 
 /**
- * rq_ioc - determine io_context for request allocation
- * @bio: request being allocated is for this bio (can be %NULL)
- *
- * Determine io_context to use for request allocation for @bio.  May return
- * %NULL if %current->io_context doesn't exist.
- */
-static struct io_context *rq_ioc(struct bio *bio)
-{
-#ifdef CONFIG_BLK_CGROUP
-       if (bio && bio->bi_ioc)
-               return bio->bi_ioc;
-#endif
-       return current->io_context;
-}
-
-/**
  * __get_request - get a free request
  * @rl: request list to allocate from
  * @op: operation and flags
@@ -1655,7 +1643,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
                return BLK_QC_T_NONE;
        }
 
-       if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) {
+       if (op_is_flush(bio->bi_opf)) {
                spin_lock_irq(q->queue_lock);
                where = ELEVATOR_INSERT_FLUSH;
                goto get_rq;
@@ -1894,7 +1882,7 @@ generic_make_request_checks(struct bio *bio)
         * drivers without flush support don't have to worry
         * about them.
         */
-       if ((bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) &&
+       if (op_is_flush(bio->bi_opf) &&
            !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
                bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
                if (!nr_sectors) {
@@ -2143,7 +2131,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
        if (q->mq_ops) {
                if (blk_queue_io_stat(q))
                        blk_account_io_start(rq, true);
-               blk_mq_insert_request(rq, false, true, false);
+               blk_mq_sched_insert_request(rq, false, true, false, false);
                return 0;
        }
 
@@ -2159,7 +2147,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
         */
        BUG_ON(blk_queued_rq(rq));
 
-       if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA))
+       if (op_is_flush(rq->cmd_flags))
                where = ELEVATOR_INSERT_FLUSH;
 
        add_acct_request(q, rq, where);
@@ -3270,7 +3258,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
                /*
                 * rq is already accounted, so use raw insert
                 */
-               if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA))
+               if (op_is_flush(rq->cmd_flags))
                        __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
                else
                        __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
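
For reference, the op_is_flush() helper used in the hunks above comes from the include/linux/blk_types.h change elsewhere in this merge (that hunk is not shown here); its expected shape is simply:

    /* Expected definition of the helper replacing the open-coded
     * (REQ_PREFLUSH | REQ_FUA) tests above. */
    static inline bool op_is_flush(unsigned int op)
    {
            return op & (REQ_FUA | REQ_PREFLUSH);
    }
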
index 3ecb00a..ed1f101 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/sched/sysctl.h>
 
 #include "blk.h"
+#include "blk-mq-sched.h"
 
 /*
  * for max sense size
@@ -65,7 +66,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
         * be reused after dying flag is set
         */
        if (q->mq_ops) {
-               blk_mq_insert_request(rq, at_head, true, false);
+               blk_mq_sched_insert_request(rq, at_head, true, false, false);
                return;
        }
 
index 20b7c7a..4427896 100644 (file)
@@ -74,6 +74,7 @@
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
 
 /* FLUSH/FUA sequences */
 enum {
@@ -391,9 +392,10 @@ static void mq_flush_data_end_io(struct request *rq, int error)
         * the comment in flush_end_io().
         */
        spin_lock_irqsave(&fq->mq_flush_lock, flags);
-       if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error))
-               blk_mq_run_hw_queue(hctx, true);
+       blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error);
        spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
+
+       blk_mq_run_hw_queue(hctx, true);
 }
 
 /**
@@ -453,9 +455,9 @@ void blk_insert_flush(struct request *rq)
         */
        if ((policy & REQ_FSEQ_DATA) &&
            !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
-               if (q->mq_ops) {
-                       blk_mq_insert_request(rq, false, true, false);
-               } else
+               if (q->mq_ops)
+                       blk_mq_sched_insert_request(rq, false, true, false, false);
+               else
                        list_add_tail(&rq->queuelist, &q->queue_head);
                return;
        }
index 381cb50..fe186a9 100644 (file)
@@ -43,8 +43,10 @@ static void ioc_exit_icq(struct io_cq *icq)
        if (icq->flags & ICQ_EXITED)
                return;
 
-       if (et->ops.elevator_exit_icq_fn)
-               et->ops.elevator_exit_icq_fn(icq);
+       if (et->uses_mq && et->ops.mq.exit_icq)
+               et->ops.mq.exit_icq(icq);
+       else if (!et->uses_mq && et->ops.sq.elevator_exit_icq_fn)
+               et->ops.sq.elevator_exit_icq_fn(icq);
 
        icq->flags |= ICQ_EXITED;
 }
@@ -383,8 +385,10 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
        if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
                hlist_add_head(&icq->ioc_node, &ioc->icq_list);
                list_add(&icq->q_node, &q->icq_list);
-               if (et->ops.elevator_init_icq_fn)
-                       et->ops.elevator_init_icq_fn(icq);
+               if (et->uses_mq && et->ops.mq.init_icq)
+                       et->ops.mq.init_icq(icq);
+               else if (!et->uses_mq && et->ops.sq.elevator_init_icq_fn)
+                       et->ops.sq.elevator_init_icq_fn(icq);
        } else {
                kmem_cache_free(et->icq_cache, icq);
                icq = ioc_lookup_icq(ioc, q);
index 182398c..6aa43de 100644 (file)
@@ -763,8 +763,8 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
 {
        struct elevator_queue *e = q->elevator;
 
-       if (e->type->ops.elevator_allow_rq_merge_fn)
-               if (!e->type->ops.elevator_allow_rq_merge_fn(q, rq, next))
+       if (!e->uses_mq && e->type->ops.sq.elevator_allow_rq_merge_fn)
+               if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next))
                        return 0;
 
        return attempt_merge(q, rq, next);
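
The et->uses_mq checks and the ops.sq/ops.mq accessors in the blk-ioc.c and blk-merge.c hunks above rely on elevator_type carrying both a legacy and a blk-mq operations table. The include/linux/elevator.h hunk is not part of this excerpt; the layout it implies is roughly:

    /* Rough shape of struct elevator_type implied by the accesses above. */
    struct elevator_type {
            struct kmem_cache *icq_cache;
            union {
                    struct elevator_ops sq;         /* legacy request_fn path */
                    struct elevator_mq_ops mq;      /* blk-mq path */
            } ops;
            /* ... remaining members unchanged ... */
            bool uses_mq;
    };
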
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
new file mode 100644 (file)
index 0000000..5cd2b43
--- /dev/null
@@ -0,0 +1,756 @@
+/*
+ * Copyright (C) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/debugfs.h>
+
+#include <linux/blk-mq.h>
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+
+struct blk_mq_debugfs_attr {
+       const char *name;
+       umode_t mode;
+       const struct file_operations *fops;
+};
+
+static struct dentry *block_debugfs_root;
+
+static int blk_mq_debugfs_seq_open(struct inode *inode, struct file *file,
+                                  const struct seq_operations *ops)
+{
+       struct seq_file *m;
+       int ret;
+
+       ret = seq_open(file, ops);
+       if (!ret) {
+               m = file->private_data;
+               m->private = inode->i_private;
+       }
+       return ret;
+}
+
+static int hctx_state_show(struct seq_file *m, void *v)
+{
+       struct blk_mq_hw_ctx *hctx = m->private;
+
+       seq_printf(m, "0x%lx\n", hctx->state);
+       return 0;
+}
+
+static int hctx_state_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, hctx_state_show, inode->i_private);
+}
+
+static const struct file_operations hctx_state_fops = {
+       .open           = hctx_state_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static int hctx_flags_show(struct seq_file *m, void *v)
+{
+       struct blk_mq_hw_ctx *hctx = m->private;
+
+       seq_printf(m, "0x%lx\n", hctx->flags);
+       return 0;
+}
+
+static int hctx_flags_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, hctx_flags_show, inode->i_private);
+}
+
+static const struct file_operations hctx_flags_fops = {
+       .open           = hctx_flags_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v)
+{
+       struct request *rq = list_entry_rq(v);
+
+       seq_printf(m, "%p {.cmd_type=%u, .cmd_flags=0x%x, .rq_flags=0x%x, .tag=%d, .internal_tag=%d}\n",
+                  rq, rq->cmd_type, rq->cmd_flags, (unsigned int)rq->rq_flags,
+                  rq->tag, rq->internal_tag);
+       return 0;
+}
+
+static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos)
+{
+       struct blk_mq_hw_ctx *hctx = m->private;
+
+       spin_lock(&hctx->lock);
+       return seq_list_start(&hctx->dispatch, *pos);
+}
+
+static void *hctx_dispatch_next(struct seq_file *m, void *v, loff_t *pos)
+{
+       struct blk_mq_hw_ctx *hctx = m->private;
+
+       return seq_list_next(v, &hctx->dispatch, pos);
+}
+
+static void hctx_dispatch_stop(struct seq_file *m, void *v)
+{
+       struct blk_mq_hw_ctx *hctx = m->private;
+
+       spin_unlock(&hctx->lock);
+}
+
+static const struct seq_operations hctx_dispatch_seq_ops = {
+       .start  = hctx_dispatch_start,
+       .next   = hctx_dispatch_next,
+       .stop   = hctx_dispatch_stop,
+       .show   = blk_mq_debugfs_rq_show,
+};
+
+static int hctx_dispatch_open(struct inode *inode, struct file *file)
+{
+       return blk_mq_debugfs_seq_open(inode, file, &hctx_dispatch_seq_ops);
+}
+
+static const struct file_operations hctx_dispatch_fops = {
+       .open           = hctx_dispatch_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release,
+};
+
+static int hctx_ctx_map_show(struct seq_file *m, void *v)
+{
+       struct blk_mq_hw_ctx *hctx = m->private;
+
+       sbitmap_bitmap_show(&hctx->ctx_map, m);
+       return 0;
+}
+
+static int hctx_ctx_map_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, hctx_ctx_map_show, inode->i_private);
+}
+
+static const struct file_operations hctx_ctx_map_fops = {
+       .open           = hctx_ctx_map_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static void blk_mq_debugfs_tags_show(struct seq_file *m,
+                                    struct blk_mq_tags *tags)
+{
+       seq_printf(m, "nr_tags=%u\n", tags->nr_tags);
+       seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags);
+       seq_printf(m, "active_queues=%d\n",
+                  atomic_read(&tags->active_queues));
+
+       seq_puts(m, "\nbitmap_tags:\n");
+       sbitmap_queue_show(&tags->bitmap_tags, m);
+
+       if (tags->nr_reserved_tags) {
+               seq_puts(m, "\nbreserved_tags:\n");
+               sbitmap_queue_show(&tags->breserved_tags, m);
+       }
+}
+
+static int hctx_tags_show(struct seq_file *m, void *v)
+{
+       struct blk_mq_hw_ctx *hctx = m->private;
+       struct request_queue *q = hctx->queue;
+
+       mutex_lock(&q->sysfs_lock);
+       if (hctx->tags)
+               blk_mq_debugfs_tags_show(m, hctx->tags);
+       mutex_unlock(&q->sysfs_lock);
+
+       return 0;
+}
+
+static int hctx_tags_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, hctx_tags_show, inode->i_private);
+}
+
+static const struct file_operations hctx_tags_fops = {
+       .open           = hctx_tags_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static int hctx_tags_bitmap_show(struct seq_file *m, void *v)
+{
+       struct blk_mq_hw_ctx *hctx = m->private;
+       struct request_queue *q = hctx->queue;
+
+       mutex_lock(&q->sysfs_lock);
+       if (hctx->tags)
+               sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m);
+       mutex_unlock(&q->sysfs_lock);
+       return 0;
+}
+
+static int hctx_tags_bitmap_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, hctx_tags_bitmap_show, inode->i_private);
+}
+
+static const struct file_operations hctx_tags_bitmap_fops = {
+       .open           = hctx_tags_bitmap_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static int hctx_sched_tags_show(struct seq_file *m, void *v)
+{
+       struct blk_mq_hw_ctx *hctx = m->private;
+       struct request_queue *q = hctx->queue;
+
+       mutex_lock(&q->sysfs_lock);
+       if (hctx->sched_tags)
+               blk_mq_debugfs_tags_show(m, hctx->sched_tags);
+       mutex_unlock(&q->sysfs_lock);
+
+       return 0;
+}
+
+static int hctx_sched_tags_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, hctx_sched_tags_show, inode->i_private);
+}
+
+static const struct file_operations hctx_sched_tags_fops = {
+       .open           = hctx_sched_tags_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static int hctx_sched_tags_bitmap_show(struct seq_file *m, void *v)
+{
+       struct blk_mq_hw_ctx *hctx = m->private;
+       struct request_queue *q = hctx->queue;
+
+       mutex_lock(&q->sysfs_lock);
+       if (hctx->sched_tags)
+               sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m);
+       mutex_unlock(&q->sysfs_lock);
+       return 0;
+}
+
+static int hctx_sched_tags_bitmap_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, hctx_sched_tags_bitmap_show, inode->i_private);
+}
+
+static const struct file_operations hctx_sched_tags_bitmap_fops = {
+       .open           = hctx_sched_tags_bitmap_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static int hctx_io_poll_show(struct seq_file *m, void *v)
+{
+       struct blk_mq_hw_ctx *hctx = m->private;
+
+       seq_printf(m, "considered=%lu\n", hctx->poll_considered);
+       seq_printf(m, "invoked=%lu\n", hctx->poll_invoked);
+       seq_printf(m, "success=%lu\n", hctx->poll_success);
+       return 0;
+}
+
+static int hctx_io_poll_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, hctx_io_poll_show, inode->i_private);
+}
+
+static ssize_t hctx_io_poll_write(struct file *file, const char __user *buf,
+                                 size_t count, loff_t *ppos)
+{
+       struct seq_file *m = file->private_data;
+       struct blk_mq_hw_ctx *hctx = m->private;
+
+       hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0;
+       return count;
+}
+
+static const struct file_operations hctx_io_poll_fops = {
+       .open           = hctx_io_poll_open,
+       .read           = seq_read,
+       .write          = hctx_io_poll_write,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
+{
+       seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu",
+                  stat->nr_samples, stat->mean, stat->min, stat->max);
+}
+
+static int hctx_stats_show(struct seq_file *m, void *v)
+{
+       struct blk_mq_hw_ctx *hctx = m->private;
+       struct blk_rq_stat stat[2];
+
+       blk_stat_init(&stat[BLK_STAT_READ]);
+       blk_stat_init(&stat[BLK_STAT_WRITE]);
+
+       blk_hctx_stat_get(hctx, stat);
+
+       seq_puts(m, "read: ");
+       print_stat(m, &stat[BLK_STAT_READ]);
+       seq_puts(m, "\n");
+
+       seq_puts(m, "write: ");
+       print_stat(m, &stat[BLK_STAT_WRITE]);
+       seq_puts(m, "\n");
+       return 0;
+}
+
+static int hctx_stats_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, hctx_stats_show, inode->i_private);
+}
+
+static ssize_t hctx_stats_write(struct file *file, const char __user *buf,
+                               size_t count, loff_t *ppos)
+{
+       struct seq_file *m = file->private_data;
+       struct blk_mq_hw_ctx *hctx = m->private;
+       struct blk_mq_ctx *ctx;
+       int i;
+
+       hctx_for_each_ctx(hctx, ctx, i) {
+               blk_stat_init(&ctx->stat[BLK_STAT_READ]);
+               blk_stat_init(&ctx->stat[BLK_STAT_WRITE]);
+       }
+       return count;
+}
+
+static const struct file_operations hctx_stats_fops = {
+       .open           = hctx_stats_open,
+       .read           = seq_read,
+       .write          = hctx_stats_write,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static int hctx_dispatched_show(struct seq_file *m, void *v)
+{
+       struct blk_mq_hw_ctx *hctx = m->private;
+       int i;
+
+       seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
+
+       for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) {
+               unsigned int d = 1U << (i - 1);
+
+               seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]);
+       }
+
+       seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]);
+       return 0;
+}
+
+static int hctx_dispatched_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, hctx_dispatched_show, inode->i_private);
+}
+
+static ssize_t hctx_dispatched_write(struct file *file, const char __user *buf,
+                                    size_t count, loff_t *ppos)
+{
+       struct seq_file *m = file->private_data;
+       struct blk_mq_hw_ctx *hctx = m->private;
+       int i;
+
+       for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++)
+               hctx->dispatched[i] = 0;
+       return count;
+}
+
+static const struct file_operations hctx_dispatched_fops = {
+       .open           = hctx_dispatched_open,
+       .read           = seq_read,
+       .write          = hctx_dispatched_write,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static int hctx_queued_show(struct seq_file *m, void *v)
+{
+       struct blk_mq_hw_ctx *hctx = m->private;
+
+       seq_printf(m, "%lu\n", hctx->queued);
+       return 0;
+}
+
+static int hctx_queued_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, hctx_queued_show, inode->i_private);
+}
+
+static ssize_t hctx_queued_write(struct file *file, const char __user *buf,
+                                size_t count, loff_t *ppos)
+{
+       struct seq_file *m = file->private_data;
+       struct blk_mq_hw_ctx *hctx = m->private;
+
+       hctx->queued = 0;
+       return count;
+}
+
+static const struct file_operations hctx_queued_fops = {
+       .open           = hctx_queued_open,
+       .read           = seq_read,
+       .write          = hctx_queued_write,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static int hctx_run_show(struct seq_file *m, void *v)
+{
+       struct blk_mq_hw_ctx *hctx = m->private;
+
+       seq_printf(m, "%lu\n", hctx->run);
+       return 0;
+}
+
+static int hctx_run_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, hctx_run_show, inode->i_private);
+}
+
+static ssize_t hctx_run_write(struct file *file, const char __user *buf,
+                                size_t count, loff_t *ppos)
+{
+       struct seq_file *m = file->private_data;
+       struct blk_mq_hw_ctx *hctx = m->private;
+
+       hctx->run = 0;
+       return count;
+}
+
+static const struct file_operations hctx_run_fops = {
+       .open           = hctx_run_open,
+       .read           = seq_read,
+       .write          = hctx_run_write,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static int hctx_active_show(struct seq_file *m, void *v)
+{
+       struct blk_mq_hw_ctx *hctx = m->private;
+
+       seq_printf(m, "%d\n", atomic_read(&hctx->nr_active));
+       return 0;
+}
+
+static int hctx_active_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, hctx_active_show, inode->i_private);
+}
+
+static const struct file_operations hctx_active_fops = {
+       .open           = hctx_active_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos)
+{
+       struct blk_mq_ctx *ctx = m->private;
+
+       spin_lock(&ctx->lock);
+       return seq_list_start(&ctx->rq_list, *pos);
+}
+
+static void *ctx_rq_list_next(struct seq_file *m, void *v, loff_t *pos)
+{
+       struct blk_mq_ctx *ctx = m->private;
+
+       return seq_list_next(v, &ctx->rq_list, pos);
+}
+
+static void ctx_rq_list_stop(struct seq_file *m, void *v)
+{
+       struct blk_mq_ctx *ctx = m->private;
+
+       spin_unlock(&ctx->lock);
+}
+
+static const struct seq_operations ctx_rq_list_seq_ops = {
+       .start  = ctx_rq_list_start,
+       .next   = ctx_rq_list_next,
+       .stop   = ctx_rq_list_stop,
+       .show   = blk_mq_debugfs_rq_show,
+};
+
+static int ctx_rq_list_open(struct inode *inode, struct file *file)
+{
+       return blk_mq_debugfs_seq_open(inode, file, &ctx_rq_list_seq_ops);
+}
+
+static const struct file_operations ctx_rq_list_fops = {
+       .open           = ctx_rq_list_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release,
+};
+
+static int ctx_dispatched_show(struct seq_file *m, void *v)
+{
+       struct blk_mq_ctx *ctx = m->private;
+
+       seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]);
+       return 0;
+}
+
+static int ctx_dispatched_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, ctx_dispatched_show, inode->i_private);
+}
+
+static ssize_t ctx_dispatched_write(struct file *file, const char __user *buf,
+                                   size_t count, loff_t *ppos)
+{
+       struct seq_file *m = file->private_data;
+       struct blk_mq_ctx *ctx = m->private;
+
+       ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0;
+       return count;
+}
+
+static const struct file_operations ctx_dispatched_fops = {
+       .open           = ctx_dispatched_open,
+       .read           = seq_read,
+       .write          = ctx_dispatched_write,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static int ctx_merged_show(struct seq_file *m, void *v)
+{
+       struct blk_mq_ctx *ctx = m->private;
+
+       seq_printf(m, "%lu\n", ctx->rq_merged);
+       return 0;
+}
+
+static int ctx_merged_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, ctx_merged_show, inode->i_private);
+}
+
+static ssize_t ctx_merged_write(struct file *file, const char __user *buf,
+                                   size_t count, loff_t *ppos)
+{
+       struct seq_file *m = file->private_data;
+       struct blk_mq_ctx *ctx = m->private;
+
+       ctx->rq_merged = 0;
+       return count;
+}
+
+static const struct file_operations ctx_merged_fops = {
+       .open           = ctx_merged_open,
+       .read           = seq_read,
+       .write          = ctx_merged_write,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static int ctx_completed_show(struct seq_file *m, void *v)
+{
+       struct blk_mq_ctx *ctx = m->private;
+
+       seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]);
+       return 0;
+}
+
+static int ctx_completed_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, ctx_completed_show, inode->i_private);
+}
+
+static ssize_t ctx_completed_write(struct file *file, const char __user *buf,
+                                  size_t count, loff_t *ppos)
+{
+       struct seq_file *m = file->private_data;
+       struct blk_mq_ctx *ctx = m->private;
+
+       ctx->rq_completed[0] = ctx->rq_completed[1] = 0;
+       return count;
+}
+
+static const struct file_operations ctx_completed_fops = {
+       .open           = ctx_completed_open,
+       .read           = seq_read,
+       .write          = ctx_completed_write,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
+       {"state", 0400, &hctx_state_fops},
+       {"flags", 0400, &hctx_flags_fops},
+       {"dispatch", 0400, &hctx_dispatch_fops},
+       {"ctx_map", 0400, &hctx_ctx_map_fops},
+       {"tags", 0400, &hctx_tags_fops},
+       {"tags_bitmap", 0400, &hctx_tags_bitmap_fops},
+       {"sched_tags", 0400, &hctx_sched_tags_fops},
+       {"sched_tags_bitmap", 0400, &hctx_sched_tags_bitmap_fops},
+       {"io_poll", 0600, &hctx_io_poll_fops},
+       {"stats", 0600, &hctx_stats_fops},
+       {"dispatched", 0600, &hctx_dispatched_fops},
+       {"queued", 0600, &hctx_queued_fops},
+       {"run", 0600, &hctx_run_fops},
+       {"active", 0400, &hctx_active_fops},
+};
+
+static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
+       {"rq_list", 0400, &ctx_rq_list_fops},
+       {"dispatched", 0600, &ctx_dispatched_fops},
+       {"merged", 0600, &ctx_merged_fops},
+       {"completed", 0600, &ctx_completed_fops},
+};
+
+int blk_mq_debugfs_register(struct request_queue *q, const char *name)
+{
+       if (!block_debugfs_root)
+               return -ENOENT;
+
+       q->debugfs_dir = debugfs_create_dir(name, block_debugfs_root);
+       if (!q->debugfs_dir)
+               goto err;
+
+       if (blk_mq_debugfs_register_hctxs(q))
+               goto err;
+
+       return 0;
+
+err:
+       blk_mq_debugfs_unregister(q);
+       return -ENOMEM;
+}
+
+void blk_mq_debugfs_unregister(struct request_queue *q)
+{
+       debugfs_remove_recursive(q->debugfs_dir);
+       q->mq_debugfs_dir = NULL;
+       q->debugfs_dir = NULL;
+}
+
+static int blk_mq_debugfs_register_ctx(struct request_queue *q,
+                                      struct blk_mq_ctx *ctx,
+                                      struct dentry *hctx_dir)
+{
+       struct dentry *ctx_dir;
+       char name[20];
+       int i;
+
+       snprintf(name, sizeof(name), "cpu%u", ctx->cpu);
+       ctx_dir = debugfs_create_dir(name, hctx_dir);
+       if (!ctx_dir)
+               return -ENOMEM;
+
+       for (i = 0; i < ARRAY_SIZE(blk_mq_debugfs_ctx_attrs); i++) {
+               const struct blk_mq_debugfs_attr *attr;
+
+               attr = &blk_mq_debugfs_ctx_attrs[i];
+               if (!debugfs_create_file(attr->name, attr->mode, ctx_dir, ctx,
+                                        attr->fops))
+                       return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static int blk_mq_debugfs_register_hctx(struct request_queue *q,
+                                       struct blk_mq_hw_ctx *hctx)
+{
+       struct blk_mq_ctx *ctx;
+       struct dentry *hctx_dir;
+       char name[20];
+       int i;
+
+       snprintf(name, sizeof(name), "%u", hctx->queue_num);
+       hctx_dir = debugfs_create_dir(name, q->mq_debugfs_dir);
+       if (!hctx_dir)
+               return -ENOMEM;
+
+       for (i = 0; i < ARRAY_SIZE(blk_mq_debugfs_hctx_attrs); i++) {
+               const struct blk_mq_debugfs_attr *attr;
+
+               attr = &blk_mq_debugfs_hctx_attrs[i];
+               if (!debugfs_create_file(attr->name, attr->mode, hctx_dir, hctx,
+                                        attr->fops))
+                       return -ENOMEM;
+       }
+
+       hctx_for_each_ctx(hctx, ctx, i) {
+               if (blk_mq_debugfs_register_ctx(q, ctx, hctx_dir))
+                       return -ENOMEM;
+       }
+
+       return 0;
+}
+
+int blk_mq_debugfs_register_hctxs(struct request_queue *q)
+{
+       struct blk_mq_hw_ctx *hctx;
+       int i;
+
+       if (!q->debugfs_dir)
+               return -ENOENT;
+
+       q->mq_debugfs_dir = debugfs_create_dir("mq", q->debugfs_dir);
+       if (!q->mq_debugfs_dir)
+               goto err;
+
+       queue_for_each_hw_ctx(q, hctx, i) {
+               if (blk_mq_debugfs_register_hctx(q, hctx))
+                       goto err;
+       }
+
+       return 0;
+
+err:
+       blk_mq_debugfs_unregister_hctxs(q);
+       return -ENOMEM;
+}
+
+void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
+{
+       debugfs_remove_recursive(q->mq_debugfs_dir);
+       q->mq_debugfs_dir = NULL;
+}
+
+void blk_mq_debugfs_init(void)
+{
+       block_debugfs_root = debugfs_create_dir("block", NULL);
+}
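
Taken together, blk_mq_debugfs_init(), blk_mq_debugfs_register() and blk_mq_debugfs_register_hctxs() above create the following hierarchy (illustrative; <name> is the string passed to blk_mq_debugfs_register(), typically the disk name):

    /*
     * Assuming debugfs is mounted at /sys/kernel/debug:
     *
     *   /sys/kernel/debug/block/<name>/mq/<hctx-number>/
     *       state flags dispatch ctx_map tags tags_bitmap sched_tags
     *       sched_tags_bitmap io_poll stats dispatched queued run active
     *   /sys/kernel/debug/block/<name>/mq/<hctx-number>/cpu<N>/
     *       rq_list dispatched merged completed
     */
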
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
new file mode 100644 (file)
index 0000000..114814e
--- /dev/null
@@ -0,0 +1,481 @@
+/*
+ * blk-mq scheduling framework
+ *
+ * Copyright (C) 2016 Jens Axboe
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/blk-mq.h>
+
+#include <trace/events/block.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-sched.h"
+#include "blk-mq-tag.h"
+#include "blk-wbt.h"
+
+void blk_mq_sched_free_hctx_data(struct request_queue *q,
+                                void (*exit)(struct blk_mq_hw_ctx *))
+{
+       struct blk_mq_hw_ctx *hctx;
+       int i;
+
+       queue_for_each_hw_ctx(q, hctx, i) {
+               if (exit && hctx->sched_data)
+                       exit(hctx);
+               kfree(hctx->sched_data);
+               hctx->sched_data = NULL;
+       }
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
+
+int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
+                               int (*init)(struct blk_mq_hw_ctx *),
+                               void (*exit)(struct blk_mq_hw_ctx *))
+{
+       struct blk_mq_hw_ctx *hctx;
+       int ret;
+       int i;
+
+       queue_for_each_hw_ctx(q, hctx, i) {
+               hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
+               if (!hctx->sched_data) {
+                       ret = -ENOMEM;
+                       goto error;
+               }
+
+               if (init) {
+                       ret = init(hctx);
+                       if (ret) {
+                               /*
+                                * We don't want to give exit() a partially
+                                * initialized sched_data. init() must clean up
+                                * if it fails.
+                                */
+                               kfree(hctx->sched_data);
+                               hctx->sched_data = NULL;
+                               goto error;
+                       }
+               }
+       }
+
+       return 0;
+error:
+       blk_mq_sched_free_hctx_data(q, exit);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data);
+
+static void __blk_mq_sched_assign_ioc(struct request_queue *q,
+                                     struct request *rq, struct io_context *ioc)
+{
+       struct io_cq *icq;
+
+       spin_lock_irq(q->queue_lock);
+       icq = ioc_lookup_icq(ioc, q);
+       spin_unlock_irq(q->queue_lock);
+
+       if (!icq) {
+               icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
+               if (!icq)
+                       return;
+       }
+
+       rq->elv.icq = icq;
+       if (!blk_mq_sched_get_rq_priv(q, rq)) {
+               rq->rq_flags |= RQF_ELVPRIV;
+               get_io_context(icq->ioc);
+               return;
+       }
+
+       rq->elv.icq = NULL;
+}
+
+static void blk_mq_sched_assign_ioc(struct request_queue *q,
+                                   struct request *rq, struct bio *bio)
+{
+       struct io_context *ioc;
+
+       ioc = rq_ioc(bio);
+       if (ioc)
+               __blk_mq_sched_assign_ioc(q, rq, ioc);
+}
+
+struct request *blk_mq_sched_get_request(struct request_queue *q,
+                                        struct bio *bio,
+                                        unsigned int op,
+                                        struct blk_mq_alloc_data *data)
+{
+       struct elevator_queue *e = q->elevator;
+       struct blk_mq_hw_ctx *hctx;
+       struct blk_mq_ctx *ctx;
+       struct request *rq;
+
+       blk_queue_enter_live(q);
+       ctx = blk_mq_get_ctx(q);
+       hctx = blk_mq_map_queue(q, ctx->cpu);
+
+       blk_mq_set_alloc_data(data, q, data->flags, ctx, hctx);
+
+       if (e) {
+               data->flags |= BLK_MQ_REQ_INTERNAL;
+
+               /*
+                * Flush requests are special and go directly to the
+                * dispatch list.
+                */
+               if (!op_is_flush(op) && e->type->ops.mq.get_request) {
+                       rq = e->type->ops.mq.get_request(q, op, data);
+                       if (rq)
+                               rq->rq_flags |= RQF_QUEUED;
+               } else
+                       rq = __blk_mq_alloc_request(data, op);
+       } else {
+               rq = __blk_mq_alloc_request(data, op);
+               if (rq)
+                       data->hctx->tags->rqs[rq->tag] = rq;
+       }
+
+       if (rq) {
+               if (!op_is_flush(op)) {
+                       rq->elv.icq = NULL;
+                       if (e && e->type->icq_cache)
+                               blk_mq_sched_assign_ioc(q, rq, bio);
+               }
+               data->hctx->queued++;
+               return rq;
+       }
+
+       blk_queue_exit(q);
+       return NULL;
+}
+
+void blk_mq_sched_put_request(struct request *rq)
+{
+       struct request_queue *q = rq->q;
+       struct elevator_queue *e = q->elevator;
+
+       if (rq->rq_flags & RQF_ELVPRIV) {
+               blk_mq_sched_put_rq_priv(rq->q, rq);
+               if (rq->elv.icq) {
+                       put_io_context(rq->elv.icq->ioc);
+                       rq->elv.icq = NULL;
+               }
+       }
+
+       if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request)
+               e->type->ops.mq.put_request(rq);
+       else
+               blk_mq_finish_request(rq);
+}
+
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+{
+       struct elevator_queue *e = hctx->queue->elevator;
+       LIST_HEAD(rq_list);
+
+       if (unlikely(blk_mq_hctx_stopped(hctx)))
+               return;
+
+       hctx->run++;
+
+       /*
+        * If we have previous entries on our dispatch list, grab them first for
+        * more fair dispatch.
+        */
+       if (!list_empty_careful(&hctx->dispatch)) {
+               spin_lock(&hctx->lock);
+               if (!list_empty(&hctx->dispatch))
+                       list_splice_init(&hctx->dispatch, &rq_list);
+               spin_unlock(&hctx->lock);
+       }
+
+       /*
+        * Only ask the scheduler for requests, if we didn't have residual
+        * requests from the dispatch list. This is to avoid the case where
+        * we only ever dispatch a fraction of the requests available because
+        * of low device queue depth. Once we pull requests out of the IO
+        * scheduler, we can no longer merge or sort them. So it's best to
+        * leave them there for as long as we can. Mark the hw queue as
+        * needing a restart in that case.
+        */
+       if (!list_empty(&rq_list)) {
+               blk_mq_sched_mark_restart(hctx);
+               blk_mq_dispatch_rq_list(hctx, &rq_list);
+       } else if (!e || !e->type->ops.mq.dispatch_request) {
+               blk_mq_flush_busy_ctxs(hctx, &rq_list);
+               blk_mq_dispatch_rq_list(hctx, &rq_list);
+       } else {
+               do {
+                       struct request *rq;
+
+                       rq = e->type->ops.mq.dispatch_request(hctx);
+                       if (!rq)
+                               break;
+                       list_add(&rq->queuelist, &rq_list);
+               } while (blk_mq_dispatch_rq_list(hctx, &rq_list));
+       }
+}
+
+void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
+                                  struct list_head *rq_list,
+                                  struct request *(*get_rq)(struct blk_mq_hw_ctx *))
+{
+       do {
+               struct request *rq;
+
+               rq = get_rq(hctx);
+               if (!rq)
+                       break;
+
+               list_add_tail(&rq->queuelist, rq_list);
+       } while (1);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch);
+
+bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio)
+{
+       struct request *rq;
+       int ret;
+
+       ret = elv_merge(q, &rq, bio);
+       if (ret == ELEVATOR_BACK_MERGE) {
+               if (!blk_mq_sched_allow_merge(q, rq, bio))
+                       return false;
+               if (bio_attempt_back_merge(q, rq, bio)) {
+                       if (!attempt_back_merge(q, rq))
+                               elv_merged_request(q, rq, ret);
+                       return true;
+               }
+       } else if (ret == ELEVATOR_FRONT_MERGE) {
+               if (!blk_mq_sched_allow_merge(q, rq, bio))
+                       return false;
+               if (bio_attempt_front_merge(q, rq, bio)) {
+                       if (!attempt_front_merge(q, rq))
+                               elv_merged_request(q, rq, ret);
+                       return true;
+               }
+       }
+
+       return false;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
+
+bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+{
+       struct elevator_queue *e = q->elevator;
+
+       if (e->type->ops.mq.bio_merge) {
+               struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
+               struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+
+               blk_mq_put_ctx(ctx);
+               return e->type->ops.mq.bio_merge(hctx, bio);
+       }
+
+       return false;
+}
+
+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
+{
+       return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
+
+void blk_mq_sched_request_inserted(struct request *rq)
+{
+       trace_block_rq_insert(rq->q, rq);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
+
+bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+       if (rq->tag == -1) {
+               rq->rq_flags |= RQF_SORTED;
+               return false;
+       }
+
+       /*
+        * If we already have a real request tag, send directly to
+        * the dispatch list.
+        */
+       spin_lock(&hctx->lock);
+       list_add(&rq->queuelist, &hctx->dispatch);
+       spin_unlock(&hctx->lock);
+       return true;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_bypass_insert);
+
+static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
+{
+       if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
+               clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+               if (blk_mq_hctx_has_pending(hctx))
+                       blk_mq_run_hw_queue(hctx, true);
+       }
+}
+
+void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx)
+{
+       unsigned int i;
+
+       if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
+               blk_mq_sched_restart_hctx(hctx);
+       else {
+               struct request_queue *q = hctx->queue;
+
+               if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags))
+                       return;
+
+               clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags);
+
+               queue_for_each_hw_ctx(q, hctx, i)
+                       blk_mq_sched_restart_hctx(hctx);
+       }
+}
+
+/*
+ * Add flush/fua to the queue. If we fail getting a driver tag, then
+ * punt to the requeue list. Requeue will re-invoke us from a context
+ * that's safe to block from.
+ */
+static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
+                                     struct request *rq, bool can_block)
+{
+       if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
+               blk_insert_flush(rq);
+               blk_mq_run_hw_queue(hctx, true);
+       } else
+               blk_mq_add_to_requeue_list(rq, true, true);
+}
+
+void blk_mq_sched_insert_request(struct request *rq, bool at_head,
+                                bool run_queue, bool async, bool can_block)
+{
+       struct request_queue *q = rq->q;
+       struct elevator_queue *e = q->elevator;
+       struct blk_mq_ctx *ctx = rq->mq_ctx;
+       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+
+       if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) {
+               blk_mq_sched_insert_flush(hctx, rq, can_block);
+               return;
+       }
+
+       if (e && e->type->ops.mq.insert_requests) {
+               LIST_HEAD(list);
+
+               list_add(&rq->queuelist, &list);
+               e->type->ops.mq.insert_requests(hctx, &list, at_head);
+       } else {
+               spin_lock(&ctx->lock);
+               __blk_mq_insert_request(hctx, rq, at_head);
+               spin_unlock(&ctx->lock);
+       }
+
+       if (run_queue)
+               blk_mq_run_hw_queue(hctx, async);
+}
+
+void blk_mq_sched_insert_requests(struct request_queue *q,
+                                 struct blk_mq_ctx *ctx,
+                                 struct list_head *list, bool run_queue_async)
+{
+       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+       struct elevator_queue *e = hctx->queue->elevator;
+
+       if (e && e->type->ops.mq.insert_requests)
+               e->type->ops.mq.insert_requests(hctx, list, false);
+       else
+               blk_mq_insert_requests(hctx, ctx, list);
+
+       blk_mq_run_hw_queue(hctx, run_queue_async);
+}
+
+static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
+                                  struct blk_mq_hw_ctx *hctx,
+                                  unsigned int hctx_idx)
+{
+       if (hctx->sched_tags) {
+               blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
+               blk_mq_free_rq_map(hctx->sched_tags);
+               hctx->sched_tags = NULL;
+       }
+}
+
+int blk_mq_sched_setup(struct request_queue *q)
+{
+       struct blk_mq_tag_set *set = q->tag_set;
+       struct blk_mq_hw_ctx *hctx;
+       int ret, i;
+
+       /*
+        * Default to 256, since we don't split into sync/async like the
+        * old code did. Additionally, this is a per-hw queue depth.
+        */
+       q->nr_requests = 2 * BLKDEV_MAX_RQ;
+
+       /*
+        * We're switching to using an IO scheduler, so setup the hctx
+        * scheduler tags and switch the request map from the regular
+        * tags to scheduler tags. First allocate what we need, so we
+        * can safely fail and fallback, if needed.
+        */
+       ret = 0;
+       queue_for_each_hw_ctx(q, hctx, i) {
+               hctx->sched_tags = blk_mq_alloc_rq_map(set, i, q->nr_requests, 0);
+               if (!hctx->sched_tags) {
+                       ret = -ENOMEM;
+                       break;
+               }
+               ret = blk_mq_alloc_rqs(set, hctx->sched_tags, i, q->nr_requests);
+               if (ret)
+                       break;
+       }
+
+       /*
+        * If we failed, free what we did allocate
+        */
+       if (ret) {
+               queue_for_each_hw_ctx(q, hctx, i) {
+                       if (!hctx->sched_tags)
+                               continue;
+                       blk_mq_sched_free_tags(set, hctx, i);
+               }
+
+               return ret;
+       }
+
+       return 0;
+}
+
+void blk_mq_sched_teardown(struct request_queue *q)
+{
+       struct blk_mq_tag_set *set = q->tag_set;
+       struct blk_mq_hw_ctx *hctx;
+       int i;
+
+       queue_for_each_hw_ctx(q, hctx, i)
+               blk_mq_sched_free_tags(set, hctx, i);
+}
+
+int blk_mq_sched_init(struct request_queue *q)
+{
+       int ret;
+
+#if defined(CONFIG_DEFAULT_SQ_NONE)
+       if (q->nr_hw_queues == 1)
+               return 0;
+#endif
+#if defined(CONFIG_DEFAULT_MQ_NONE)
+       if (q->nr_hw_queues > 1)
+               return 0;
+#endif
+
+       mutex_lock(&q->sysfs_lock);
+       ret = elevator_init(q, NULL);
+       mutex_unlock(&q->sysfs_lock);
+
+       return ret;
+}
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
new file mode 100644 (file)
index 0000000..9478aae
--- /dev/null
@@ -0,0 +1,142 @@
+#ifndef BLK_MQ_SCHED_H
+#define BLK_MQ_SCHED_H
+
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+
+int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
+                               int (*init)(struct blk_mq_hw_ctx *),
+                               void (*exit)(struct blk_mq_hw_ctx *));
+
+void blk_mq_sched_free_hctx_data(struct request_queue *q,
+                                void (*exit)(struct blk_mq_hw_ctx *));
+
+struct request *blk_mq_sched_get_request(struct request_queue *q, struct bio *bio, unsigned int op, struct blk_mq_alloc_data *data);
+void blk_mq_sched_put_request(struct request *rq);
+
+void blk_mq_sched_request_inserted(struct request *rq);
+bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq);
+bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio);
+bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
+void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx);
+
+void blk_mq_sched_insert_request(struct request *rq, bool at_head,
+                                bool run_queue, bool async, bool can_block);
+void blk_mq_sched_insert_requests(struct request_queue *q,
+                                 struct blk_mq_ctx *ctx,
+                                 struct list_head *list, bool run_queue_async);
+
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
+void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
+                       struct list_head *rq_list,
+                       struct request *(*get_rq)(struct blk_mq_hw_ctx *));
+
+int blk_mq_sched_setup(struct request_queue *q);
+void blk_mq_sched_teardown(struct request_queue *q);
+
+int blk_mq_sched_init(struct request_queue *q);
+
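+/*
+ * Avoid the indirect scheduler call if there is no elevator, or if the
+ * queue or bio doesn't allow merging in the first place.
+ */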
+static inline bool
+blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+{
+       struct elevator_queue *e = q->elevator;
+
+       if (!e || blk_queue_nomerges(q) || !bio_mergeable(bio))
+               return false;
+
+       return __blk_mq_sched_bio_merge(q, bio);
+}
+
+static inline int blk_mq_sched_get_rq_priv(struct request_queue *q,
+                                          struct request *rq)
+{
+       struct elevator_queue *e = q->elevator;
+
+       if (e && e->type->ops.mq.get_rq_priv)
+               return e->type->ops.mq.get_rq_priv(q, rq);
+
+       return 0;
+}
+
+static inline void blk_mq_sched_put_rq_priv(struct request_queue *q,
+                                           struct request *rq)
+{
+       struct elevator_queue *e = q->elevator;
+
+       if (e && e->type->ops.mq.put_rq_priv)
+               e->type->ops.mq.put_rq_priv(q, rq);
+}
+
+static inline bool
+blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
+                        struct bio *bio)
+{
+       struct elevator_queue *e = q->elevator;
+
+       if (e && e->type->ops.mq.allow_merge)
+               return e->type->ops.mq.allow_merge(q, rq, bio);
+
+       return true;
+}
+
+static inline void
+blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+       struct elevator_queue *e = hctx->queue->elevator;
+
+       if (e && e->type->ops.mq.completed_request)
+               e->type->ops.mq.completed_request(hctx, rq);
+
+       BUG_ON(rq->internal_tag == -1);
+
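+       /* the request is done, give its scheduler tag back */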
+       blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag);
+}
+
+static inline void blk_mq_sched_started_request(struct request *rq)
+{
+       struct request_queue *q = rq->q;
+       struct elevator_queue *e = q->elevator;
+
+       if (e && e->type->ops.mq.started_request)
+               e->type->ops.mq.started_request(rq);
+}
+
+static inline void blk_mq_sched_requeue_request(struct request *rq)
+{
+       struct request_queue *q = rq->q;
+       struct elevator_queue *e = q->elevator;
+
+       if (e && e->type->ops.mq.requeue_request)
+               e->type->ops.mq.requeue_request(rq);
+}
+
+static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
+{
+       struct elevator_queue *e = hctx->queue->elevator;
+
+       if (e && e->type->ops.mq.has_work)
+               return e->type->ops.mq.has_work(hctx);
+
+       return false;
+}
+
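+/*
+ * Mark this hctx as needing a restart once tags free up; with a shared
+ * tag set, the queue-level RESTART flag is set as well.
+ */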
+static inline void blk_mq_sched_mark_restart(struct blk_mq_hw_ctx *hctx)
+{
+       if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
+               set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+               if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
+                       struct request_queue *q = hctx->queue;
+
+                       if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags))
+                               set_bit(QUEUE_FLAG_RESTART, &q->queue_flags);
+               }
+       }
+}
+
+static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
+{
+       return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+}
+
+#endif
index eacd3af..308b3f4 100644 (file)
@@ -122,123 +122,16 @@ static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj,
        return res;
 }
 
-static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page)
-{
-       return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1],
-                               ctx->rq_dispatched[0]);
-}
-
-static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page)
-{
-       return sprintf(page, "%lu\n", ctx->rq_merged);
-}
-
-static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page)
-{
-       return sprintf(page, "%lu %lu\n", ctx->rq_completed[1],
-                               ctx->rq_completed[0]);
-}
-
-static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg)
-{
-       struct request *rq;
-       int len = snprintf(page, PAGE_SIZE - 1, "%s:\n", msg);
-
-       list_for_each_entry(rq, list, queuelist) {
-               const int rq_len = 2 * sizeof(rq) + 2;
-
-               /* if the output will be truncated */
-               if (PAGE_SIZE - 1 < len + rq_len) {
-                       /* backspacing if it can't hold '\t...\n' */
-                       if (PAGE_SIZE - 1 < len + 5)
-                               len -= rq_len;
-                       len += snprintf(page + len, PAGE_SIZE - 1 - len,
-                                       "\t...\n");
-                       break;
-               }
-               len += snprintf(page + len, PAGE_SIZE - 1 - len,
-                               "\t%p\n", rq);
-       }
-
-       return len;
-}
-
-static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page)
-{
-       ssize_t ret;
-
-       spin_lock(&ctx->lock);
-       ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending");
-       spin_unlock(&ctx->lock);
-
-       return ret;
-}
-
-static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page)
-{
-       return sprintf(page, "considered=%lu, invoked=%lu, success=%lu\n",
-                      hctx->poll_considered, hctx->poll_invoked,
-                      hctx->poll_success);
-}
-
-static ssize_t blk_mq_hw_sysfs_poll_store(struct blk_mq_hw_ctx *hctx,
-                                         const char *page, size_t size)
-{
-       hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0;
-
-       return size;
-}
-
-static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx,
-                                          char *page)
-{
-       return sprintf(page, "%lu\n", hctx->queued);
-}
-
-static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page)
-{
-       return sprintf(page, "%lu\n", hctx->run);
-}
-
-static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx,
-                                              char *page)
-{
-       char *start_page = page;
-       int i;
-
-       page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
-
-       for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) {
-               unsigned int d = 1U << (i - 1);
-
-               page += sprintf(page, "%8u\t%lu\n", d, hctx->dispatched[i]);
-       }
-
-       page += sprintf(page, "%8u+\t%lu\n", 1U << (i - 1),
-                                               hctx->dispatched[i]);
-       return page - start_page;
-}
-
-static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
+static ssize_t blk_mq_hw_sysfs_nr_tags_show(struct blk_mq_hw_ctx *hctx,
                                            char *page)
 {
-       ssize_t ret;
-
-       spin_lock(&hctx->lock);
-       ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending");
-       spin_unlock(&hctx->lock);
-
-       return ret;
+       return sprintf(page, "%u\n", hctx->tags->nr_tags);
 }
 
-static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
+static ssize_t blk_mq_hw_sysfs_nr_reserved_tags_show(struct blk_mq_hw_ctx *hctx,
+                                                    char *page)
 {
-       return blk_mq_tag_sysfs_show(hctx->tags, page);
-}
-
-static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page)
-{
-       return sprintf(page, "%u\n", atomic_read(&hctx->nr_active));
+       return sprintf(page, "%u\n", hctx->tags->nr_reserved_tags);
 }
 
 static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
@@ -259,121 +152,27 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
        return ret;
 }
 
-static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx)
-{
-       struct blk_mq_ctx *ctx;
-       unsigned int i;
-
-       hctx_for_each_ctx(hctx, ctx, i) {
-               blk_stat_init(&ctx->stat[BLK_STAT_READ]);
-               blk_stat_init(&ctx->stat[BLK_STAT_WRITE]);
-       }
-}
-
-static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx,
-                                         const char *page, size_t count)
-{
-       blk_mq_stat_clear(hctx);
-       return count;
-}
-
-static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
-{
-       return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
-                       pre, (long long) stat->nr_samples,
-                       (long long) stat->mean, (long long) stat->min,
-                       (long long) stat->max);
-}
-
-static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page)
-{
-       struct blk_rq_stat stat[2];
-       ssize_t ret;
-
-       blk_stat_init(&stat[BLK_STAT_READ]);
-       blk_stat_init(&stat[BLK_STAT_WRITE]);
-
-       blk_hctx_stat_get(hctx, stat);
-
-       ret = print_stat(page, &stat[BLK_STAT_READ], "read :");
-       ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:");
-       return ret;
-}
-
-static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
-       .attr = {.name = "dispatched", .mode = S_IRUGO },
-       .show = blk_mq_sysfs_dispatched_show,
-};
-static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = {
-       .attr = {.name = "merged", .mode = S_IRUGO },
-       .show = blk_mq_sysfs_merged_show,
-};
-static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = {
-       .attr = {.name = "completed", .mode = S_IRUGO },
-       .show = blk_mq_sysfs_completed_show,
-};
-static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = {
-       .attr = {.name = "rq_list", .mode = S_IRUGO },
-       .show = blk_mq_sysfs_rq_list_show,
-};
-
 static struct attribute *default_ctx_attrs[] = {
-       &blk_mq_sysfs_dispatched.attr,
-       &blk_mq_sysfs_merged.attr,
-       &blk_mq_sysfs_completed.attr,
-       &blk_mq_sysfs_rq_list.attr,
        NULL,
 };
 
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = {
-       .attr = {.name = "queued", .mode = S_IRUGO },
-       .show = blk_mq_hw_sysfs_queued_show,
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = {
+       .attr = {.name = "nr_tags", .mode = S_IRUGO },
+       .show = blk_mq_hw_sysfs_nr_tags_show,
 };
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = {
-       .attr = {.name = "run", .mode = S_IRUGO },
-       .show = blk_mq_hw_sysfs_run_show,
-};
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
-       .attr = {.name = "dispatched", .mode = S_IRUGO },
-       .show = blk_mq_hw_sysfs_dispatched_show,
-};
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = {
-       .attr = {.name = "active", .mode = S_IRUGO },
-       .show = blk_mq_hw_sysfs_active_show,
-};
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
-       .attr = {.name = "pending", .mode = S_IRUGO },
-       .show = blk_mq_hw_sysfs_rq_list_show,
-};
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
-       .attr = {.name = "tags", .mode = S_IRUGO },
-       .show = blk_mq_hw_sysfs_tags_show,
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = {
+       .attr = {.name = "nr_reserved_tags", .mode = S_IRUGO },
+       .show = blk_mq_hw_sysfs_nr_reserved_tags_show,
 };
 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = {
        .attr = {.name = "cpu_list", .mode = S_IRUGO },
        .show = blk_mq_hw_sysfs_cpus_show,
 };
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = {
-       .attr = {.name = "io_poll", .mode = S_IWUSR | S_IRUGO },
-       .show = blk_mq_hw_sysfs_poll_show,
-       .store = blk_mq_hw_sysfs_poll_store,
-};
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = {
-       .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR },
-       .show = blk_mq_hw_sysfs_stat_show,
-       .store = blk_mq_hw_sysfs_stat_store,
-};
 
 static struct attribute *default_hw_ctx_attrs[] = {
-       &blk_mq_hw_sysfs_queued.attr,
-       &blk_mq_hw_sysfs_run.attr,
-       &blk_mq_hw_sysfs_dispatched.attr,
-       &blk_mq_hw_sysfs_pending.attr,
-       &blk_mq_hw_sysfs_tags.attr,
+       &blk_mq_hw_sysfs_nr_tags.attr,
+       &blk_mq_hw_sysfs_nr_reserved_tags.attr,
        &blk_mq_hw_sysfs_cpus.attr,
-       &blk_mq_hw_sysfs_active.attr,
-       &blk_mq_hw_sysfs_poll.attr,
-       &blk_mq_hw_sysfs_stat.attr,
        NULL,
 };
 
@@ -455,6 +254,8 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
                kobject_put(&hctx->kobj);
        }
 
+       blk_mq_debugfs_unregister(q);
+
        kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
        kobject_del(&q->mq_kobj);
        kobject_put(&q->mq_kobj);
@@ -504,6 +305,8 @@ int blk_mq_register_dev(struct device *dev, struct request_queue *q)
 
        kobject_uevent(&q->mq_kobj, KOBJ_ADD);
 
+       blk_mq_debugfs_register(q, kobject_name(&dev->kobj));
+
        queue_for_each_hw_ctx(q, hctx, i) {
                ret = blk_mq_register_hctx(hctx);
                if (ret)
@@ -529,6 +332,8 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
        if (!q->mq_sysfs_init_done)
                return;
 
+       blk_mq_debugfs_unregister_hctxs(q);
+
        queue_for_each_hw_ctx(q, hctx, i)
                blk_mq_unregister_hctx(hctx);
 }
@@ -541,6 +346,8 @@ int blk_mq_sysfs_register(struct request_queue *q)
        if (!q->mq_sysfs_init_done)
                return ret;
 
+       blk_mq_debugfs_register_hctxs(q);
+
        queue_for_each_hw_ctx(q, hctx, i) {
                ret = blk_mq_register_hctx(hctx);
                if (ret)
index dcf5ce3..54c8436 100644 (file)
@@ -90,113 +90,97 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
        return atomic_read(&hctx->nr_active) < depth;
 }
 
-static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt)
+static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
+                           struct sbitmap_queue *bt)
 {
-       if (!hctx_may_queue(hctx, bt))
+       if (!(data->flags & BLK_MQ_REQ_INTERNAL) &&
+           !hctx_may_queue(data->hctx, bt))
                return -1;
        return __sbitmap_queue_get(bt);
 }
 
-static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt,
-                 struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags)
+unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 {
+       struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+       struct sbitmap_queue *bt;
        struct sbq_wait_state *ws;
        DEFINE_WAIT(wait);
+       unsigned int tag_offset;
+       bool drop_ctx;
        int tag;
 
-       tag = __bt_get(hctx, bt);
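+       /*
+        * Reserved tags occupy the low end of the tag space; normal tags
+        * are offset past them when returned to the caller.
+        */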
+       if (data->flags & BLK_MQ_REQ_RESERVED) {
+               if (unlikely(!tags->nr_reserved_tags)) {
+                       WARN_ON_ONCE(1);
+                       return BLK_MQ_TAG_FAIL;
+               }
+               bt = &tags->breserved_tags;
+               tag_offset = 0;
+       } else {
+               bt = &tags->bitmap_tags;
+               tag_offset = tags->nr_reserved_tags;
+       }
+
+       tag = __blk_mq_get_tag(data, bt);
        if (tag != -1)
-               return tag;
+               goto found_tag;
 
        if (data->flags & BLK_MQ_REQ_NOWAIT)
-               return -1;
+               return BLK_MQ_TAG_FAIL;
 
-       ws = bt_wait_ptr(bt, hctx);
+       ws = bt_wait_ptr(bt, data->hctx);
+       drop_ctx = data->ctx == NULL;
        do {
                prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
 
-               tag = __bt_get(hctx, bt);
+               tag = __blk_mq_get_tag(data, bt);
                if (tag != -1)
                        break;
 
                /*
                 * We're out of tags on this hardware queue, kick any
                 * pending IO submits before going to sleep waiting for
-                * some to complete. Note that hctx can be NULL here for
-                * reserved tag allocation.
+                * some to complete.
                 */
-               if (hctx)
-                       blk_mq_run_hw_queue(hctx, false);
+               blk_mq_run_hw_queue(data->hctx, false);
 
                /*
                 * Retry tag allocation after running the hardware queue,
                 * as running the queue may also have found completions.
                 */
-               tag = __bt_get(hctx, bt);
+               tag = __blk_mq_get_tag(data, bt);
                if (tag != -1)
                        break;
 
-               blk_mq_put_ctx(data->ctx);
+               if (data->ctx)
+                       blk_mq_put_ctx(data->ctx);
 
                io_schedule();
 
                data->ctx = blk_mq_get_ctx(data->q);
                data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu);
-               if (data->flags & BLK_MQ_REQ_RESERVED) {
-                       bt = &data->hctx->tags->breserved_tags;
-               } else {
-                       hctx = data->hctx;
-                       bt = &hctx->tags->bitmap_tags;
-               }
+               tags = blk_mq_tags_from_data(data);
+               if (data->flags & BLK_MQ_REQ_RESERVED)
+                       bt = &tags->breserved_tags;
+               else
+                       bt = &tags->bitmap_tags;
+
                finish_wait(&ws->wait, &wait);
-               ws = bt_wait_ptr(bt, hctx);
+               ws = bt_wait_ptr(bt, data->hctx);
        } while (1);
 
-       finish_wait(&ws->wait, &wait);
-       return tag;
-}
-
-static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data)
-{
-       int tag;
-
-       tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx,
-                    data->hctx->tags);
-       if (tag >= 0)
-               return tag + data->hctx->tags->nr_reserved_tags;
-
-       return BLK_MQ_TAG_FAIL;
-}
-
-static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
-{
-       int tag;
-
-       if (unlikely(!data->hctx->tags->nr_reserved_tags)) {
-               WARN_ON_ONCE(1);
-               return BLK_MQ_TAG_FAIL;
-       }
-
-       tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL,
-                    data->hctx->tags);
-       if (tag < 0)
-               return BLK_MQ_TAG_FAIL;
+       if (drop_ctx && data->ctx)
+               blk_mq_put_ctx(data->ctx);
 
-       return tag;
-}
+       finish_wait(&ws->wait, &wait);
 
-unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
-{
-       if (data->flags & BLK_MQ_REQ_RESERVED)
-               return __blk_mq_get_reserved_tag(data);
-       return __blk_mq_get_tag(data);
+found_tag:
+       return tag + tag_offset;
 }
 
-void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
-                   unsigned int tag)
+void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags,
+                   struct blk_mq_ctx *ctx, unsigned int tag)
 {
-       struct blk_mq_tags *tags = hctx->tags;
-
        if (tag >= tags->nr_reserved_tags) {
                const int real_tag = tag - tags->nr_reserved_tags;
 
@@ -312,11 +296,11 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set)
                struct blk_mq_tags *tags = set->tags[i];
 
                for (j = 0; j < tags->nr_tags; j++) {
-                       if (!tags->rqs[j])
+                       if (!tags->static_rqs[j])
                                continue;
 
                        ret = set->ops->reinit_request(set->driver_data,
-                                               tags->rqs[j]);
+                                               tags->static_rqs[j]);
                        if (ret)
                                goto out;
                }
@@ -351,11 +335,6 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
 
 }
 
-static unsigned int bt_unused_tags(const struct sbitmap_queue *bt)
-{
-       return bt->sb.depth - sbitmap_weight(&bt->sb);
-}
-
 static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
                    bool round_robin, int node)
 {
@@ -411,19 +390,56 @@ void blk_mq_free_tags(struct blk_mq_tags *tags)
        kfree(tags);
 }
 
-int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
+int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
+                           struct blk_mq_tags **tagsptr, unsigned int tdepth,
+                           bool can_grow)
 {
-       tdepth -= tags->nr_reserved_tags;
-       if (tdepth > tags->nr_tags)
+       struct blk_mq_tags *tags = *tagsptr;
+
+       if (tdepth <= tags->nr_reserved_tags)
                return -EINVAL;
 
+       tdepth -= tags->nr_reserved_tags;
+
        /*
-        * Don't need (or can't) update reserved tags here, they remain
-        * static and should never need resizing.
+        * If we are allowed to grow beyond the original size, allocate
+        * a new set of tags before freeing the old one.
         */
-       sbitmap_queue_resize(&tags->bitmap_tags, tdepth);
+       if (tdepth > tags->nr_tags) {
+               struct blk_mq_tag_set *set = hctx->queue->tag_set;
+               struct blk_mq_tags *new;
+               bool ret;
+
+               if (!can_grow)
+                       return -EINVAL;
+
+               /*
+                * We need some sort of upper limit; set it high enough that
+                * no valid use case should require more.
+                */
+               if (tdepth > 16 * BLKDEV_MAX_RQ)
+                       return -EINVAL;
+
+               new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 0);
+               if (!new)
+                       return -ENOMEM;
+               ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
+               if (ret) {
+                       blk_mq_free_rq_map(new);
+                       return -ENOMEM;
+               }
+
+               blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
+               blk_mq_free_rq_map(*tagsptr);
+               *tagsptr = new;
+       } else {
+               /*
+                * We don't need to (and can't) update reserved tags here;
+                * they remain static and should never need resizing.
+                */
+               sbitmap_queue_resize(&tags->bitmap_tags, tdepth);
+       }
 
-       blk_mq_tag_wakeup_all(tags, false);
        return 0;
 }
 
@@ -454,25 +470,3 @@ u32 blk_mq_unique_tag(struct request *rq)
                (rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
 }
 EXPORT_SYMBOL(blk_mq_unique_tag);
-
-ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
-{
-       char *orig_page = page;
-       unsigned int free, res;
-
-       if (!tags)
-               return 0;
-
-       page += sprintf(page, "nr_tags=%u, reserved_tags=%u, "
-                       "bits_per_word=%u\n",
-                       tags->nr_tags, tags->nr_reserved_tags,
-                       1U << tags->bitmap_tags.sb.shift);
-
-       free = bt_unused_tags(&tags->bitmap_tags);
-       res = bt_unused_tags(&tags->breserved_tags);
-
-       page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res);
-       page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues));
-
-       return page - orig_page;
-}
index d166273..6349742 100644 (file)
@@ -16,6 +16,7 @@ struct blk_mq_tags {
        struct sbitmap_queue breserved_tags;
 
        struct request **rqs;
+       struct request **static_rqs;
        struct list_head page_list;
 };
 
@@ -24,11 +25,12 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r
 extern void blk_mq_free_tags(struct blk_mq_tags *tags);
 
 extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
-extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
-                          unsigned int tag);
+extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags,
+                          struct blk_mq_ctx *ctx, unsigned int tag);
 extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
-extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
-extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);
+extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
+                                       struct blk_mq_tags **tags,
+                                       unsigned int depth, bool can_grow);
 extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
                void *priv);
index c3400b5..489076e 100644 (file)
@@ -32,6 +32,7 @@
 #include "blk-mq-tag.h"
 #include "blk-stat.h"
 #include "blk-wbt.h"
+#include "blk-mq-sched.h"
 
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
@@ -39,9 +40,11 @@ static LIST_HEAD(all_q_list);
 /*
  * Check if any of the ctx's have pending work in this hardware queue
  */
-static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
+bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
-       return sbitmap_any_bit_set(&hctx->ctx_map);
+       return sbitmap_any_bit_set(&hctx->ctx_map) ||
+                       !list_empty_careful(&hctx->dispatch) ||
+                       blk_mq_sched_has_work(hctx);
 }
 
 /*
@@ -167,8 +170,8 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 }
 EXPORT_SYMBOL(blk_mq_can_queue);
 
-static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
-                              struct request *rq, unsigned int op)
+void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
+                       struct request *rq, unsigned int op)
 {
        INIT_LIST_HEAD(&rq->queuelist);
        /* csd/requeue_work/fifo_time is initialized before use */
@@ -213,53 +216,58 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 
        ctx->rq_dispatched[op_is_sync(op)]++;
 }
+EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init);
 
-static struct request *
-__blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op)
+struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
+                                      unsigned int op)
 {
        struct request *rq;
        unsigned int tag;
 
        tag = blk_mq_get_tag(data);
        if (tag != BLK_MQ_TAG_FAIL) {
-               rq = data->hctx->tags->rqs[tag];
+               struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 
-               if (blk_mq_tag_busy(data->hctx)) {
-                       rq->rq_flags = RQF_MQ_INFLIGHT;
-                       atomic_inc(&data->hctx->nr_active);
+               rq = tags->static_rqs[tag];
+
+               if (data->flags & BLK_MQ_REQ_INTERNAL) {
+                       rq->tag = -1;
+                       rq->internal_tag = tag;
+               } else {
+                       if (blk_mq_tag_busy(data->hctx)) {
+                               rq->rq_flags = RQF_MQ_INFLIGHT;
+                               atomic_inc(&data->hctx->nr_active);
+                       }
+                       rq->tag = tag;
+                       rq->internal_tag = -1;
                }
 
-               rq->tag = tag;
                blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
                return rq;
        }
 
        return NULL;
 }
+EXPORT_SYMBOL_GPL(__blk_mq_alloc_request);
 
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
                unsigned int flags)
 {
-       struct blk_mq_ctx *ctx;
-       struct blk_mq_hw_ctx *hctx;
+       struct blk_mq_alloc_data alloc_data = { .flags = flags };
        struct request *rq;
-       struct blk_mq_alloc_data alloc_data;
        int ret;
 
        ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
        if (ret)
                return ERR_PTR(ret);
 
-       ctx = blk_mq_get_ctx(q);
-       hctx = blk_mq_map_queue(q, ctx->cpu);
-       blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
-       rq = __blk_mq_alloc_request(&alloc_data, rw);
-       blk_mq_put_ctx(ctx);
+       rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
 
-       if (!rq) {
-               blk_queue_exit(q);
+       blk_mq_put_ctx(alloc_data.ctx);
+       blk_queue_exit(q);
+
+       if (!rq)
                return ERR_PTR(-EWOULDBLOCK);
-       }
 
        rq->__data_len = 0;
        rq->__sector = (sector_t) -1;
@@ -319,10 +327,10 @@ out_queue_exit:
 }
 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 
-static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
-                                 struct blk_mq_ctx *ctx, struct request *rq)
+void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+                            struct request *rq)
 {
-       const int tag = rq->tag;
+       const int sched_tag = rq->internal_tag;
        struct request_queue *q = rq->q;
 
        if (rq->rq_flags & RQF_MQ_INFLIGHT)
@@ -333,23 +341,31 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 
        clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
        clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
-       blk_mq_put_tag(hctx, ctx, tag);
+       if (rq->tag != -1)
+               blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
+       if (sched_tag != -1)
+               blk_mq_sched_completed_request(hctx, rq);
+       blk_mq_sched_restart_queues(hctx);
        blk_queue_exit(q);
 }
 
-void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx,
+                                    struct request *rq)
 {
        struct blk_mq_ctx *ctx = rq->mq_ctx;
 
        ctx->rq_completed[rq_is_sync(rq)]++;
-       __blk_mq_free_request(hctx, ctx, rq);
+       __blk_mq_finish_request(hctx, ctx, rq);
+}
 
+void blk_mq_finish_request(struct request *rq)
+{
+       blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
 }
-EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
 
 void blk_mq_free_request(struct request *rq)
 {
-       blk_mq_free_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
+       blk_mq_sched_put_request(rq);
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
@@ -467,6 +483,8 @@ void blk_mq_start_request(struct request *rq)
 {
        struct request_queue *q = rq->q;
 
+       blk_mq_sched_started_request(rq);
+
        trace_block_rq_issue(q, rq);
 
        rq->resid_len = blk_rq_bytes(rq);
@@ -515,6 +533,7 @@ static void __blk_mq_requeue_request(struct request *rq)
 
        trace_block_rq_requeue(q, rq);
        wbt_requeue(q->rq_wb, &rq->issue_stat);
+       blk_mq_sched_requeue_request(rq);
 
        if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
                if (q->dma_drain_size && blk_rq_bytes(rq))
@@ -549,13 +568,13 @@ static void blk_mq_requeue_work(struct work_struct *work)
 
                rq->rq_flags &= ~RQF_SOFTBARRIER;
                list_del_init(&rq->queuelist);
-               blk_mq_insert_request(rq, true, false, false);
+               blk_mq_sched_insert_request(rq, true, false, false, true);
        }
 
        while (!list_empty(&rq_list)) {
                rq = list_entry(rq_list.next, struct request, queuelist);
                list_del_init(&rq->queuelist);
-               blk_mq_insert_request(rq, false, false, false);
+               blk_mq_sched_insert_request(rq, false, false, false, true);
        }
 
        blk_mq_run_hw_queues(q, false);
@@ -639,7 +658,7 @@ struct blk_mq_timeout_data {
 
 void blk_mq_rq_timed_out(struct request *req, bool reserved)
 {
-       struct blk_mq_ops *ops = req->q->mq_ops;
+       const struct blk_mq_ops *ops = req->q->mq_ops;
        enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
 
        /*
@@ -763,6 +782,12 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
                        continue;
 
                el_ret = blk_try_merge(rq, bio);
+               if (el_ret == ELEVATOR_NO_MERGE)
+                       continue;
+
+               if (!blk_mq_sched_allow_merge(q, rq, bio))
+                       break;
+
                if (el_ret == ELEVATOR_BACK_MERGE) {
                        if (bio_attempt_back_merge(q, rq, bio)) {
                                ctx->rq_merged++;
@@ -803,7 +828,7 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
  * Process software queues that have been marked busy, splicing them
  * to the for-dispatch
  */
-static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
+void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
        struct flush_busy_ctx_data data = {
                .hctx = hctx,
@@ -812,6 +837,7 @@ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 
        sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
 }
+EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
 
 static inline unsigned int queued_to_index(unsigned int queued)
 {
@@ -821,6 +847,74 @@ static inline unsigned int queued_to_index(unsigned int queued)
        return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
 }
 
+bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
+                          bool wait)
+{
+       struct blk_mq_alloc_data data = {
+               .q = rq->q,
+               .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
+               .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
+       };
+
+       if (rq->tag != -1) {
+done:
+               if (hctx)
+                       *hctx = data.hctx;
+               return true;
+       }
+
+       rq->tag = blk_mq_get_tag(&data);
+       if (rq->tag >= 0) {
+               if (blk_mq_tag_busy(data.hctx)) {
+                       rq->rq_flags |= RQF_MQ_INFLIGHT;
+                       atomic_inc(&data.hctx->nr_active);
+               }
+               data.hctx->tags->rqs[rq->tag] = rq;
+               goto done;
+       }
+
+       return false;
+}
+
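+/*
+ * Give the driver tag back (e.g. after the driver returned BUSY) while
+ * the request still holds its scheduler tag.
+ */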
+static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
+                                 struct request *rq)
+{
+       if (rq->tag == -1 || rq->internal_tag == -1)
+               return;
+
+       blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
+       rq->tag = -1;
+
+       if (rq->rq_flags & RQF_MQ_INFLIGHT) {
+               rq->rq_flags &= ~RQF_MQ_INFLIGHT;
+               atomic_dec(&hctx->nr_active);
+       }
+}
+
+/*
+ * If we fail to get a driver tag because all the driver tags are already
+ * assigned and on the dispatch list, BUT the first entry does not have a
+ * tag, then we could deadlock. For that case, move the entries that already
+ * have a driver tag to the front of the list, preserving the relative order
+ * within both the tagged and the untagged sets.
+ */
+static bool reorder_tags_to_front(struct list_head *list)
+{
+       struct request *rq, *tmp, *first = NULL;
+
+       list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) {
+               if (rq == first)
+                       break;
+               if (rq->tag != -1) {
+                       list_move(&rq->queuelist, list);
+                       if (!first)
+                               first = rq;
+               }
+       }
+
+       return first != NULL;
+}
+
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
        struct request_queue *q = hctx->queue;
@@ -843,6 +937,20 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
                struct blk_mq_queue_data bd;
 
                rq = list_first_entry(list, struct request, queuelist);
+               if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
+                       if (!queued && reorder_tags_to_front(list))
+                               continue;
+
+                       /*
+                        * We failed to get a driver tag. Mark the queue(s)
+                        * as needing a restart, then retry the allocation
+                        * once more, in case the needed IO completed right
+                        * before we marked the queue as needing a restart.
+                        */
+                       blk_mq_sched_mark_restart(hctx);
+                       if (!blk_mq_get_driver_tag(rq, &hctx, false))
+                               break;
+               }
                list_del_init(&rq->queuelist);
 
                bd.rq = rq;
@@ -855,6 +963,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
                        queued++;
                        break;
                case BLK_MQ_RQ_QUEUE_BUSY:
+                       blk_mq_put_driver_tag(hctx, rq);
                        list_add(&rq->queuelist, list);
                        __blk_mq_requeue_request(rq);
                        break;
@@ -885,7 +994,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
         */
        if (!list_empty(list)) {
                spin_lock(&hctx->lock);
-               list_splice(list, &hctx->dispatch);
+               list_splice_init(list, &hctx->dispatch);
                spin_unlock(&hctx->lock);
 
                /*
@@ -896,47 +1005,17 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
                 * the requests in rq_list might get lost.
                 *
                 * blk_mq_run_hw_queue() already checks the STOPPED bit
-                **/
-               blk_mq_run_hw_queue(hctx, true);
+                *
+                * If RESTART is set, then let completion restart the queue
+                * instead of potentially looping here.
+                */
+               if (!blk_mq_sched_needs_restart(hctx))
+                       blk_mq_run_hw_queue(hctx, true);
        }
 
        return ret != BLK_MQ_RQ_QUEUE_BUSY;
 }
 
-/*
- * Run this hardware queue, pulling any software queues mapped to it in.
- * Note that this function currently has various problems around ordering
- * of IO. In particular, we'd like FIFO behaviour on handling existing
- * items on the hctx->dispatch list. Ignore that for now.
- */
-static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
-{
-       LIST_HEAD(rq_list);
-
-       if (unlikely(blk_mq_hctx_stopped(hctx)))
-               return;
-
-       hctx->run++;
-
-       /*
-        * Touch any software queue that has pending entries.
-        */
-       flush_busy_ctxs(hctx, &rq_list);
-
-       /*
-        * If we have previous entries on our dispatch list, grab them
-        * and stuff them at the front for more fair dispatch.
-        */
-       if (!list_empty_careful(&hctx->dispatch)) {
-               spin_lock(&hctx->lock);
-               if (!list_empty(&hctx->dispatch))
-                       list_splice_init(&hctx->dispatch, &rq_list);
-               spin_unlock(&hctx->lock);
-       }
-
-       blk_mq_dispatch_rq_list(hctx, &rq_list);
-}
-
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
        int srcu_idx;
@@ -946,11 +1025,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 
        if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
                rcu_read_lock();
-               blk_mq_process_rq_list(hctx);
+               blk_mq_sched_dispatch_requests(hctx);
                rcu_read_unlock();
        } else {
                srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
-               blk_mq_process_rq_list(hctx);
+               blk_mq_sched_dispatch_requests(hctx);
                srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
        }
 }
@@ -1006,8 +1085,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
        int i;
 
        queue_for_each_hw_ctx(q, hctx, i) {
-               if ((!blk_mq_hctx_has_pending(hctx) &&
-                   list_empty_careful(&hctx->dispatch)) ||
+               if (!blk_mq_hctx_has_pending(hctx) ||
                    blk_mq_hctx_stopped(hctx))
                        continue;
 
@@ -1116,6 +1194,7 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
        if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
                return;
 
+       blk_mq_stop_hw_queue(hctx);
        kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
                        &hctx->delay_work, msecs_to_jiffies(msecs));
 }
@@ -1135,8 +1214,8 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
                list_add_tail(&rq->queuelist, &ctx->rq_list);
 }
 
-static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
-                                   struct request *rq, bool at_head)
+void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+                            bool at_head)
 {
        struct blk_mq_ctx *ctx = rq->mq_ctx;
 
@@ -1144,32 +1223,10 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
        blk_mq_hctx_mark_pending(hctx, ctx);
 }
 
-void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
-                          bool async)
-{
-       struct blk_mq_ctx *ctx = rq->mq_ctx;
-       struct request_queue *q = rq->q;
-       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-
-       spin_lock(&ctx->lock);
-       __blk_mq_insert_request(hctx, rq, at_head);
-       spin_unlock(&ctx->lock);
-
-       if (run_queue)
-               blk_mq_run_hw_queue(hctx, async);
-}
-
-static void blk_mq_insert_requests(struct request_queue *q,
-                                    struct blk_mq_ctx *ctx,
-                                    struct list_head *list,
-                                    int depth,
-                                    bool from_schedule)
+void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+                           struct list_head *list)
 
 {
-       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-
-       trace_block_unplug(q, depth, !from_schedule);
-
        /*
         * preemption doesn't flush plug list, so it's possible ctx->cpu is
         * offline now
@@ -1185,8 +1242,6 @@ static void blk_mq_insert_requests(struct request_queue *q,
        }
        blk_mq_hctx_mark_pending(hctx, ctx);
        spin_unlock(&ctx->lock);
-
-       blk_mq_run_hw_queue(hctx, from_schedule);
 }
 
 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -1222,9 +1277,10 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
                BUG_ON(!rq->q);
                if (rq->mq_ctx != this_ctx) {
                        if (this_ctx) {
-                               blk_mq_insert_requests(this_q, this_ctx,
-                                                       &ctx_list, depth,
-                                                       from_schedule);
+                               trace_block_unplug(this_q, depth, from_schedule);
+                               blk_mq_sched_insert_requests(this_q, this_ctx,
+                                                               &ctx_list,
+                                                               from_schedule);
                        }
 
                        this_ctx = rq->mq_ctx;
@@ -1241,8 +1297,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
         * on 'ctx_list'. Do those.
         */
        if (this_ctx) {
-               blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
-                                      from_schedule);
+               trace_block_unplug(this_q, depth, from_schedule);
+               blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
+                                               from_schedule);
        }
 }
 
@@ -1280,46 +1337,39 @@ insert_rq:
                }
 
                spin_unlock(&ctx->lock);
-               __blk_mq_free_request(hctx, ctx, rq);
+               __blk_mq_finish_request(hctx, ctx, rq);
                return true;
        }
 }
 
-static struct request *blk_mq_map_request(struct request_queue *q,
-                                         struct bio *bio,
-                                         struct blk_mq_alloc_data *data)
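+/* Build the poll cookie from whichever tag (driver or internal) rq holds */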
+static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
-       struct blk_mq_hw_ctx *hctx;
-       struct blk_mq_ctx *ctx;
-       struct request *rq;
-
-       blk_queue_enter_live(q);
-       ctx = blk_mq_get_ctx(q);
-       hctx = blk_mq_map_queue(q, ctx->cpu);
+       if (rq->tag != -1)
+               return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
 
-       trace_block_getrq(q, bio, bio->bi_opf);
-       blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
-       rq = __blk_mq_alloc_request(data, bio->bi_opf);
-
-       data->hctx->queued++;
-       return rq;
+       return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
 }
 
 static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
 {
-       int ret;
        struct request_queue *q = rq->q;
-       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
        struct blk_mq_queue_data bd = {
                .rq = rq,
                .list = NULL,
                .last = 1
        };
-       blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
+       struct blk_mq_hw_ctx *hctx;
+       blk_qc_t new_cookie;
+       int ret;
 
-       if (blk_mq_hctx_stopped(hctx))
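+       /* with a scheduler attached, don't bypass it by issuing directly */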
+       if (q->elevator)
                goto insert;
 
+       if (!blk_mq_get_driver_tag(rq, &hctx, false))
+               goto insert;
+
+       new_cookie = request_to_qc_t(hctx, rq);
+
        /*
         * For OK queue, we are done. For error, kill it. Any other
         * error (busy), just add it to our list as we previously
@@ -1341,7 +1391,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
        }
 
 insert:
-       blk_mq_insert_request(rq, false, true, true);
+       blk_mq_sched_insert_request(rq, false, true, true, false);
 }
 
 /*
@@ -1352,8 +1402,8 @@ insert:
 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
        const int is_sync = op_is_sync(bio->bi_opf);
-       const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
-       struct blk_mq_alloc_data data;
+       const int is_flush_fua = op_is_flush(bio->bi_opf);
+       struct blk_mq_alloc_data data = { .flags = 0 };
        struct request *rq;
        unsigned int request_count = 0, srcu_idx;
        struct blk_plug *plug;
@@ -1374,9 +1424,14 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
            blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
                return BLK_QC_T_NONE;
 
+       if (blk_mq_sched_bio_merge(q, bio))
+               return BLK_QC_T_NONE;
+
        wb_acct = wbt_wait(q->rq_wb, bio, NULL);
 
-       rq = blk_mq_map_request(q, bio, &data);
+       trace_block_getrq(q, bio, bio->bi_opf);
+
+       rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
        if (unlikely(!rq)) {
                __wbt_done(q->rq_wb, wb_acct);
                return BLK_QC_T_NONE;
@@ -1384,12 +1439,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
        wbt_track(&rq->issue_stat, wb_acct);
 
-       cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
+       cookie = request_to_qc_t(data.hctx, rq);
 
        if (unlikely(is_flush_fua)) {
+               blk_mq_put_ctx(data.ctx);
                blk_mq_bio_to_request(rq, bio);
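+               /*
+                * blk_insert_flush() bypasses the scheduler, so make sure
+                * the request carries a driver tag before inserting it.
+                */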
+               blk_mq_get_driver_tag(rq, NULL, true);
                blk_insert_flush(rq);
-               goto run_queue;
+               blk_mq_run_hw_queue(data.hctx, true);
+               goto done;
        }
 
        plug = current->plug;
@@ -1438,6 +1496,13 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                goto done;
        }
 
+       if (q->elevator) {
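+               /*
+                * A scheduler is attached: hand the request to it rather
+                * than plugging it or issuing it directly.
+                */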
+               blk_mq_put_ctx(data.ctx);
+               blk_mq_bio_to_request(rq, bio);
+               blk_mq_sched_insert_request(rq, false, true,
+                                               !is_sync || is_flush_fua, true);
+               goto done;
+       }
        if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
                /*
                 * For a SYNC request, send it to the hardware immediately. For
@@ -1445,7 +1510,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                 * latter allows for merging opportunities and more efficient
                 * dispatching.
                 */
-run_queue:
                blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
        }
        blk_mq_put_ctx(data.ctx);
@@ -1460,10 +1524,10 @@ done:
 static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 {
        const int is_sync = op_is_sync(bio->bi_opf);
-       const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
+       const int is_flush_fua = op_is_flush(bio->bi_opf);
        struct blk_plug *plug;
        unsigned int request_count = 0;
-       struct blk_mq_alloc_data data;
+       struct blk_mq_alloc_data data = { .flags = 0 };
        struct request *rq;
        blk_qc_t cookie;
        unsigned int wb_acct;
@@ -1483,9 +1547,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
        } else
                request_count = blk_plug_queued_count(q);
 
+       if (blk_mq_sched_bio_merge(q, bio))
+               return BLK_QC_T_NONE;
+
        wb_acct = wbt_wait(q->rq_wb, bio, NULL);
 
-       rq = blk_mq_map_request(q, bio, &data);
+       trace_block_getrq(q, bio, bio->bi_opf);
+
+       rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
        if (unlikely(!rq)) {
                __wbt_done(q->rq_wb, wb_acct);
                return BLK_QC_T_NONE;
@@ -1493,12 +1562,15 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 
        wbt_track(&rq->issue_stat, wb_acct);
 
-       cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
+       cookie = request_to_qc_t(data.hctx, rq);
 
        if (unlikely(is_flush_fua)) {
+               blk_mq_put_ctx(data.ctx);
                blk_mq_bio_to_request(rq, bio);
+               blk_mq_get_driver_tag(rq, NULL, true);
                blk_insert_flush(rq);
-               goto run_queue;
+               blk_mq_run_hw_queue(data.hctx, true);
+               goto done;
        }
 
        /*
@@ -1535,6 +1607,13 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
                return cookie;
        }
 
+       if (q->elevator) {
+               blk_mq_put_ctx(data.ctx);
+               blk_mq_bio_to_request(rq, bio);
+               blk_mq_sched_insert_request(rq, false, true,
+                                               !is_sync || is_flush_fua, true);
+               goto done;
+       }
        if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
                /*
                 * For a SYNC request, send it to the hardware immediately. For
@@ -1542,16 +1621,16 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
                 * latter allows for merging opportunities and more efficient
                 * dispatching.
                 */
-run_queue:
                blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
        }
 
        blk_mq_put_ctx(data.ctx);
+done:
        return cookie;
 }
 
-static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
-               struct blk_mq_tags *tags, unsigned int hctx_idx)
+void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+                    unsigned int hctx_idx)
 {
        struct page *page;
 
@@ -1559,11 +1638,13 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
                int i;
 
                for (i = 0; i < tags->nr_tags; i++) {
-                       if (!tags->rqs[i])
+                       struct request *rq = tags->static_rqs[i];
+
+                       if (!rq)
                                continue;
-                       set->ops->exit_request(set->driver_data, tags->rqs[i],
+                       set->ops->exit_request(set->driver_data, rq,
                                                hctx_idx, i);
-                       tags->rqs[i] = NULL;
+                       tags->static_rqs[i] = NULL;
                }
        }
 
@@ -1577,33 +1658,32 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
                kmemleak_free(page_address(page));
                __free_pages(page, page->private);
        }
+}
 
+void blk_mq_free_rq_map(struct blk_mq_tags *tags)
+{
        kfree(tags->rqs);
+       tags->rqs = NULL;
+       kfree(tags->static_rqs);
+       tags->static_rqs = NULL;
 
        blk_mq_free_tags(tags);
 }
 
-static size_t order_to_size(unsigned int order)
-{
-       return (size_t)PAGE_SIZE << order;
-}
-
-static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
-               unsigned int hctx_idx)
+struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
+                                       unsigned int hctx_idx,
+                                       unsigned int nr_tags,
+                                       unsigned int reserved_tags)
 {
        struct blk_mq_tags *tags;
-       unsigned int i, j, entries_per_page, max_order = 4;
-       size_t rq_size, left;
 
-       tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
+       tags = blk_mq_init_tags(nr_tags, reserved_tags,
                                set->numa_node,
                                BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
        if (!tags)
                return NULL;
 
-       INIT_LIST_HEAD(&tags->page_list);
-
-       tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
+       tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *),
                                 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
                                 set->numa_node);
        if (!tags->rqs) {
@@ -1611,15 +1691,40 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
                return NULL;
        }
 
+       tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *),
+                                GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
+                                set->numa_node);
+       if (!tags->static_rqs) {
+               kfree(tags->rqs);
+               blk_mq_free_tags(tags);
+               return NULL;
+       }
+
+       return tags;
+}
+
+static size_t order_to_size(unsigned int order)
+{
+       return (size_t)PAGE_SIZE << order;
+}
+
+int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+                    unsigned int hctx_idx, unsigned int depth)
+{
+       unsigned int i, j, entries_per_page, max_order = 4;
+       size_t rq_size, left;
+
+       INIT_LIST_HEAD(&tags->page_list);
+
        /*
         * rq_size is the size of the request plus driver payload, rounded
         * to the cacheline size
         */
        rq_size = round_up(sizeof(struct request) + set->cmd_size,
                                cache_line_size());
-       left = rq_size * set->queue_depth;
+       left = rq_size * depth;
 
-       for (i = 0; i < set->queue_depth; ) {
+       for (i = 0; i < depth; ) {
                int this_order = max_order;
                struct page *page;
                int to_do;
@@ -1653,15 +1758,17 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
                 */
                kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
                entries_per_page = order_to_size(this_order) / rq_size;
-               to_do = min(entries_per_page, set->queue_depth - i);
+               to_do = min(entries_per_page, depth - i);
                left -= to_do * rq_size;
                for (j = 0; j < to_do; j++) {
-                       tags->rqs[i] = p;
+                       struct request *rq = p;
+
+                       tags->static_rqs[i] = rq;
                        if (set->ops->init_request) {
                                if (set->ops->init_request(set->driver_data,
-                                               tags->rqs[i], hctx_idx, i,
+                                               rq, hctx_idx, i,
                                                set->numa_node)) {
-                                       tags->rqs[i] = NULL;
+                                       tags->static_rqs[i] = NULL;
                                        goto fail;
                                }
                        }
@@ -1670,11 +1777,11 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
                        i++;
                }
        }
-       return tags;
+       return 0;
 
 fail:
-       blk_mq_free_rq_map(set, tags, hctx_idx);
-       return NULL;
+       blk_mq_free_rqs(set, tags, hctx_idx);
+       return -ENOMEM;
 }
 
 /*
@@ -1866,6 +1973,35 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
        }
 }
 
+static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
+{
+       int ret = 0;
+
+       set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
+                                       set->queue_depth, set->reserved_tags);
+       if (!set->tags[hctx_idx])
+               return false;
+
+       ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
+                               set->queue_depth);
+       if (!ret)
+               return true;
+
+       blk_mq_free_rq_map(set->tags[hctx_idx]);
+       set->tags[hctx_idx] = NULL;
+       return false;
+}
+
+static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
+                                        unsigned int hctx_idx)
+{
+       if (set->tags[hctx_idx]) {
+               blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
+               blk_mq_free_rq_map(set->tags[hctx_idx]);
+               set->tags[hctx_idx] = NULL;
+       }
+}
+
 static void blk_mq_map_swqueue(struct request_queue *q,
                               const struct cpumask *online_mask)
 {
@@ -1894,17 +2030,15 @@ static void blk_mq_map_swqueue(struct request_queue *q,
 
                hctx_idx = q->mq_map[i];
                /* unmapped hw queue can be remapped after CPU topo changed */
-               if (!set->tags[hctx_idx]) {
-                       set->tags[hctx_idx] = blk_mq_init_rq_map(set, hctx_idx);
-
+               if (!set->tags[hctx_idx] &&
+                   !__blk_mq_alloc_rq_map(set, hctx_idx)) {
                        /*
                         * If tags initialization fail for some hctx,
                         * that hctx won't be brought online.  In this
                         * case, remap the current ctx to hctx[0] which
                         * is guaranteed to always have tags allocated
                         */
-                       if (!set->tags[hctx_idx])
-                               q->mq_map[i] = 0;
+                       q->mq_map[i] = 0;
                }
 
                ctx = per_cpu_ptr(q->queue_ctx, i);
@@ -1927,10 +2061,9 @@ static void blk_mq_map_swqueue(struct request_queue *q,
                         * fallback in case of a new remap fails
                         * allocation
                         */
-                       if (i && set->tags[i]) {
-                               blk_mq_free_rq_map(set, set->tags[i], i);
-                               set->tags[i] = NULL;
-                       }
+                       if (i && set->tags[i])
+                               blk_mq_free_map_and_requests(set, i);
+
                        hctx->tags = NULL;
                        continue;
                }
@@ -2023,6 +2156,8 @@ void blk_mq_release(struct request_queue *q)
        struct blk_mq_hw_ctx *hctx;
        unsigned int i;
 
+       blk_mq_sched_teardown(q);
+
        /* hctx kobj stays in hctx */
        queue_for_each_hw_ctx(q, hctx, i) {
                if (!hctx)
@@ -2097,10 +2232,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
                struct blk_mq_hw_ctx *hctx = hctxs[j];
 
                if (hctx) {
-                       if (hctx->tags) {
-                               blk_mq_free_rq_map(set, hctx->tags, j);
-                               set->tags[j] = NULL;
-                       }
+                       if (hctx->tags)
+                               blk_mq_free_map_and_requests(set, j);
                        blk_mq_exit_hctx(q, set, hctx, j);
                        free_cpumask_var(hctx->cpumask);
                        kobject_put(&hctx->kobj);
@@ -2181,6 +2314,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
        mutex_unlock(&all_q_mutex);
        put_online_cpus();
 
+       if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
+               int ret;
+
+               ret = blk_mq_sched_init(q);
+               if (ret)
+                       return ERR_PTR(ret);
+       }
+
        return q;
 
 err_hctxs:
@@ -2279,10 +2420,10 @@ static int blk_mq_queue_reinit_dead(unsigned int cpu)
  * Now CPU1 is just onlined and a request is inserted into ctx1->rq_list
  * and set bit0 in pending bitmap as ctx1->index_hw is still zero.
  *
- * And then while running hw queue, flush_busy_ctxs() finds bit0 is set in
- * pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list.
- * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list
- * is ignored.
+ * And then while running hw queue, blk_mq_flush_busy_ctxs() finds bit0 is set
+ * in pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list.
+ * But hctx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list is
+ * ignored.
  */
 static int blk_mq_queue_reinit_prepare(unsigned int cpu)
 {
@@ -2296,17 +2437,15 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 {
        int i;
 
-       for (i = 0; i < set->nr_hw_queues; i++) {
-               set->tags[i] = blk_mq_init_rq_map(set, i);
-               if (!set->tags[i])
+       for (i = 0; i < set->nr_hw_queues; i++)
+               if (!__blk_mq_alloc_rq_map(set, i))
                        goto out_unwind;
-       }
 
        return 0;
 
 out_unwind:
        while (--i >= 0)
-               blk_mq_free_rq_map(set, set->tags[i], i);
+               blk_mq_free_rq_map(set->tags[i]);
 
        return -ENOMEM;
 }
@@ -2430,10 +2569,8 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 {
        int i;
 
-       for (i = 0; i < nr_cpu_ids; i++) {
-               if (set->tags[i])
-                       blk_mq_free_rq_map(set, set->tags[i], i);
-       }
+       for (i = 0; i < nr_cpu_ids; i++)
+               blk_mq_free_map_and_requests(set, i);
 
        kfree(set->mq_map);
        set->mq_map = NULL;
@@ -2449,14 +2586,28 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
        struct blk_mq_hw_ctx *hctx;
        int i, ret;
 
-       if (!set || nr > set->queue_depth)
+       if (!set)
                return -EINVAL;
 
+       blk_mq_freeze_queue(q);
+       blk_mq_quiesce_queue(q);
+
        ret = 0;
        queue_for_each_hw_ctx(q, hctx, i) {
                if (!hctx->tags)
                        continue;
-               ret = blk_mq_tag_update_depth(hctx->tags, nr);
+               /*
+                * If we're using an MQ scheduler, just update the scheduler
+                * queue depth. This is similar to what the old code would do.
+                */
+               if (!hctx->sched_tags) {
+                       ret = blk_mq_tag_update_depth(hctx, &hctx->tags,
+                                                       min(nr, set->queue_depth),
+                                                       false);
+               } else {
+                       ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
+                                                       nr, true);
+               }
                if (ret)
                        break;
        }
@@ -2464,6 +2615,9 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
        if (!ret)
                q->nr_requests = nr;
 
+       blk_mq_unfreeze_queue(q);
+       blk_mq_start_stopped_hw_queues(q, true);
+
        return ret;
 }
 
@@ -2649,7 +2803,10 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
                blk_flush_plug_list(plug, false);
 
        hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
-       rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
+       if (!blk_qc_t_is_internal(cookie))
+               rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
+       else
+               rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
 
        return __blk_mq_poll(hctx, rq);
 }
@@ -2667,6 +2824,8 @@ void blk_mq_enable_hotplug(void)
 
 static int __init blk_mq_init(void)
 {
+       blk_mq_debugfs_init();
+
        cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
                                blk_mq_hctx_notify_dead);
 
index 63e9116..b52abd6 100644 (file)
@@ -32,8 +32,32 @@ void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *);
+void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
+bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
+bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
+                               bool wait);
 
 /*
+ * Internal helpers for allocating/freeing the request map
+ */
+void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+                    unsigned int hctx_idx);
+void blk_mq_free_rq_map(struct blk_mq_tags *tags);
+struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
+                                       unsigned int hctx_idx,
+                                       unsigned int nr_tags,
+                                       unsigned int reserved_tags);
+int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+                    unsigned int hctx_idx, unsigned int depth);
+
+/*
+ * Internal helpers for request insertion into sw queues
+ */
+void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+                               bool at_head);
+void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+                               struct list_head *list);
+/*
  * CPU hotplug helpers
  */
 void blk_mq_enable_hotplug(void);
@@ -57,6 +81,40 @@ extern int blk_mq_sysfs_register(struct request_queue *q);
 extern void blk_mq_sysfs_unregister(struct request_queue *q);
 extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
 
+/*
+ * debugfs helpers
+ */
+#ifdef CONFIG_BLK_DEBUG_FS
+void blk_mq_debugfs_init(void);
+int blk_mq_debugfs_register(struct request_queue *q, const char *name);
+void blk_mq_debugfs_unregister(struct request_queue *q);
+int blk_mq_debugfs_register_hctxs(struct request_queue *q);
+void blk_mq_debugfs_unregister_hctxs(struct request_queue *q);
+#else
+static inline void blk_mq_debugfs_init(void)
+{
+}
+
+static inline int blk_mq_debugfs_register(struct request_queue *q,
+                                         const char *name)
+{
+       return 0;
+}
+
+static inline void blk_mq_debugfs_unregister(struct request_queue *q)
+{
+}
+
+static inline int blk_mq_debugfs_register_hctxs(struct request_queue *q)
+{
+       return 0;
+}
+
+static inline void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
+{
+}
+#endif
+
 extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
 
 void blk_mq_release(struct request_queue *q);
@@ -103,6 +161,25 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
        data->hctx = hctx;
 }
 
+static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
+{
+       if (data->flags & BLK_MQ_REQ_INTERNAL)
+               return data->hctx->sched_tags;
+
+       return data->hctx->tags;
+}
+
+/*
+ * Internal helpers for request allocation/init/free
+ */
+void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
+                       struct request *rq, unsigned int op);
+void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+                               struct request *rq);
+void blk_mq_finish_request(struct request *rq);
+struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
+                                       unsigned int op);
+
 static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
 {
        return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
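
The debugfs declarations added to blk-mq.h follow the usual config-gated stub idiom: when CONFIG_BLK_DEBUG_FS is disabled, inline no-op versions are provided so callers such as blk_mq_init() need no #ifdef of their own. A header-style sketch of the same idiom for a hypothetical FOO facility (CONFIG_FOO and the foo_*() names are made up for illustration):

/* Sketch of the config-gated stub idiom; CONFIG_FOO and foo_*() are
 * hypothetical. */
struct request_queue;   /* only pointers are needed here */

#ifdef CONFIG_FOO
int foo_register(struct request_queue *q, const char *name);
void foo_unregister(struct request_queue *q);
#else
static inline int foo_register(struct request_queue *q, const char *name)
{
        return 0;       /* report success so callers need no conditionals */
}

static inline void foo_unregister(struct request_queue *q)
{
}
#endif
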
index bae1dec..07cc329 100644 (file)
@@ -272,6 +272,7 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq)
        list_del_init(&rq->queuelist);
        rq->rq_flags &= ~RQF_QUEUED;
        rq->tag = -1;
+       rq->internal_tag = -1;
 
        if (unlikely(bqt->tag_index[tag] == NULL))
                printk(KERN_ERR "%s: tag %d is missing\n",
index a6bb4fe..82fd0cc 100644 (file)
@@ -866,10 +866,12 @@ static void tg_update_disptime(struct throtl_grp *tg)
        unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
        struct bio *bio;
 
-       if ((bio = throtl_peek_queued(&sq->queued[READ])))
+       bio = throtl_peek_queued(&sq->queued[READ]);
+       if (bio)
                tg_may_dispatch(tg, bio, &read_wait);
 
-       if ((bio = throtl_peek_queued(&sq->queued[WRITE])))
+       bio = throtl_peek_queued(&sq->queued[WRITE]);
+       if (bio)
                tg_may_dispatch(tg, bio, &write_wait);
 
        min_wait = min(read_wait, write_wait);
index 041185e..9a716b5 100644 (file)
@@ -167,7 +167,7 @@ static inline struct request *__elv_next_request(struct request_queue *q)
                        return NULL;
                }
                if (unlikely(blk_queue_bypass(q)) ||
-                   !q->elevator->type->ops.elevator_dispatch_fn(q, 0))
+                   !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0))
                        return NULL;
        }
 }
@@ -176,16 +176,16 @@ static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
 {
        struct elevator_queue *e = q->elevator;
 
-       if (e->type->ops.elevator_activate_req_fn)
-               e->type->ops.elevator_activate_req_fn(q, rq);
+       if (e->type->ops.sq.elevator_activate_req_fn)
+               e->type->ops.sq.elevator_activate_req_fn(q, rq);
 }
 
 static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq)
 {
        struct elevator_queue *e = q->elevator;
 
-       if (e->type->ops.elevator_deactivate_req_fn)
-               e->type->ops.elevator_deactivate_req_fn(q, rq);
+       if (e->type->ops.sq.elevator_deactivate_req_fn)
+               e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
 }
 
 #ifdef CONFIG_FAIL_IO_TIMEOUT
@@ -264,6 +264,22 @@ void ioc_clear_queue(struct request_queue *q);
 int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
 
 /**
+ * rq_ioc - determine io_context for request allocation
+ * @bio: request being allocated is for this bio (can be %NULL)
+ *
+ * Determine io_context to use for request allocation for @bio.  May return
+ * %NULL if %current->io_context doesn't exist.
+ */
+static inline struct io_context *rq_ioc(struct bio *bio)
+{
+#ifdef CONFIG_BLK_CGROUP
+       if (bio && bio->bi_ioc)
+               return bio->bi_ioc;
+#endif
+       return current->io_context;
+}
+
+/**
  * create_io_context - try to create task->io_context
  * @gfp_mask: allocation mask
  * @node: allocation node
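
The new rq_ioc() helper centralizes the choice of io_context for a request being allocated: the bio's attached context wins when block cgroups provide one, otherwise it falls back to current->io_context, which may be NULL. A hypothetical caller (not part of the patch) would resolve it once up front:

/* Hypothetical caller sketch -- not from the patch. */
static struct io_context *request_ioc(struct bio *bio)
{
        struct io_context *ioc = rq_ioc(bio);   /* may be NULL */

        /* ioprio/cgroup decisions for the new request would key off ioc
         * here; NULL simply means no context has been created yet. */
        return ioc;
}
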
index c73a6fc..f0f29ee 100644 (file)
@@ -2749,9 +2749,11 @@ static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
        if (!cfqg)
                return NULL;
 
-       for_each_cfqg_st(cfqg, i, j, st)
-               if ((cfqq = cfq_rb_first(st)) != NULL)
+       for_each_cfqg_st(cfqg, i, j, st) {
+               cfqq = cfq_rb_first(st);
+               if (cfqq)
                        return cfqq;
+       }
        return NULL;
 }
 
@@ -3864,6 +3866,8 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
                goto out;
        }
 
+       /* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. */
+       cfqq->ioprio_class = IOPRIO_CLASS_NONE;
        cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
        cfq_init_prio_data(cfqq, cic);
        cfq_link_cfqq_cfqg(cfqq, cfqg);
@@ -4837,7 +4841,7 @@ static struct elv_fs_entry cfq_attrs[] = {
 };
 
 static struct elevator_type iosched_cfq = {
-       .ops = {
+       .ops.sq = {
                .elevator_merge_fn =            cfq_merge,
                .elevator_merged_fn =           cfq_merged_request,
                .elevator_merge_req_fn =        cfq_merged_requests,
index 55e0bb6..05fc0ea 100644 (file)
@@ -439,7 +439,7 @@ static struct elv_fs_entry deadline_attrs[] = {
 };
 
 static struct elevator_type iosched_deadline = {
-       .ops = {
+       .ops.sq = {
                .elevator_merge_fn =            deadline_merge,
                .elevator_merged_fn =           deadline_merged_request,
                .elevator_merge_req_fn =        deadline_merged_requests,
index 40f0c04..b2a5516 100644 (file)
@@ -40,6 +40,7 @@
 #include <trace/events/block.h>
 
 #include "blk.h"
+#include "blk-mq-sched.h"
 
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
@@ -58,8 +59,10 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
        struct request_queue *q = rq->q;
        struct elevator_queue *e = q->elevator;
 
-       if (e->type->ops.elevator_allow_bio_merge_fn)
-               return e->type->ops.elevator_allow_bio_merge_fn(q, rq, bio);
+       if (e->uses_mq && e->type->ops.mq.allow_merge)
+               return e->type->ops.mq.allow_merge(q, rq, bio);
+       else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn)
+               return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio);
 
        return 1;
 }
@@ -163,6 +166,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
        kobject_init(&eq->kobj, &elv_ktype);
        mutex_init(&eq->sysfs_lock);
        hash_init(eq->hash);
+       eq->uses_mq = e->uses_mq;
 
        return eq;
 }
@@ -203,11 +207,12 @@ int elevator_init(struct request_queue *q, char *name)
        }
 
        /*
-        * Use the default elevator specified by config boot param or
-        * config option.  Don't try to load modules as we could be running
-        * off async and request_module() isn't allowed from async.
+        * Use the default elevator specified by config boot param for
+        * non-mq devices, or by config option. Don't try to load modules
+        * as we could be running off async and request_module() isn't
+        * allowed from async.
         */
-       if (!e && *chosen_elevator) {
+       if (!e && !q->mq_ops && *chosen_elevator) {
                e = elevator_get(chosen_elevator, false);
                if (!e)
                        printk(KERN_ERR "I/O scheduler %s not found\n",
@@ -215,18 +220,32 @@ int elevator_init(struct request_queue *q, char *name)
        }
 
        if (!e) {
-               e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
+               if (q->mq_ops && q->nr_hw_queues == 1)
+                       e = elevator_get(CONFIG_DEFAULT_SQ_IOSCHED, false);
+               else if (q->mq_ops)
+                       e = elevator_get(CONFIG_DEFAULT_MQ_IOSCHED, false);
+               else
+                       e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
+
                if (!e) {
                        printk(KERN_ERR
                                "Default I/O scheduler not found. " \
-                               "Using noop.\n");
+                               "Using noop/none.\n");
                        e = elevator_get("noop", false);
                }
        }
 
-       err = e->ops.elevator_init_fn(q, e);
-       if (err)
+       if (e->uses_mq) {
+               err = blk_mq_sched_setup(q);
+               if (!err)
+                       err = e->ops.mq.init_sched(q, e);
+       } else
+               err = e->ops.sq.elevator_init_fn(q, e);
+       if (err) {
+               if (e->uses_mq)
+                       blk_mq_sched_teardown(q);
                elevator_put(e);
+       }
        return err;
 }
 EXPORT_SYMBOL(elevator_init);
@@ -234,8 +253,10 @@ EXPORT_SYMBOL(elevator_init);
 void elevator_exit(struct elevator_queue *e)
 {
        mutex_lock(&e->sysfs_lock);
-       if (e->type->ops.elevator_exit_fn)
-               e->type->ops.elevator_exit_fn(e);
+       if (e->uses_mq && e->type->ops.mq.exit_sched)
+               e->type->ops.mq.exit_sched(e);
+       else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn)
+               e->type->ops.sq.elevator_exit_fn(e);
        mutex_unlock(&e->sysfs_lock);
 
        kobject_put(&e->kobj);
@@ -253,6 +274,7 @@ void elv_rqhash_del(struct request_queue *q, struct request *rq)
        if (ELV_ON_HASH(rq))
                __elv_rqhash_del(rq);
 }
+EXPORT_SYMBOL_GPL(elv_rqhash_del);
 
 void elv_rqhash_add(struct request_queue *q, struct request *rq)
 {
@@ -262,6 +284,7 @@ void elv_rqhash_add(struct request_queue *q, struct request *rq)
        hash_add(e->hash, &rq->hash, rq_hash_key(rq));
        rq->rq_flags |= RQF_HASHED;
 }
+EXPORT_SYMBOL_GPL(elv_rqhash_add);
 
 void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
 {
@@ -443,8 +466,10 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
                return ELEVATOR_BACK_MERGE;
        }
 
-       if (e->type->ops.elevator_merge_fn)
-               return e->type->ops.elevator_merge_fn(q, req, bio);
+       if (e->uses_mq && e->type->ops.mq.request_merge)
+               return e->type->ops.mq.request_merge(q, req, bio);
+       else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn)
+               return e->type->ops.sq.elevator_merge_fn(q, req, bio);
 
        return ELEVATOR_NO_MERGE;
 }
@@ -456,8 +481,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
  *
  * Returns true if we merged, false otherwise
  */
-static bool elv_attempt_insert_merge(struct request_queue *q,
-                                    struct request *rq)
+bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
 {
        struct request *__rq;
        bool ret;
@@ -495,8 +519,10 @@ void elv_merged_request(struct request_queue *q, struct request *rq, int type)
 {
        struct elevator_queue *e = q->elevator;
 
-       if (e->type->ops.elevator_merged_fn)
-               e->type->ops.elevator_merged_fn(q, rq, type);
+       if (e->uses_mq && e->type->ops.mq.request_merged)
+               e->type->ops.mq.request_merged(q, rq, type);
+       else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn)
+               e->type->ops.sq.elevator_merged_fn(q, rq, type);
 
        if (type == ELEVATOR_BACK_MERGE)
                elv_rqhash_reposition(q, rq);
@@ -508,10 +534,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
                             struct request *next)
 {
        struct elevator_queue *e = q->elevator;
-       const int next_sorted = next->rq_flags & RQF_SORTED;
-
-       if (next_sorted && e->type->ops.elevator_merge_req_fn)
-               e->type->ops.elevator_merge_req_fn(q, rq, next);
+       bool next_sorted = false;
+
+       if (e->uses_mq && e->type->ops.mq.requests_merged)
+               e->type->ops.mq.requests_merged(q, rq, next);
+       else if (e->type->ops.sq.elevator_merge_req_fn) {
+               next_sorted = next->rq_flags & RQF_SORTED;
+               if (next_sorted)
+                       e->type->ops.sq.elevator_merge_req_fn(q, rq, next);
+       }
 
        elv_rqhash_reposition(q, rq);
 
@@ -528,8 +559,11 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
 {
        struct elevator_queue *e = q->elevator;
 
-       if (e->type->ops.elevator_bio_merged_fn)
-               e->type->ops.elevator_bio_merged_fn(q, rq, bio);
+       if (WARN_ON_ONCE(e->uses_mq))
+               return;
+
+       if (e->type->ops.sq.elevator_bio_merged_fn)
+               e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio);
 }
 
 #ifdef CONFIG_PM
@@ -574,11 +608,15 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
 
 void elv_drain_elevator(struct request_queue *q)
 {
+       struct elevator_queue *e = q->elevator;
        static int printed;
 
+       if (WARN_ON_ONCE(e->uses_mq))
+               return;
+
        lockdep_assert_held(q->queue_lock);
 
-       while (q->elevator->type->ops.elevator_dispatch_fn(q, 1))
+       while (e->type->ops.sq.elevator_dispatch_fn(q, 1))
                ;
        if (q->nr_sorted && printed++ < 10) {
                printk(KERN_ERR "%s: forced dispatching is broken "
@@ -653,7 +691,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
                 * rq cannot be accessed after calling
                 * elevator_add_req_fn.
                 */
-               q->elevator->type->ops.elevator_add_req_fn(q, rq);
+               q->elevator->type->ops.sq.elevator_add_req_fn(q, rq);
                break;
 
        case ELEVATOR_INSERT_FLUSH:
@@ -682,8 +720,11 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq)
 {
        struct elevator_queue *e = q->elevator;
 
-       if (e->type->ops.elevator_latter_req_fn)
-               return e->type->ops.elevator_latter_req_fn(q, rq);
+       if (e->uses_mq && e->type->ops.mq.next_request)
+               return e->type->ops.mq.next_request(q, rq);
+       else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn)
+               return e->type->ops.sq.elevator_latter_req_fn(q, rq);
+
        return NULL;
 }
 
@@ -691,8 +732,10 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
 {
        struct elevator_queue *e = q->elevator;
 
-       if (e->type->ops.elevator_former_req_fn)
-               return e->type->ops.elevator_former_req_fn(q, rq);
+       if (e->uses_mq && e->type->ops.mq.former_request)
+               return e->type->ops.mq.former_request(q, rq);
+       if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn)
+               return e->type->ops.sq.elevator_former_req_fn(q, rq);
        return NULL;
 }
 
@@ -701,8 +744,11 @@ int elv_set_request(struct request_queue *q, struct request *rq,
 {
        struct elevator_queue *e = q->elevator;
 
-       if (e->type->ops.elevator_set_req_fn)
-               return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask);
+       if (WARN_ON_ONCE(e->uses_mq))
+               return 0;
+
+       if (e->type->ops.sq.elevator_set_req_fn)
+               return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask);
        return 0;
 }
 
@@ -710,16 +756,22 @@ void elv_put_request(struct request_queue *q, struct request *rq)
 {
        struct elevator_queue *e = q->elevator;
 
-       if (e->type->ops.elevator_put_req_fn)
-               e->type->ops.elevator_put_req_fn(rq);
+       if (WARN_ON_ONCE(e->uses_mq))
+               return;
+
+       if (e->type->ops.sq.elevator_put_req_fn)
+               e->type->ops.sq.elevator_put_req_fn(rq);
 }
 
 int elv_may_queue(struct request_queue *q, unsigned int op)
 {
        struct elevator_queue *e = q->elevator;
 
-       if (e->type->ops.elevator_may_queue_fn)
-               return e->type->ops.elevator_may_queue_fn(q, op);
+       if (WARN_ON_ONCE(e->uses_mq))
+               return 0;
+
+       if (e->type->ops.sq.elevator_may_queue_fn)
+               return e->type->ops.sq.elevator_may_queue_fn(q, op);
 
        return ELV_MQUEUE_MAY;
 }
@@ -728,14 +780,17 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 {
        struct elevator_queue *e = q->elevator;
 
+       if (WARN_ON_ONCE(e->uses_mq))
+               return;
+
        /*
         * request is released from the driver, io must be done
         */
        if (blk_account_rq(rq)) {
                q->in_flight[rq_is_sync(rq)]--;
                if ((rq->rq_flags & RQF_SORTED) &&
-                   e->type->ops.elevator_completed_req_fn)
-                       e->type->ops.elevator_completed_req_fn(q, rq);
+                   e->type->ops.sq.elevator_completed_req_fn)
+                       e->type->ops.sq.elevator_completed_req_fn(q, rq);
        }
 }
 
@@ -803,8 +858,8 @@ int elv_register_queue(struct request_queue *q)
                }
                kobject_uevent(&e->kobj, KOBJ_ADD);
                e->registered = 1;
-               if (e->type->ops.elevator_registered_fn)
-                       e->type->ops.elevator_registered_fn(q);
+               if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn)
+                       e->type->ops.sq.elevator_registered_fn(q);
        }
        return error;
 }
@@ -891,9 +946,14 @@ EXPORT_SYMBOL_GPL(elv_unregister);
 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 {
        struct elevator_queue *old = q->elevator;
-       bool registered = old->registered;
+       bool old_registered = false;
        int err;
 
+       if (q->mq_ops) {
+               blk_mq_freeze_queue(q);
+               blk_mq_quiesce_queue(q);
+       }
+
        /*
         * Turn on BYPASS and drain all requests w/ elevator private data.
         * Block layer doesn't call into a quiesced elevator - all requests
@@ -901,42 +961,76 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
         * using INSERT_BACK.  All requests have SOFTBARRIER set and no
         * merge happens either.
         */
-       blk_queue_bypass_start(q);
+       if (old) {
+               old_registered = old->registered;
 
-       /* unregister and clear all auxiliary data of the old elevator */
-       if (registered)
-               elv_unregister_queue(q);
+               if (old->uses_mq)
+                       blk_mq_sched_teardown(q);
 
-       spin_lock_irq(q->queue_lock);
-       ioc_clear_queue(q);
-       spin_unlock_irq(q->queue_lock);
+               if (!q->mq_ops)
+                       blk_queue_bypass_start(q);
+
+               /* unregister and clear all auxiliary data of the old elevator */
+               if (old_registered)
+                       elv_unregister_queue(q);
+
+               spin_lock_irq(q->queue_lock);
+               ioc_clear_queue(q);
+               spin_unlock_irq(q->queue_lock);
+       }
 
        /* allocate, init and register new elevator */
-       err = new_e->ops.elevator_init_fn(q, new_e);
-       if (err)
-               goto fail_init;
+       if (new_e) {
+               if (new_e->uses_mq) {
+                       err = blk_mq_sched_setup(q);
+                       if (!err)
+                               err = new_e->ops.mq.init_sched(q, new_e);
+               } else
+                       err = new_e->ops.sq.elevator_init_fn(q, new_e);
+               if (err)
+                       goto fail_init;
 
-       if (registered) {
                err = elv_register_queue(q);
                if (err)
                        goto fail_register;
-       }
+       } else
+               q->elevator = NULL;
 
        /* done, kill the old one and finish */
-       elevator_exit(old);
-       blk_queue_bypass_end(q);
+       if (old) {
+               elevator_exit(old);
+               if (!q->mq_ops)
+                       blk_queue_bypass_end(q);
+       }
 
-       blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+       if (q->mq_ops) {
+               blk_mq_unfreeze_queue(q);
+               blk_mq_start_stopped_hw_queues(q, true);
+       }
+
+       if (new_e)
+               blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+       else
+               blk_add_trace_msg(q, "elv switch: none");
 
        return 0;
 
 fail_register:
+       if (q->mq_ops)
+               blk_mq_sched_teardown(q);
        elevator_exit(q->elevator);
 fail_init:
        /* switch failed, restore and re-register old elevator */
-       q->elevator = old;
-       elv_register_queue(q);
-       blk_queue_bypass_end(q);
+       if (old) {
+               q->elevator = old;
+               elv_register_queue(q);
+               if (!q->mq_ops)
+                       blk_queue_bypass_end(q);
+       }
+       if (q->mq_ops) {
+               blk_mq_unfreeze_queue(q);
+               blk_mq_start_stopped_hw_queues(q, true);
+       }
 
        return err;
 }
@@ -949,8 +1043,11 @@ static int __elevator_change(struct request_queue *q, const char *name)
        char elevator_name[ELV_NAME_MAX];
        struct elevator_type *e;
 
-       if (!q->elevator)
-               return -ENXIO;
+       /*
+        * Special case for mq, turn off scheduling
+        */
+       if (q->mq_ops && !strncmp(name, "none", 4))
+               return elevator_switch(q, NULL);
 
        strlcpy(elevator_name, name, sizeof(elevator_name));
        e = elevator_get(strstrip(elevator_name), true);
@@ -959,11 +1056,21 @@ static int __elevator_change(struct request_queue *q, const char *name)
                return -EINVAL;
        }
 
-       if (!strcmp(elevator_name, q->elevator->type->elevator_name)) {
+       if (q->elevator &&
+           !strcmp(elevator_name, q->elevator->type->elevator_name)) {
                elevator_put(e);
                return 0;
        }
 
+       if (!e->uses_mq && q->mq_ops) {
+               elevator_put(e);
+               return -EINVAL;
+       }
+       if (e->uses_mq && !q->mq_ops) {
+               elevator_put(e);
+               return -EINVAL;
+       }
+
        return elevator_switch(q, e);
 }
 
@@ -985,7 +1092,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 {
        int ret;
 
-       if (!q->elevator)
+       if (!(q->mq_ops || q->request_fn))
                return count;
 
        ret = __elevator_change(q, name);
@@ -999,24 +1106,34 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 ssize_t elv_iosched_show(struct request_queue *q, char *name)
 {
        struct elevator_queue *e = q->elevator;
-       struct elevator_type *elv;
+       struct elevator_type *elv = NULL;
        struct elevator_type *__e;
        int len = 0;
 
-       if (!q->elevator || !blk_queue_stackable(q))
+       if (!blk_queue_stackable(q))
                return sprintf(name, "none\n");
 
-       elv = e->type;
+       if (!q->elevator)
+               len += sprintf(name+len, "[none] ");
+       else
+               elv = e->type;
 
        spin_lock(&elv_list_lock);
        list_for_each_entry(__e, &elv_list, list) {
-               if (!strcmp(elv->elevator_name, __e->elevator_name))
+               if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) {
                        len += sprintf(name+len, "[%s] ", elv->elevator_name);
-               else
+                       continue;
+               }
+               if (__e->uses_mq && q->mq_ops)
+                       len += sprintf(name+len, "%s ", __e->elevator_name);
+               else if (!__e->uses_mq && !q->mq_ops)
                        len += sprintf(name+len, "%s ", __e->elevator_name);
        }
        spin_unlock(&elv_list_lock);
 
+       if (q->mq_ops && q->elevator)
+               len += sprintf(name+len, "none");
+
        len += sprintf(len+name, "\n");
        return len;
 }
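
Most of the elevator.c churn above is mechanical: every hook now checks e->uses_mq and calls the matching member of the ops.sq/ops.mq union, with WARN_ON_ONCE() guarding the legacy-only paths. Condensed into one hypothetical hook ("example" and "example_fn" are not real callbacks):

/* Condensed sketch of the dual-dispatch pattern used throughout elevator.c;
 * "example" and "example_fn" are hypothetical hook names. */
static int elv_example(struct request_queue *q, struct request *rq)
{
        struct elevator_queue *e = q->elevator;

        if (e->uses_mq && e->type->ops.mq.example)
                return e->type->ops.mq.example(q, rq);
        else if (!e->uses_mq && e->type->ops.sq.example_fn)
                return e->type->ops.sq.example_fn(q, rq);

        return 0;       /* default when the scheduler leaves the hook unset */
}
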
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
new file mode 100644 (file)
index 0000000..d93ec71
--- /dev/null
@@ -0,0 +1,555 @@
+/*
+ *  MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
+ *  for the blk-mq scheduling framework
+ *
+ *  Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/elevator.h>
+#include <linux/bio.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/rbtree.h>
+#include <linux/sbitmap.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
+
+/*
+ * See Documentation/block/deadline-iosched.txt
+ */
+static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
+static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
+static const int writes_starved = 2;    /* max times reads can starve a write */
+static const int fifo_batch = 16;       /* # of sequential requests treated as one
+                                    by the above parameters. For throughput. */
+
+struct deadline_data {
+       /*
+        * run time data
+        */
+
+       /*
+        * requests (deadline_rq s) are present on both sort_list and fifo_list
+        */
+       struct rb_root sort_list[2];
+       struct list_head fifo_list[2];
+
+       /*
+        * next in sort order. read, write or both are NULL
+        */
+       struct request *next_rq[2];
+       unsigned int batching;          /* number of sequential requests made */
+       unsigned int starved;           /* times reads have starved writes */
+
+       /*
+        * settings that change how the i/o scheduler behaves
+        */
+       int fifo_expire[2];
+       int fifo_batch;
+       int writes_starved;
+       int front_merges;
+
+       spinlock_t lock;
+       struct list_head dispatch;
+};
+
+static inline struct rb_root *
+deadline_rb_root(struct deadline_data *dd, struct request *rq)
+{
+       return &dd->sort_list[rq_data_dir(rq)];
+}
+
+/*
+ * get the request after `rq' in sector-sorted order
+ */
+static inline struct request *
+deadline_latter_request(struct request *rq)
+{
+       struct rb_node *node = rb_next(&rq->rb_node);
+
+       if (node)
+               return rb_entry_rq(node);
+
+       return NULL;
+}
+
+static void
+deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
+{
+       struct rb_root *root = deadline_rb_root(dd, rq);
+
+       elv_rb_add(root, rq);
+}
+
+static inline void
+deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
+{
+       const int data_dir = rq_data_dir(rq);
+
+       if (dd->next_rq[data_dir] == rq)
+               dd->next_rq[data_dir] = deadline_latter_request(rq);
+
+       elv_rb_del(deadline_rb_root(dd, rq), rq);
+}
+
+/*
+ * remove rq from rbtree and fifo.
+ */
+static void deadline_remove_request(struct request_queue *q, struct request *rq)
+{
+       struct deadline_data *dd = q->elevator->elevator_data;
+
+       list_del_init(&rq->queuelist);
+
+       /*
+        * We might not be on the rbtree, if we are doing an insert merge
+        */
+       if (!RB_EMPTY_NODE(&rq->rb_node))
+               deadline_del_rq_rb(dd, rq);
+
+       elv_rqhash_del(q, rq);
+       if (q->last_merge == rq)
+               q->last_merge = NULL;
+}
+
+static void dd_request_merged(struct request_queue *q, struct request *req,
+                             int type)
+{
+       struct deadline_data *dd = q->elevator->elevator_data;
+
+       /*
+        * if the merge was a front merge, we need to reposition request
+        */
+       if (type == ELEVATOR_FRONT_MERGE) {
+               elv_rb_del(deadline_rb_root(dd, req), req);
+               deadline_add_rq_rb(dd, req);
+       }
+}
+
+static void dd_merged_requests(struct request_queue *q, struct request *req,
+                              struct request *next)
+{
+       /*
+        * if next expires before rq, assign its expire time to rq
+        * and move into next position (next will be deleted) in fifo
+        */
+       if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
+               if (time_before((unsigned long)next->fifo_time,
+                               (unsigned long)req->fifo_time)) {
+                       list_move(&req->queuelist, &next->queuelist);
+                       req->fifo_time = next->fifo_time;
+               }
+       }
+
+       /*
+        * kill knowledge of next, this one is a goner
+        */
+       deadline_remove_request(q, next);
+}
+
+/*
+ * move an entry to dispatch queue
+ */
+static void
+deadline_move_request(struct deadline_data *dd, struct request *rq)
+{
+       const int data_dir = rq_data_dir(rq);
+
+       dd->next_rq[READ] = NULL;
+       dd->next_rq[WRITE] = NULL;
+       dd->next_rq[data_dir] = deadline_latter_request(rq);
+
+       /*
+        * take it off the sort and fifo list
+        */
+       deadline_remove_request(rq->q, rq);
+}
+
+/*
+ * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
+ * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
+ */
+static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
+{
+       struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
+
+       /*
+        * rq is expired!
+        */
+       if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
+               return 1;
+
+       return 0;
+}
+
+/*
+ * deadline_dispatch_requests selects the best request according to
+ * read/write expire, fifo_batch, etc
+ */
+static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+       struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+       struct request *rq;
+       bool reads, writes;
+       int data_dir;
+
+       if (!list_empty(&dd->dispatch)) {
+               rq = list_first_entry(&dd->dispatch, struct request, queuelist);
+               list_del_init(&rq->queuelist);
+               goto done;
+       }
+
+       reads = !list_empty(&dd->fifo_list[READ]);
+       writes = !list_empty(&dd->fifo_list[WRITE]);
+
+       /*
+        * batches are currently reads XOR writes
+        */
+       if (dd->next_rq[WRITE])
+               rq = dd->next_rq[WRITE];
+       else
+               rq = dd->next_rq[READ];
+
+       if (rq && dd->batching < dd->fifo_batch)
+               /* we have a next request and are still entitled to batch */
+               goto dispatch_request;
+
+       /*
+        * at this point we are not running a batch. select the appropriate
+        * data direction (read / write)
+        */
+
+       if (reads) {
+               BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
+
+               if (writes && (dd->starved++ >= dd->writes_starved))
+                       goto dispatch_writes;
+
+               data_dir = READ;
+
+               goto dispatch_find_request;
+       }
+
+       /*
+        * there are either no reads, or writes have been starved
+        */
+
+       if (writes) {
+dispatch_writes:
+               BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
+
+               dd->starved = 0;
+
+               data_dir = WRITE;
+
+               goto dispatch_find_request;
+       }
+
+       return NULL;
+
+dispatch_find_request:
+       /*
+        * we are not running a batch, find best request for selected data_dir
+        */
+       if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
+               /*
+                * A deadline has expired, the last request was in the other
+                * direction, or we have run out of higher-sectored requests.
+                * Start again from the request with the earliest expiry time.
+                */
+               rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+       } else {
+               /*
+                * The last req was the same dir and we have a next request in
+                * sort order. No expired requests so continue on from here.
+                */
+               rq = dd->next_rq[data_dir];
+       }
+
+       dd->batching = 0;
+
+dispatch_request:
+       /*
+        * rq is the selected appropriate request.
+        */
+       dd->batching++;
+       deadline_move_request(dd, rq);
+done:
+       rq->rq_flags |= RQF_STARTED;
+       return rq;
+}
+
+static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+       struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+       struct request *rq;
+
+       spin_lock(&dd->lock);
+       rq = __dd_dispatch_request(hctx);
+       spin_unlock(&dd->lock);
+
+       return rq;
+}
+
+static void dd_exit_queue(struct elevator_queue *e)
+{
+       struct deadline_data *dd = e->elevator_data;
+
+       BUG_ON(!list_empty(&dd->fifo_list[READ]));
+       BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
+
+       kfree(dd);
+}
+
+/*
+ * initialize elevator private data (deadline_data).
+ */
+static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
+{
+       struct deadline_data *dd;
+       struct elevator_queue *eq;
+
+       eq = elevator_alloc(q, e);
+       if (!eq)
+               return -ENOMEM;
+
+       dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
+       if (!dd) {
+               kobject_put(&eq->kobj);
+               return -ENOMEM;
+       }
+       eq->elevator_data = dd;
+
+       INIT_LIST_HEAD(&dd->fifo_list[READ]);
+       INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
+       dd->sort_list[READ] = RB_ROOT;
+       dd->sort_list[WRITE] = RB_ROOT;
+       dd->fifo_expire[READ] = read_expire;
+       dd->fifo_expire[WRITE] = write_expire;
+       dd->writes_starved = writes_starved;
+       dd->front_merges = 1;
+       dd->fifo_batch = fifo_batch;
+       spin_lock_init(&dd->lock);
+       INIT_LIST_HEAD(&dd->dispatch);
+
+       q->elevator = eq;
+       return 0;
+}
+
+static int dd_request_merge(struct request_queue *q, struct request **rq,
+                           struct bio *bio)
+{
+       struct deadline_data *dd = q->elevator->elevator_data;
+       sector_t sector = bio_end_sector(bio);
+       struct request *__rq;
+
+       if (!dd->front_merges)
+               return ELEVATOR_NO_MERGE;
+
+       __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
+       if (__rq) {
+               BUG_ON(sector != blk_rq_pos(__rq));
+
+               if (elv_bio_merge_ok(__rq, bio)) {
+                       *rq = __rq;
+                       return ELEVATOR_FRONT_MERGE;
+               }
+       }
+
+       return ELEVATOR_NO_MERGE;
+}
+
+static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+{
+       struct request_queue *q = hctx->queue;
+       struct deadline_data *dd = q->elevator->elevator_data;
+       int ret;
+
+       spin_lock(&dd->lock);
+       ret = blk_mq_sched_try_merge(q, bio);
+       spin_unlock(&dd->lock);
+
+       return ret;
+}
+
+/*
+ * add rq to rbtree and fifo
+ */
+static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+                             bool at_head)
+{
+       struct request_queue *q = hctx->queue;
+       struct deadline_data *dd = q->elevator->elevator_data;
+       const int data_dir = rq_data_dir(rq);
+
+       if (blk_mq_sched_try_insert_merge(q, rq))
+               return;
+
+       blk_mq_sched_request_inserted(rq);
+
+       if (blk_mq_sched_bypass_insert(hctx, rq))
+               return;
+
+       if (at_head || rq->cmd_type != REQ_TYPE_FS) {
+               if (at_head)
+                       list_add(&rq->queuelist, &dd->dispatch);
+               else
+                       list_add_tail(&rq->queuelist, &dd->dispatch);
+       } else {
+               deadline_add_rq_rb(dd, rq);
+
+               if (rq_mergeable(rq)) {
+                       elv_rqhash_add(q, rq);
+                       if (!q->last_merge)
+                               q->last_merge = rq;
+               }
+
+               /*
+                * set expire time and add to fifo list
+                */
+               rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
+               list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
+       }
+}
+
+static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
+                              struct list_head *list, bool at_head)
+{
+       struct request_queue *q = hctx->queue;
+       struct deadline_data *dd = q->elevator->elevator_data;
+
+       spin_lock(&dd->lock);
+       while (!list_empty(list)) {
+               struct request *rq;
+
+               rq = list_first_entry(list, struct request, queuelist);
+               list_del_init(&rq->queuelist);
+               dd_insert_request(hctx, rq, at_head);
+       }
+       spin_unlock(&dd->lock);
+}
+
+static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
+{
+       struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+
+       return !list_empty_careful(&dd->dispatch) ||
+               !list_empty_careful(&dd->fifo_list[0]) ||
+               !list_empty_careful(&dd->fifo_list[1]);
+}
+
+/*
+ * sysfs parts below
+ */
+static ssize_t
+deadline_var_show(int var, char *page)
+{
+       return sprintf(page, "%d\n", var);
+}
+
+static ssize_t
+deadline_var_store(int *var, const char *page, size_t count)
+{
+       char *p = (char *) page;
+
+       *var = simple_strtol(p, &p, 10);
+       return count;
+}
+
+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                           \
+static ssize_t __FUNC(struct elevator_queue *e, char *page)            \
+{                                                                      \
+       struct deadline_data *dd = e->elevator_data;                    \
+       int __data = __VAR;                                             \
+       if (__CONV)                                                     \
+               __data = jiffies_to_msecs(__data);                      \
+       return deadline_var_show(__data, (page));                       \
+}
+SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
+SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
+SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
+SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
+SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
+#undef SHOW_FUNCTION
+
+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                        \
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)        \
+{                                                                      \
+       struct deadline_data *dd = e->elevator_data;                    \
+       int __data;                                                     \
+       int ret = deadline_var_store(&__data, (page), count);           \
+       if (__data < (MIN))                                             \
+               __data = (MIN);                                         \
+       else if (__data > (MAX))                                        \
+               __data = (MAX);                                         \
+       if (__CONV)                                                     \
+               *(__PTR) = msecs_to_jiffies(__data);                    \
+       else                                                            \
+               *(__PTR) = __data;                                      \
+       return ret;                                                     \
+}
+STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
+STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
+STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
+STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
+STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
+#undef STORE_FUNCTION
+
+#define DD_ATTR(name) \
+       __ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
+                                     deadline_##name##_store)
+
+static struct elv_fs_entry deadline_attrs[] = {
+       DD_ATTR(read_expire),
+       DD_ATTR(write_expire),
+       DD_ATTR(writes_starved),
+       DD_ATTR(front_merges),
+       DD_ATTR(fifo_batch),
+       __ATTR_NULL
+};
+
+static struct elevator_type mq_deadline = {
+       .ops.mq = {
+               .insert_requests        = dd_insert_requests,
+               .dispatch_request       = dd_dispatch_request,
+               .next_request           = elv_rb_latter_request,
+               .former_request         = elv_rb_former_request,
+               .bio_merge              = dd_bio_merge,
+               .request_merge          = dd_request_merge,
+               .requests_merged        = dd_merged_requests,
+               .request_merged         = dd_request_merged,
+               .has_work               = dd_has_work,
+               .init_sched             = dd_init_queue,
+               .exit_sched             = dd_exit_queue,
+       },
+
+       .uses_mq        = true,
+       .elevator_attrs = deadline_attrs,
+       .elevator_name = "mq-deadline",
+       .elevator_owner = THIS_MODULE,
+};
+
+static int __init deadline_init(void)
+{
+       return elv_register(&mq_deadline);
+}
+
+static void __exit deadline_exit(void)
+{
+       elv_unregister(&mq_deadline);
+}
+
+module_init(deadline_init);
+module_exit(deadline_exit);
+
+MODULE_AUTHOR("Jens Axboe");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MQ deadline IO scheduler");
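
The sysfs tunables at the bottom of mq-deadline.c are generated by the SHOW_FUNCTION/STORE_FUNCTION macros; expanded by hand for read_expire (with __CONV == 1, so the stored jiffies value is reported in milliseconds), the show side becomes roughly:

/* Hand expansion of SHOW_FUNCTION(deadline_read_expire_show,
 * dd->fifo_expire[READ], 1) above, for illustration only. */
static ssize_t deadline_read_expire_show(struct elevator_queue *e, char *page)
{
        struct deadline_data *dd = e->elevator_data;
        int __data = dd->fifo_expire[READ];

        __data = jiffies_to_msecs(__data);      /* __CONV was 1 */
        return deadline_var_show(__data, page);
}

A value written through the matching deadline_read_expire_store handler travels the reverse path, being clamped to [0, INT_MAX] and converted with msecs_to_jiffies() before landing in dd->fifo_expire[READ].
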
index a163c48..2d1b15d 100644 (file)
@@ -92,7 +92,7 @@ static void noop_exit_queue(struct elevator_queue *e)
 }
 
 static struct elevator_type elevator_noop = {
-       .ops = {
+       .ops.sq = {
                .elevator_merge_req_fn          = noop_merged_requests,
                .elevator_dispatch_fn           = noop_dispatch,
                .elevator_add_req_fn            = noop_add_request,
diff --git a/block/opal_proto.h b/block/opal_proto.h
new file mode 100644 (file)
index 0000000..f40c9ac
--- /dev/null
@@ -0,0 +1,452 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Authors:
+ *    Rafael Antognolli <rafael.antognolli@intel.com>
+ *    Scott  Bauer      <scott.bauer@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <linux/types.h>
+
+#ifndef _OPAL_PROTO_H
+#define _OPAL_PROTO_H
+
+/*
+ * These constant values come from:
+ * SPC-4 section
+ * 6.30 SECURITY PROTOCOL IN command / table 265.
+ */
+enum {
+       TCG_SECP_00 = 0,
+       TCG_SECP_01,
+};
+
+/*
+ * Token defs derived from:
+ * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
+ * 3.2.2 Data Stream Encoding
+ */
+enum opal_response_token {
+       OPAL_DTA_TOKENID_BYTESTRING = 0xe0,
+       OPAL_DTA_TOKENID_SINT = 0xe1,
+       OPAL_DTA_TOKENID_UINT = 0xe2,
+       OPAL_DTA_TOKENID_TOKEN = 0xe3, /* actual token is returned */
+       OPAL_DTA_TOKENID_INVALID = 0X0
+};
+
+#define DTAERROR_NO_METHOD_STATUS 0x89
+#define GENERIC_HOST_SESSION_NUM 0x41
+
+#define TPER_SYNC_SUPPORTED 0x01
+
+#define TINY_ATOM_DATA_MASK 0x3F
+#define TINY_ATOM_SIGNED 0x40
+
+#define SHORT_ATOM_ID 0x80
+#define SHORT_ATOM_BYTESTRING 0x20
+#define SHORT_ATOM_SIGNED 0x10
+#define SHORT_ATOM_LEN_MASK 0xF
+
+#define MEDIUM_ATOM_ID 0xC0
+#define MEDIUM_ATOM_BYTESTRING 0x10
+#define MEDIUM_ATOM_SIGNED 0x8
+#define MEDIUM_ATOM_LEN_MASK 0x7
+
+#define LONG_ATOM_ID 0xe0
+#define LONG_ATOM_BYTESTRING 0x2
+#define LONG_ATOM_SIGNED 0x1
+
+/* Derived from TCG Core spec 2.01 Section:
+ * 3.2.2.1
+ * Data Type
+ */
+#define TINY_ATOM_BYTE   0x7F
+#define SHORT_ATOM_BYTE  0xBF
+#define MEDIUM_ATOM_BYTE 0xDF
+#define LONG_ATOM_BYTE   0xE3
+
+#define OPAL_INVAL_PARAM 12
+#define OPAL_MANUFACTURED_INACTIVE 0x08
+#define OPAL_DISCOVERY_COMID 0x0001
+
+#define LOCKING_RANGE_NON_GLOBAL 0x03
+/*
+ * User IDs used in the TCG storage SSCs
+ * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
+ * Section: 6.3 Assigned UIDs
+ */
+#define OPAL_UID_LENGTH 8
+#define OPAL_METHOD_LENGTH 8
+#define OPAL_MSID_KEYLEN 15
+#define OPAL_UID_LENGTH_HALF 4
+
+/* Enum to index OPALUID array */
+enum opal_uid {
+       /* users */
+       OPAL_SMUID_UID,
+       OPAL_THISSP_UID,
+       OPAL_ADMINSP_UID,
+       OPAL_LOCKINGSP_UID,
+       OPAL_ENTERPRISE_LOCKINGSP_UID,
+       OPAL_ANYBODY_UID,
+       OPAL_SID_UID,
+       OPAL_ADMIN1_UID,
+       OPAL_USER1_UID,
+       OPAL_USER2_UID,
+       OPAL_PSID_UID,
+       OPAL_ENTERPRISE_BANDMASTER0_UID,
+       OPAL_ENTERPRISE_ERASEMASTER_UID,
+       /* tables */
+       OPAL_LOCKINGRANGE_GLOBAL,
+       OPAL_LOCKINGRANGE_ACE_RDLOCKED,
+       OPAL_LOCKINGRANGE_ACE_WRLOCKED,
+       OPAL_MBRCONTROL,
+       OPAL_MBR,
+       OPAL_AUTHORITY_TABLE,
+       OPAL_C_PIN_TABLE,
+       OPAL_LOCKING_INFO_TABLE,
+       OPAL_ENTERPRISE_LOCKING_INFO_TABLE,
+       /* C_PIN_TABLE object ID's */
+       OPAL_C_PIN_MSID,
+       OPAL_C_PIN_SID,
+       OPAL_C_PIN_ADMIN1,
+       /* half UID's (only first 4 bytes used) */
+       OPAL_HALF_UID_AUTHORITY_OBJ_REF,
+       OPAL_HALF_UID_BOOLEAN_ACE,
+       /* omitted optional parameter */
+       OPAL_UID_HEXFF,
+};
+
+#define OPAL_METHOD_LENGTH 8
+
+/* Enum for indexing the OPALMETHOD array */
+enum opal_method {
+       OPAL_PROPERTIES,
+       OPAL_STARTSESSION,
+       OPAL_REVERT,
+       OPAL_ACTIVATE,
+       OPAL_EGET,
+       OPAL_ESET,
+       OPAL_NEXT,
+       OPAL_EAUTHENTICATE,
+       OPAL_GETACL,
+       OPAL_GENKEY,
+       OPAL_REVERTSP,
+       OPAL_GET,
+       OPAL_SET,
+       OPAL_AUTHENTICATE,
+       OPAL_RANDOM,
+       OPAL_ERASE,
+};
+
+enum opal_token {
+       /* Boolean */
+       OPAL_TRUE = 0x01,
+       OPAL_FALSE = 0x00,
+       OPAL_BOOLEAN_EXPR = 0x03,
+       /* cellblocks */
+       OPAL_TABLE = 0x00,
+       OPAL_STARTROW = 0x01,
+       OPAL_ENDROW = 0x02,
+       OPAL_STARTCOLUMN = 0x03,
+       OPAL_ENDCOLUMN = 0x04,
+       OPAL_VALUES = 0x01,
+       /* authority table */
+       OPAL_PIN = 0x03,
+       /* locking tokens */
+       OPAL_RANGESTART = 0x03,
+       OPAL_RANGELENGTH = 0x04,
+       OPAL_READLOCKENABLED = 0x05,
+       OPAL_WRITELOCKENABLED = 0x06,
+       OPAL_READLOCKED = 0x07,
+       OPAL_WRITELOCKED = 0x08,
+       OPAL_ACTIVEKEY = 0x0A,
+       /* locking info table */
+       OPAL_MAXRANGES = 0x04,
+       /* mbr control */
+       OPAL_MBRENABLE = 0x01,
+       OPAL_MBRDONE = 0x02,
+       /* properties */
+       OPAL_HOSTPROPERTIES = 0x00,
+       /* atoms */
+       OPAL_STARTLIST = 0xf0,
+       OPAL_ENDLIST = 0xf1,
+       OPAL_STARTNAME = 0xf2,
+       OPAL_ENDNAME = 0xf3,
+       OPAL_CALL = 0xf8,
+       OPAL_ENDOFDATA = 0xf9,
+       OPAL_ENDOFSESSION = 0xfa,
+       OPAL_STARTTRANSACTON = 0xfb,
+       OPAL_ENDTRANSACTON = 0xfC,
+       OPAL_EMPTYATOM = 0xff,
+       OPAL_WHERE = 0x00,
+};
+
+/* Locking state for a locking range */
+enum opal_lockingstate {
+       OPAL_LOCKING_READWRITE = 0x01,
+       OPAL_LOCKING_READONLY = 0x02,
+       OPAL_LOCKING_LOCKED = 0x03,
+};
+
+/* Packets derived from:
+ * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
+ * Section: 3.2.3 ComPackets, Packets & Subpackets
+ */
+
+/* Comm Packet (header) for transmissions. */
+struct opal_compacket {
+       __be32 reserved0;
+       u8 extendedComID[4];
+       __be32 outstandingData;
+       __be32 minTransfer;
+       __be32 length;
+};
+
+/* Packet structure. */
+struct opal_packet {
+       __be32 tsn;
+       __be32 hsn;
+       __be32 seq_number;
+       __be16 reserved0;
+       __be16 ack_type;
+       __be32 acknowledgment;
+       __be32 length;
+};
+
+/* Data sub packet header */
+struct opal_data_subpacket {
+       u8 reserved0[6];
+       __be16 kind;
+       __be32 length;
+};
+
+/* header of a response */
+struct opal_header {
+       struct opal_compacket cp;
+       struct opal_packet pkt;
+       struct opal_data_subpacket subpkt;
+};
+
+#define FC_TPER       0x0001
+#define FC_LOCKING    0x0002
+#define FC_GEOMETRY   0x0003
+#define FC_ENTERPRISE 0x0100
+#define FC_DATASTORE  0x0202
+#define FC_SINGLEUSER 0x0201
+#define FC_OPALV100   0x0200
+#define FC_OPALV200   0x0203
+
+/*
+ * The Discovery 0 Header. As defined in
+ * Opal SSC Documentation
+ * Section: 3.3.5 Capability Discovery
+ */
+struct d0_header {
+       __be32 length; /* length of the header, 48 in 2.00.100 */
+       __be32 revision; /* revision of the header, 1 in 2.00.100 */
+       __be32 reserved01;
+       __be32 reserved02;
+       /*
+        * the remainder of the structure is vendor specific and will not be
+        * addressed now
+        */
+       u8 ignored[32];
+};
+
+/*
+ * TPer Feature Descriptor. Contains flags indicating support for the
+ * TPer features described in the OPAL specification. The names match the
+ * OPAL terminology
+ *
+ * code == 0x001 in 2.00.100
+ */
+struct d0_tper_features {
+       /*
+        * supported_features bits:
+        * bit 7: reserved
+        * bit 6: com ID management
+        * bit 5: reserved
+        * bit 4: streaming support
+        * bit 3: buffer management
+        * bit 2: ACK/NACK
+        * bit 1: async
+        * bit 0: sync
+        */
+       u8 supported_features;
+       /*
+        * bytes 5 through 15 are reserved, but we represent the first 3 as
+        * u8 to keep the other two 32-bit integers aligned.
+        */
+       u8 reserved01[3];
+       __be32 reserved02;
+       __be32 reserved03;
+};
+
+/*
+ * Locking Feature Descriptor. Contains flags indicating support for the
+ * locking features described in the OPAL specification. The names match the
+ * OPAL terminology
+ *
+ * code == 0x0002 in 2.00.100
+ */
+struct d0_locking_features {
+       /*
+        * supported_features bits:
+        * bits 6-7: reserved
+        * bit 5: MBR done
+        * bit 4: MBR enabled
+        * bit 3: media encryption
+        * bit 2: locked
+        * bit 1: locking enabled
+        * bit 0: locking supported
+        */
+       u8 supported_features;
+       /*
+        * bytes 5 through 15 are reserved, but we represent the first 3 as
+        * u8 to keep the other two 32-bit integers aligned.
+        */
+       u8 reserved01[3];
+       __be32 reserved02;
+       __be32 reserved03;
+};
+
+/*
+ * Geometry Feature Descriptor. Contains flags indicating support for the
+ * geometry features described in the OPAL specification. The names match the
+ * OPAL terminology
+ *
+ * code == 0x0003 in 2.00.100
+ */
+struct d0_geometry_features {
+       /*
+        * skip 32 bits from header, needed to align the struct to 64 bits.
+        */
+       u8 header[4];
+       /*
+        * reserved01:
+        * bits 1-6: reserved
+        * bit 0: align
+        */
+       u8 reserved01;
+       u8 reserved02[7];
+       __be32 logical_block_size;
+       __be64 alignment_granularity;
+       __be64 lowest_aligned_lba;
+};
+
+/*
+ * Enterprise SSC Feature
+ *
+ * code == 0x0100
+ */
+struct d0_enterprise_ssc {
+       __be16 baseComID;
+       __be16 numComIDs;
+       /* range_crossing:
+        * bits 1-6: reserved
+        * bit 0: range crossing
+        */
+       u8 range_crossing;
+       u8 reserved01;
+       __be16 reserved02;
+       __be32 reserved03;
+       __be32 reserved04;
+};
+
+/*
+ * Opal V1 feature
+ *
+ * code == 0x0200
+ */
+struct d0_opal_v100 {
+       __be16 baseComID;
+       __be16 numComIDs;
+};
+
+/*
+ * Single User Mode feature
+ *
+ * code == 0x0201
+ */
+struct d0_single_user_mode {
+       __be32 num_locking_objects;
+       /* reserved01:
+        * bit 0: any
+        * bit 1: all
+        * bit 2: policy
+        * bits 3-7: reserved
+        */
+       u8 reserved01;
+       u8 reserved02;
+       __be16 reserved03;
+       __be32 reserved04;
+};
+
+/*
+ * Additional Datastores feature
+ *
+ * code == 0x0202
+ */
+struct d0_datastore_table {
+       __be16 reserved01;
+       __be16 max_tables;
+       __be32 max_size_tables;
+       __be32 table_size_alignment;
+};
+
+/*
+ * OPAL 2.0 feature
+ *
+ * code == 0x0203
+ */
+struct d0_opal_v200 {
+       __be16 baseComID;
+       __be16 numComIDs;
+       /* range_crossing:
+        * bits 1-6: reserved
+        * bit 0: range crossing
+        */
+       u8 range_crossing;
+       /* num_locking_admin_auth:
+        * not aligned to 16 bits, so use two u8.
+        * stored in big endian:
+        * 0: MSB
+        * 1: LSB
+        */
+       u8 num_locking_admin_auth[2];
+       /* num_locking_user_auth:
+        * not aligned to 16 bits, so use two u8.
+        * stored in big endian:
+        * 0: MSB
+        * 1: LSB
+        */
+       u8 num_locking_user_auth[2];
+       u8 initialPIN;
+       u8 revertedPIN;
+       u8 reserved01;
+       __be32 reserved02;
+};
+
+/* Union of features used to parse the discovery 0 response */
+struct d0_features {
+       __be16 code;
+       /*
+        * r_version bits:
+        * bits 4-7: version
+        * bits 0-3: reserved
+        */
+       u8 r_version;
+       u8 length;
+       u8 features[];
+};
+
+#endif /* _OPAL_PROTO_H */
index bcd86e5..39f70d9 100644 (file)
@@ -293,7 +293,7 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
        if (!gpt)
                return NULL;
 
-       count = le32_to_cpu(gpt->num_partition_entries) *
+       count = (size_t)le32_to_cpu(gpt->num_partition_entries) *
                 le32_to_cpu(gpt->sizeof_partition_entry);
        if (!count)
                return NULL;
@@ -352,7 +352,7 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
                        gpt_header **gpt, gpt_entry **ptes)
 {
        u32 crc, origcrc;
-       u64 lastlba;
+       u64 lastlba, pt_size;
 
        if (!ptes)
                return 0;
@@ -434,13 +434,20 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
                goto fail;
        }
 
+       /* Sanity check partition table size */
+       pt_size = (u64)le32_to_cpu((*gpt)->num_partition_entries) *
+               le32_to_cpu((*gpt)->sizeof_partition_entry);
+       if (pt_size > KMALLOC_MAX_SIZE) {
+               pr_debug("GUID Partition Table is too large: %llu > %lu bytes\n",
+                        (unsigned long long)pt_size, KMALLOC_MAX_SIZE);
+               goto fail;
+       }
+
        if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
                goto fail;
 
        /* Check the GUID Partition Entry Array CRC */
-       crc = efi_crc32((const unsigned char *) (*ptes),
-                       le32_to_cpu((*gpt)->num_partition_entries) *
-                       le32_to_cpu((*gpt)->sizeof_partition_entry));
+       crc = efi_crc32((const unsigned char *) (*ptes), pt_size);
 
        if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) {
                pr_debug("GUID Partition Entry Array CRC check failed.\n");
diff --git a/block/sed-opal.c b/block/sed-opal.c
new file mode 100644 (file)
index 0000000..d1c52ba
--- /dev/null
@@ -0,0 +1,2488 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Authors:
+ *    Scott  Bauer      <scott.bauer@intel.com>
+ *    Rafael Antognolli <rafael.antognolli@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":OPAL: " fmt
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/genhd.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <uapi/linux/sed-opal.h>
+#include <linux/sed-opal.h>
+#include <linux/string.h>
+#include <linux/kdev_t.h>
+
+#include "opal_proto.h"
+
+#define IO_BUFFER_LENGTH 2048
+#define MAX_TOKS 64
+
+typedef int (*opal_step)(struct opal_dev *dev);
+
+enum opal_atom_width {
+       OPAL_WIDTH_TINY,
+       OPAL_WIDTH_SHORT,
+       OPAL_WIDTH_MEDIUM,
+       OPAL_WIDTH_LONG,
+       OPAL_WIDTH_TOKEN
+};
+
+/*
+ * The parsed response does not copy the tokens out of the response
+ * buffer again. Instead, for each token we just store a pointer to the
+ * position in the buffer where the token starts, and the size of the
+ * token in bytes.
+ */
+struct opal_resp_tok {
+       const u8 *pos;
+       size_t len;
+       enum opal_response_token type;
+       enum opal_atom_width width;
+       union {
+               u64 u;
+               s64 s;
+       } stored;
+};
+
+/*
+ * From the response header it's not possible to know how many tokens there are
+ * in the payload. So we hardcode that the maximum will be MAX_TOKS, and later
+ * if we start dealing with messages that have more than that, we can increase
+ * this number. This is done to avoid having to make two passes through the
+ * response, the first one counting how many tokens we have and the second one
+ * actually storing the positions.
+ */
+struct parsed_resp {
+       int num;
+       struct opal_resp_tok toks[MAX_TOKS];
+};
+
+struct opal_dev {
+       bool supported;
+
+       void *data;
+       sec_send_recv *send_recv;
+
+       const opal_step *funcs;
+       void **func_data;
+       int state;
+       struct mutex dev_lock;
+       u16 comid;
+       u32 hsn;
+       u32 tsn;
+       u64 align;
+       u64 lowest_lba;
+
+       size_t pos;
+       u8 cmd[IO_BUFFER_LENGTH];
+       u8 resp[IO_BUFFER_LENGTH];
+
+       struct parsed_resp parsed;
+       size_t prev_d_len;
+       void *prev_data;
+
+       struct list_head unlk_lst;
+};
+
+
+static const u8 opaluid[][OPAL_UID_LENGTH] = {
+       /* users */
+       [OPAL_SMUID_UID] =
+               { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff },
+       [OPAL_THISSP_UID] =
+               { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 },
+       [OPAL_ADMINSP_UID] =
+               { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x01 },
+       [OPAL_LOCKINGSP_UID] =
+               { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x02 },
+       [OPAL_ENTERPRISE_LOCKINGSP_UID] =
+               { 0x00, 0x00, 0x02, 0x05, 0x00, 0x01, 0x00, 0x01 },
+       [OPAL_ANYBODY_UID] =
+               { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x01 },
+       [OPAL_SID_UID] =
+               { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x06 },
+       [OPAL_ADMIN1_UID] =
+               { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0x00, 0x01 },
+       [OPAL_USER1_UID] =
+               { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x01 },
+       [OPAL_USER2_UID] =
+               { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x02 },
+       [OPAL_PSID_UID] =
+               { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0xff, 0x01 },
+       [OPAL_ENTERPRISE_BANDMASTER0_UID] =
+               { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x80, 0x01 },
+       [OPAL_ENTERPRISE_ERASEMASTER_UID] =
+               { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x84, 0x01 },
+
+       /* tables */
+
+       [OPAL_LOCKINGRANGE_GLOBAL] =
+               { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 },
+       [OPAL_LOCKINGRANGE_ACE_RDLOCKED] =
+               { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE0, 0x01 },
+       [OPAL_LOCKINGRANGE_ACE_WRLOCKED] =
+               { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE8, 0x01 },
+       [OPAL_MBRCONTROL] =
+               { 0x00, 0x00, 0x08, 0x03, 0x00, 0x00, 0x00, 0x01 },
+       [OPAL_MBR] =
+               { 0x00, 0x00, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00 },
+       [OPAL_AUTHORITY_TABLE] =
+               { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x00},
+       [OPAL_C_PIN_TABLE] =
+               { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x00},
+       [OPAL_LOCKING_INFO_TABLE] =
+               { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x01 },
+       [OPAL_ENTERPRISE_LOCKING_INFO_TABLE] =
+               { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 },
+
+       /* C_PIN_TABLE object ID's */
+
+       [OPAL_C_PIN_MSID] =
+               { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x84, 0x02},
+       [OPAL_C_PIN_SID] =
+               { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01},
+       [OPAL_C_PIN_ADMIN1] =
+               { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x01, 0x00, 0x01},
+
+       /* half UID's (only first 4 bytes used) */
+
+       [OPAL_HALF_UID_AUTHORITY_OBJ_REF] =
+               { 0x00, 0x00, 0x0C, 0x05, 0xff, 0xff, 0xff, 0xff },
+       [OPAL_HALF_UID_BOOLEAN_ACE] =
+               { 0x00, 0x00, 0x04, 0x0E, 0xff, 0xff, 0xff, 0xff },
+
+       /* special value for omitted optional parameter */
+       [OPAL_UID_HEXFF] =
+               { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+};
+
+/*
+ * TCG Storage SSC Methods.
+ * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
+ * Section: 6.3 Assigned UIDs
+ */
+static const u8 opalmethod[][OPAL_UID_LENGTH] = {
+       [OPAL_PROPERTIES] =
+               { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x01 },
+       [OPAL_STARTSESSION] =
+               { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x02 },
+       [OPAL_REVERT] =
+               { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x02 },
+       [OPAL_ACTIVATE] =
+               { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x03 },
+       [OPAL_EGET] =
+               { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06 },
+       [OPAL_ESET] =
+               { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07 },
+       [OPAL_NEXT] =
+               { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x08 },
+       [OPAL_EAUTHENTICATE] =
+               { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0c },
+       [OPAL_GETACL] =
+               { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0d },
+       [OPAL_GENKEY] =
+               { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10 },
+       [OPAL_REVERTSP] =
+               { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x11 },
+       [OPAL_GET] =
+               { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16 },
+       [OPAL_SET] =
+               { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17 },
+       [OPAL_AUTHENTICATE] =
+               { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1c },
+       [OPAL_RANDOM] =
+               { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x06, 0x01 },
+       [OPAL_ERASE] =
+               { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 },
+};
+
+typedef int (cont_fn)(struct opal_dev *dev);
+
+static int end_opal_session_error(struct opal_dev *dev);
+
+struct opal_suspend_data {
+       struct opal_lock_unlock unlk;
+       u8 lr;
+       struct list_head node;
+};
+
+/*
+ * Derived from:
+ * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
+ * Section: 5.1.5 Method Status Codes
+ */
+static const char * const opal_errors[] = {
+       "Success",
+       "Not Authorized",
+       "Unknown Error",
+       "SP Busy",
+       "SP Failed",
+       "SP Disabled",
+       "SP Frozen",
+       "No Sessions Available",
+       "Uniqueness Conflict",
+       "Insufficient Space",
+       "Insufficient Rows",
+       "Invalid Function",
+       "Invalid Parameter",
+       "Invalid Reference",
+       "Unknown Error",
+       "TPER Malfunction",
+       "Transaction Failure",
+       "Response Overflow",
+       "Authority Locked Out",
+};
+
+static const char *opal_error_to_human(int error)
+{
+       if (error == 0x3f)
+               return "Failed";
+
+       if (error >= ARRAY_SIZE(opal_errors) || error < 0)
+               return "Unknown Error";
+
+       return opal_errors[error];
+}
+
+static void print_buffer(const u8 *ptr, u32 length)
+{
+#ifdef DEBUG
+       print_hex_dump_bytes("OPAL: ", DUMP_PREFIX_OFFSET, ptr, length);
+       pr_debug("\n");
+#endif
+}
+
+static bool check_tper(const void *data)
+{
+       const struct d0_tper_features *tper = data;
+       u8 flags = tper->supported_features;
+
+       if (!(flags & TPER_SYNC_SUPPORTED)) {
+               pr_err("TPer sync not supported. flags = %d\n",
+                      tper->supported_features);
+               return false;
+       }
+
+       return true;
+}
+
+static bool check_sum(const void *data)
+{
+       const struct d0_single_user_mode *sum = data;
+       u32 nlo = be32_to_cpu(sum->num_locking_objects);
+
+       if (nlo == 0) {
+               pr_err("Need at least one locking object.\n");
+               return false;
+       }
+
+       pr_debug("Number of locking objects: %d\n", nlo);
+
+       return true;
+}
+
+static u16 get_comid_v100(const void *data)
+{
+       const struct d0_opal_v100 *v100 = data;
+
+       return be16_to_cpu(v100->baseComID);
+}
+
+static u16 get_comid_v200(const void *data)
+{
+       const struct d0_opal_v200 *v200 = data;
+
+       return be16_to_cpu(v200->baseComID);
+}
+
+static int opal_send_cmd(struct opal_dev *dev)
+{
+       return dev->send_recv(dev->data, dev->comid, TCG_SECP_01,
+                             dev->cmd, IO_BUFFER_LENGTH,
+                             true);
+}
+
+static int opal_recv_cmd(struct opal_dev *dev)
+{
+       return dev->send_recv(dev->data, dev->comid, TCG_SECP_01,
+                             dev->resp, IO_BUFFER_LENGTH,
+                             false);
+}
+
+static int opal_recv_check(struct opal_dev *dev)
+{
+       size_t buflen = IO_BUFFER_LENGTH;
+       void *buffer = dev->resp;
+       struct opal_header *hdr = buffer;
+       int ret;
+
+       do {
+               pr_debug("Sent OPAL command: outstanding=%d, minTransfer=%d\n",
+                        hdr->cp.outstandingData,
+                        hdr->cp.minTransfer);
+
+               if (hdr->cp.outstandingData == 0 ||
+                   hdr->cp.minTransfer != 0)
+                       return 0;
+
+               memset(buffer, 0, buflen);
+               ret = opal_recv_cmd(dev);
+       } while (!ret);
+
+       return ret;
+}
+
+static int opal_send_recv(struct opal_dev *dev, cont_fn *cont)
+{
+       int ret;
+
+       ret = opal_send_cmd(dev);
+       if (ret)
+               return ret;
+       ret = opal_recv_cmd(dev);
+       if (ret)
+               return ret;
+       ret = opal_recv_check(dev);
+       if (ret)
+               return ret;
+       return cont(dev);
+}
+
+static void check_geometry(struct opal_dev *dev, const void *data)
+{
+       const struct d0_geometry_features *geo = data;
+
+       dev->align = geo->alignment_granularity;
+       dev->lowest_lba = geo->lowest_aligned_lba;
+}
+
+static int next(struct opal_dev *dev)
+{
+       opal_step func;
+       int error = 0;
+
+       do {
+               func = dev->funcs[dev->state];
+               if (!func)
+                       break;
+
+               error = func(dev);
+               if (error) {
+                       pr_err("Error on step function: %d with error %d: %s\n",
+                              dev->state, error,
+                              opal_error_to_human(error));
+
+                       /* For each OPAL command we do a discovery0 then we
+                        * start some sort of session.
+                        * If we haven't passed state 1 then there was an error
+                        * on discovery0 or during the attempt to start a
+                        * session. Therefore we shouldn't attempt to terminate
+                        * a session, as one has not yet been created.
+                        */
+                       if (dev->state > 1)
+                               return end_opal_session_error(dev);
+               }
+               dev->state++;
+       } while (!error);
+
+       return error;
+}
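next() walks a NULL-terminated table of opal_step callbacks and, once a session has been started (state > 1), tears the session down when a step fails. A hypothetical table built only from steps visible in this patch (the real per-ioctl tables are defined further down in sed-opal.c and also include a session-ending step):

static const opal_step example_revert_steps[] = {
	opal_discovery0,		/* probe the device, pick a comid */
	start_SIDASP_opal_session,	/* open an Admin SP session as SID */
	revert_tper,			/* issue Revert on the Admin SP */
	NULL				/* next() stops at the terminator */
};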
+
+static int opal_discovery0_end(struct opal_dev *dev)
+{
+       bool found_com_id = false, supported = true, single_user = false;
+       const struct d0_header *hdr = (struct d0_header *)dev->resp;
+       const u8 *epos = dev->resp, *cpos = dev->resp;
+       u16 comid = 0;
+
+       print_buffer(dev->resp, be32_to_cpu(hdr->length));
+
+       epos += be32_to_cpu(hdr->length); /* end of buffer */
+       cpos += sizeof(*hdr); /* current position on buffer */
+
+       while (cpos < epos && supported) {
+               const struct d0_features *body =
+                       (const struct d0_features *)cpos;
+
+               switch (be16_to_cpu(body->code)) {
+               case FC_TPER:
+                       supported = check_tper(body->features);
+                       break;
+               case FC_SINGLEUSER:
+                       single_user = check_sum(body->features);
+                       break;
+               case FC_GEOMETRY:
+                       check_geometry(dev, body);
+                       break;
+               case FC_LOCKING:
+               case FC_ENTERPRISE:
+               case FC_DATASTORE:
+                       /* some ignored properties */
+                       pr_debug("Found OPAL feature description: %d\n",
+                                be16_to_cpu(body->code));
+                       break;
+               case FC_OPALV100:
+                       comid = get_comid_v100(body->features);
+                       found_com_id = true;
+                       break;
+               case FC_OPALV200:
+                       comid = get_comid_v200(body->features);
+                       found_com_id = true;
+                       break;
+               case 0xbfff ... 0xffff:
+                       /* vendor specific, just ignore */
+                       break;
+               default:
+                       pr_debug("OPAL Unknown feature: %d\n",
+                                be16_to_cpu(body->code));
+
+               }
+               cpos += body->length + 4;
+       }
+
+       if (!supported) {
+               pr_debug("This device is not Opal enabled. Not Supported!\n");
+               return -EOPNOTSUPP;
+       }
+
+       if (!single_user)
+               pr_debug("Device doesn't support single user mode\n");
+
+
+       if (!found_com_id) {
+               pr_debug("Could not find OPAL comid for device. Returning early\n");
+               return -EOPNOTSUPP;
+       }
+
+       dev->comid = comid;
+
+       return 0;
+}
+
+static int opal_discovery0(struct opal_dev *dev)
+{
+       int ret;
+
+       memset(dev->resp, 0, IO_BUFFER_LENGTH);
+       dev->comid = OPAL_DISCOVERY_COMID;
+       ret = opal_recv_cmd(dev);
+       if (ret)
+               return ret;
+       return opal_discovery0_end(dev);
+}
+
+static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok)
+{
+       if (*err)
+               return;
+       if (cmd->pos >= IO_BUFFER_LENGTH - 1) {
+               pr_err("Error adding u8: end of buffer.\n");
+               *err = -ERANGE;
+               return;
+       }
+       cmd->cmd[cmd->pos++] = tok;
+}
+
+static void add_short_atom_header(struct opal_dev *cmd, bool bytestring,
+                                 bool has_sign, int len)
+{
+       u8 atom;
+       int err = 0;
+
+       atom = SHORT_ATOM_ID;
+       atom |= bytestring ? SHORT_ATOM_BYTESTRING : 0;
+       atom |= has_sign ? SHORT_ATOM_SIGNED : 0;
+       atom |= len & SHORT_ATOM_LEN_MASK;
+
+       add_token_u8(&err, cmd, atom);
+}
+
+static void add_medium_atom_header(struct opal_dev *cmd, bool bytestring,
+                                  bool has_sign, int len)
+{
+       u8 header0;
+
+       header0 = MEDIUM_ATOM_ID;
+       header0 |= bytestring ? MEDIUM_ATOM_BYTESTRING : 0;
+       header0 |= has_sign ? MEDIUM_ATOM_SIGNED : 0;
+       header0 |= (len >> 8) & MEDIUM_ATOM_LEN_MASK;
+       cmd->cmd[cmd->pos++] = header0;
+       cmd->cmd[cmd->pos++] = len;
+}
+
+static void add_token_u64(int *err, struct opal_dev *cmd, u64 number)
+{
+
+       size_t len;
+       int msb;
+       u8 n;
+
+       if (!(number & ~TINY_ATOM_DATA_MASK)) {
+               add_token_u8(err, cmd, number);
+               return;
+       }
+
+       msb = fls(number);
+       len = DIV_ROUND_UP(msb, 4);
+
+       if (cmd->pos >= IO_BUFFER_LENGTH - len - 1) {
+               pr_err("Error adding u64: end of buffer.\n");
+               *err = -ERANGE;
+               return;
+       }
+       add_short_atom_header(cmd, false, false, len);
+       while (len--) {
+               n = number >> (len * 8);
+               add_token_u8(err, cmd, n);
+       }
+}
+
+static void add_token_bytestring(int *err, struct opal_dev *cmd,
+                                const u8 *bytestring, size_t len)
+{
+       size_t header_len = 1;
+       bool is_short_atom = true;
+
+       if (*err)
+               return;
+
+       if (len & ~SHORT_ATOM_LEN_MASK) {
+               header_len = 2;
+               is_short_atom = false;
+       }
+
+       if (len >= IO_BUFFER_LENGTH - cmd->pos - header_len) {
+               pr_err("Error adding bytestring: end of buffer.\n");
+               *err = -ERANGE;
+               return;
+       }
+
+       if (is_short_atom)
+               add_short_atom_header(cmd, true, false, len);
+       else
+               add_medium_atom_header(cmd, true, false, len);
+
+       memcpy(&cmd->cmd[cmd->pos], bytestring, len);
+       cmd->pos += len;
+
+}
+
+static int build_locking_range(u8 *buffer, size_t length, u8 lr)
+{
+       if (length > OPAL_UID_LENGTH) {
+               pr_err("Can't build locking range. Length OOB\n");
+               return -ERANGE;
+       }
+
+       memcpy(buffer, opaluid[OPAL_LOCKINGRANGE_GLOBAL], OPAL_UID_LENGTH);
+
+       if (lr == 0)
+               return 0;
+       buffer[5] = LOCKING_RANGE_NON_GLOBAL;
+       buffer[7] = lr;
+
+       return 0;
+}
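A worked example of the UID this builds, derived from the opaluid table above:

/* build_locking_range(uid, sizeof(uid), 2) starts from
 * OPAL_LOCKINGRANGE_GLOBAL = 00 00 08 02 00 00 00 01, then patches
 * byte 5 to LOCKING_RANGE_NON_GLOBAL and byte 7 to the range number:
 *
 *	00 00 08 02 00 03 00 02	(locking range 2)
 */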
+
+static int build_locking_user(u8 *buffer, size_t length, u8 lr)
+{
+       if (length > OPAL_UID_LENGTH) {
+               pr_err("Can't build locking range user, Length OOB\n");
+               return -ERANGE;
+       }
+
+       memcpy(buffer, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH);
+
+       buffer[7] = lr + 1;
+
+       return 0;
+}
+
+static void set_comid(struct opal_dev *cmd, u16 comid)
+{
+       struct opal_header *hdr = (struct opal_header *)cmd->cmd;
+
+       hdr->cp.extendedComID[0] = comid >> 8;
+       hdr->cp.extendedComID[1] = comid;
+       hdr->cp.extendedComID[2] = 0;
+       hdr->cp.extendedComID[3] = 0;
+}
+
+static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
+{
+       struct opal_header *hdr;
+       int err = 0;
+
+       add_token_u8(&err, cmd, OPAL_ENDOFDATA);
+       add_token_u8(&err, cmd, OPAL_STARTLIST);
+       add_token_u8(&err, cmd, 0); /* method status: success */
+       add_token_u8(&err, cmd, 0); /* reserved */
+       add_token_u8(&err, cmd, 0); /* reserved */
+       add_token_u8(&err, cmd, OPAL_ENDLIST);
+
+       if (err) {
+               pr_err("Error finalizing command.\n");
+               return -EFAULT;
+       }
+
+       hdr = (struct opal_header *) cmd->cmd;
+
+       hdr->pkt.tsn = cpu_to_be32(tsn);
+       hdr->pkt.hsn = cpu_to_be32(hsn);
+
+       hdr->subpkt.length = cpu_to_be32(cmd->pos - sizeof(*hdr));
+       while (cmd->pos % 4) {
+               if (cmd->pos >= IO_BUFFER_LENGTH) {
+                       pr_err("Error: Buffer overrun\n");
+                       return -ERANGE;
+               }
+               cmd->cmd[cmd->pos++] = 0;
+       }
+       hdr->pkt.length = cpu_to_be32(cmd->pos - sizeof(hdr->cp) -
+                                     sizeof(hdr->pkt));
+       hdr->cp.length = cpu_to_be32(cmd->pos - sizeof(hdr->cp));
+
+       return 0;
+}
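For reference, the three length fields nest as follows, assuming no structure padding (every member of the three header structs is naturally aligned, so sizeof(struct opal_header) is 20 + 24 + 12 = 56 bytes). With a hypothetical cmd->pos of 100, already 4-byte aligned so no pad bytes are added:

/*	subpkt.length = 100 - 56      = 44	(payload only)
 *	pkt.length    = 100 - 20 - 24 = 56	(subpacket header + payload)
 *	cp.length     = 100 - 20      = 80	(packet + subpacket + payload)
 */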
+
+static enum opal_response_token token_type(const struct parsed_resp *resp,
+                                          int n)
+{
+       const struct opal_resp_tok *tok;
+
+       if (n >= resp->num) {
+               pr_err("Token number doesn't exist: %d, resp: %d\n",
+                      n, resp->num);
+               return OPAL_DTA_TOKENID_INVALID;
+       }
+
+       tok = &resp->toks[n];
+       if (tok->len == 0) {
+               pr_err("Token length must be non-zero\n");
+               return OPAL_DTA_TOKENID_INVALID;
+       }
+
+       return tok->type;
+}
+
+/*
+ * This function returns 0 in case of invalid token. One should call
+ * token_type() first to find out if the token is valid or not.
+ */
+static enum opal_token response_get_token(const struct parsed_resp *resp,
+                                         int n)
+{
+       const struct opal_resp_tok *tok;
+
+       if (n >= resp->num) {
+               pr_err("Token number doesn't exist: %d, resp: %d\n",
+                      n, resp->num);
+               return 0;
+       }
+
+       tok = &resp->toks[n];
+       if (tok->len == 0) {
+               pr_err("Token length must be non-zero\n");
+               return 0;
+       }
+
+       return tok->pos[0];
+}
+
+static size_t response_parse_tiny(struct opal_resp_tok *tok,
+                                 const u8 *pos)
+{
+       tok->pos = pos;
+       tok->len = 1;
+       tok->width = OPAL_WIDTH_TINY;
+
+       if (pos[0] & TINY_ATOM_SIGNED) {
+               tok->type = OPAL_DTA_TOKENID_SINT;
+       } else {
+               tok->type = OPAL_DTA_TOKENID_UINT;
+               tok->stored.u = pos[0] & 0x3f;
+       }
+
+       return tok->len;
+}
+
+static size_t response_parse_short(struct opal_resp_tok *tok,
+                                  const u8 *pos)
+{
+       tok->pos = pos;
+       tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1;
+       tok->width = OPAL_WIDTH_SHORT;
+
+       if (pos[0] & SHORT_ATOM_BYTESTRING) {
+               tok->type = OPAL_DTA_TOKENID_BYTESTRING;
+       } else if (pos[0] & SHORT_ATOM_SIGNED) {
+               tok->type = OPAL_DTA_TOKENID_SINT;
+       } else {
+               u64 u_integer = 0;
+               int i, b = 0;
+
+               tok->type = OPAL_DTA_TOKENID_UINT;
+               if (tok->len > 9) {
+                       pr_warn("uint64 with more than 8 bytes\n");
+                       return -EINVAL;
+               }
+               for (i = tok->len - 1; i > 0; i--) {
+                       u_integer |= ((u64)pos[i] << (8 * b));
+                       b++;
+               }
+               tok->stored.u = u_integer;
+       }
+
+       return tok->len;
+}
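A worked example, following the masks above:

/* The response bytes 0x82 0x10 0x01 form a short atom: 0x82 is
 * SHORT_ATOM_ID with neither the bytestring nor the signed bit set and
 * a 2-byte payload, so tok->len = 3 and the loop assembles the
 * big-endian payload into tok->stored.u = 0x1001.
 */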
+
+static size_t response_parse_medium(struct opal_resp_tok *tok,
+                                   const u8 *pos)
+{
+       tok->pos = pos;
+       tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2;
+       tok->width = OPAL_WIDTH_MEDIUM;
+
+       if (pos[0] & MEDIUM_ATOM_BYTESTRING)
+               tok->type = OPAL_DTA_TOKENID_BYTESTRING;
+       else if (pos[0] & MEDIUM_ATOM_SIGNED)
+               tok->type = OPAL_DTA_TOKENID_SINT;
+       else
+               tok->type = OPAL_DTA_TOKENID_UINT;
+
+       return tok->len;
+}
+
+static size_t response_parse_long(struct opal_resp_tok *tok,
+                                 const u8 *pos)
+{
+       tok->pos = pos;
+       tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4;
+       tok->width = OPAL_WIDTH_LONG;
+
+       if (pos[0] & LONG_ATOM_BYTESTRING)
+               tok->type = OPAL_DTA_TOKENID_BYTESTRING;
+       else if (pos[0] & LONG_ATOM_SIGNED)
+               tok->type = OPAL_DTA_TOKENID_SINT;
+       else
+               tok->type = OPAL_DTA_TOKENID_UINT;
+
+       return tok->len;
+}
+
+static size_t response_parse_token(struct opal_resp_tok *tok,
+                                  const u8 *pos)
+{
+       tok->pos = pos;
+       tok->len = 1;
+       tok->type = OPAL_DTA_TOKENID_TOKEN;
+       tok->width = OPAL_WIDTH_TOKEN;
+
+       return tok->len;
+}
+
+static int response_parse(const u8 *buf, size_t length,
+                         struct parsed_resp *resp)
+{
+       const struct opal_header *hdr;
+       struct opal_resp_tok *iter;
+       int num_entries = 0;
+       int total;
+       size_t token_length;
+       const u8 *pos;
+
+       if (!buf)
+               return -EFAULT;
+
+       if (!resp)
+               return -EFAULT;
+
+       hdr = (struct opal_header *)buf;
+       pos = buf;
+       pos += sizeof(*hdr);
+
+       pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n",
+                be32_to_cpu(hdr->cp.length),
+                be32_to_cpu(hdr->pkt.length),
+                be32_to_cpu(hdr->subpkt.length));
+
+       if (hdr->cp.length == 0 || hdr->pkt.length == 0 ||
+           hdr->subpkt.length == 0) {
+               pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n",
+                      be32_to_cpu(hdr->cp.length),
+                      be32_to_cpu(hdr->pkt.length),
+                      be32_to_cpu(hdr->subpkt.length));
+               print_buffer(pos, sizeof(*hdr));
+               return -EINVAL;
+       }
+
+       if (pos > buf + length)
+               return -EFAULT;
+
+       iter = resp->toks;
+       total = be32_to_cpu(hdr->subpkt.length);
+       print_buffer(pos, total);
+       while (total > 0) {
+               if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */
+                       token_length = response_parse_tiny(iter, pos);
+               else if (pos[0] <= SHORT_ATOM_BYTE) /* short atom */
+                       token_length = response_parse_short(iter, pos);
+               else if (pos[0] <= MEDIUM_ATOM_BYTE) /* medium atom */
+                       token_length = response_parse_medium(iter, pos);
+               else if (pos[0] <= LONG_ATOM_BYTE) /* long atom */
+                       token_length = response_parse_long(iter, pos);
+               else /* TOKEN */
+                       token_length = response_parse_token(iter, pos);
+
+               if (token_length == -EINVAL)
+                       return -EINVAL;
+
+               pos += token_length;
+               total -= token_length;
+               iter++;
+               num_entries++;
+       }
+
+       if (num_entries == 0) {
+               pr_err("Couldn't parse response.\n");
+               return -EINVAL;
+       }
+       resp->num = num_entries;
+
+       return 0;
+}
+
+static size_t response_get_string(const struct parsed_resp *resp, int n,
+                                 const char **store)
+{
+       *store = NULL;
+       if (!resp) {
+               pr_err("Response is NULL\n");
+               return 0;
+       }
+
+       if (n > resp->num) {
+               pr_err("Response has %d tokens. Can't access %d\n",
+                      resp->num, n);
+               return 0;
+       }
+
+       if (resp->toks[n].type != OPAL_DTA_TOKENID_BYTESTRING) {
+               pr_err("Token is not a byte string!\n");
+               return 0;
+       }
+
+       *store = resp->toks[n].pos + 1;
+       return resp->toks[n].len - 1;
+}
+
+static u64 response_get_u64(const struct parsed_resp *resp, int n)
+{
+       if (!resp) {
+               pr_err("Response is NULL\n");
+               return 0;
+       }
+
+       if (n > resp->num) {
+               pr_err("Response has %d tokens. Can't access %d\n",
+                      resp->num, n);
+               return 0;
+       }
+
+       if (resp->toks[n].type != OPAL_DTA_TOKENID_UINT) {
+               pr_err("Token is not an unsigned integer: %d\n",
+                      resp->toks[n].type);
+               return 0;
+       }
+
+       if (!(resp->toks[n].width == OPAL_WIDTH_TINY ||
+             resp->toks[n].width == OPAL_WIDTH_SHORT)) {
+               pr_err("Atom is not short or tiny: %d\n",
+                      resp->toks[n].width);
+               return 0;
+       }
+
+       return resp->toks[n].stored.u;
+}
+
+static u8 response_status(const struct parsed_resp *resp)
+{
+       if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN &&
+           response_get_token(resp, 0) == OPAL_ENDOFSESSION) {
+               return 0;
+       }
+
+       if (resp->num < 5)
+               return DTAERROR_NO_METHOD_STATUS;
+
+       if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN ||
+           token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN ||
+           response_get_token(resp, resp->num - 1) != OPAL_ENDLIST ||
+           response_get_token(resp, resp->num - 5) != OPAL_STARTLIST)
+               return DTAERROR_NO_METHOD_STATUS;
+
+       return response_get_u64(resp, resp->num - 4);
+}
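For a normal method response the token stream is expected to end with the same closing sequence cmd_finalize() appends on the host side:

/*	... OPAL_ENDOFDATA OPAL_STARTLIST <status> <reserved> <reserved> OPAL_ENDLIST
 *
 * which is why the checks above look at resp->num - 5 (OPAL_STARTLIST)
 * and resp->num - 1 (OPAL_ENDLIST), and the status code itself is read
 * from resp->num - 4.
 */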
+
+/* Parses and checks for errors */
+static int parse_and_check_status(struct opal_dev *dev)
+{
+       int error;
+
+       print_buffer(dev->cmd, dev->pos);
+
+       error = response_parse(dev->resp, IO_BUFFER_LENGTH, &dev->parsed);
+       if (error) {
+               pr_err("Couldn't parse response.\n");
+               return error;
+       }
+
+       return response_status(&dev->parsed);
+}
+
+static void clear_opal_cmd(struct opal_dev *dev)
+{
+       dev->pos = sizeof(struct opal_header);
+       memset(dev->cmd, 0, IO_BUFFER_LENGTH);
+}
+
+static int start_opal_session_cont(struct opal_dev *dev)
+{
+       u32 hsn, tsn;
+       int error = 0;
+
+       error = parse_and_check_status(dev);
+       if (error)
+               return error;
+
+       hsn = response_get_u64(&dev->parsed, 4);
+       tsn = response_get_u64(&dev->parsed, 5);
+
+       if (hsn == 0 && tsn == 0) {
+               pr_err("Couldn't authenticate session\n");
+               return -EPERM;
+       }
+
+       dev->hsn = hsn;
+       dev->tsn = tsn;
+       return 0;
+}
+
+static void add_suspend_info(struct opal_dev *dev,
+                            struct opal_suspend_data *sus)
+{
+       struct opal_suspend_data *iter;
+
+       list_for_each_entry(iter, &dev->unlk_lst, node) {
+               if (iter->lr == sus->lr) {
+                       list_del(&iter->node);
+                       kfree(iter);
+                       break;
+               }
+       }
+       list_add_tail(&sus->node, &dev->unlk_lst);
+}
+
+static int end_session_cont(struct opal_dev *dev)
+{
+       dev->hsn = 0;
+       dev->tsn = 0;
+       return parse_and_check_status(dev);
+}
+
+static int finalize_and_send(struct opal_dev *dev, cont_fn cont)
+{
+       int ret;
+
+       ret = cmd_finalize(dev, dev->hsn, dev->tsn);
+       if (ret) {
+               pr_err("Error finalizing command buffer: %d\n", ret);
+               return ret;
+       }
+
+       print_buffer(dev->cmd, dev->pos);
+
+       return opal_send_recv(dev, cont);
+}
+
+static int gen_key(struct opal_dev *dev)
+{
+       const u8 *method;
+       u8 uid[OPAL_UID_LENGTH];
+       int err = 0;
+
+       clear_opal_cmd(dev);
+       set_comid(dev, dev->comid);
+
+       memcpy(uid, dev->prev_data, min(sizeof(uid), dev->prev_d_len));
+       method = opalmethod[OPAL_GENKEY];
+       kfree(dev->prev_data);
+       dev->prev_data = NULL;
+
+       add_token_u8(&err, dev, OPAL_CALL);
+       add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+       add_token_bytestring(&err, dev, opalmethod[OPAL_GENKEY],
+                            OPAL_UID_LENGTH);
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+
+       if (err) {
+               pr_err("Error building gen key command\n");
+               return err;
+
+       }
+       return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int get_active_key_cont(struct opal_dev *dev)
+{
+       const char *activekey;
+       size_t keylen;
+       int error = 0;
+
+       error = parse_and_check_status(dev);
+       if (error)
+               return error;
+       keylen = response_get_string(&dev->parsed, 4, &activekey);
+       if (!activekey) {
+               pr_err("%s: Couldn't extract the Activekey from the response\n",
+                      __func__);
+               return OPAL_INVAL_PARAM;
+       }
+       dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL);
+
+       if (!dev->prev_data)
+               return -ENOMEM;
+
+       dev->prev_d_len = keylen;
+
+       return 0;
+}
+
+static int get_active_key(struct opal_dev *dev)
+{
+       u8 uid[OPAL_UID_LENGTH];
+       int err = 0;
+       u8 *lr;
+
+       clear_opal_cmd(dev);
+       set_comid(dev, dev->comid);
+       lr = dev->func_data[dev->state];
+
+       err = build_locking_range(uid, sizeof(uid), *lr);
+       if (err)
+               return err;
+
+       err = 0;
+       add_token_u8(&err, dev, OPAL_CALL);
+       add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+       add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH);
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, 3); /* startColumn */
+       add_token_u8(&err, dev, 10); /* ActiveKey */
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, 4); /* endColumn */
+       add_token_u8(&err, dev, 10); /* ActiveKey */
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+       if (err) {
+               pr_err("Error building get active key command\n");
+               return err;
+       }
+
+       return finalize_and_send(dev, get_active_key_cont);
+}
+
+static int generic_lr_enable_disable(struct opal_dev *dev,
+                                    u8 *uid, bool rle, bool wle,
+                                    bool rl, bool wl)
+{
+       int err = 0;
+
+       add_token_u8(&err, dev, OPAL_CALL);
+       add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+       add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
+
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, OPAL_VALUES);
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, 5); /* ReadLockEnabled */
+       add_token_u8(&err, dev, rle);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, 6); /* WriteLockEnabled */
+       add_token_u8(&err, dev, wle);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, OPAL_READLOCKED);
+       add_token_u8(&err, dev, rl);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, OPAL_WRITELOCKED);
+       add_token_u8(&err, dev, wl);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+       return err;
+}
+
+static inline int enable_global_lr(struct opal_dev *dev, u8 *uid,
+                                  struct opal_user_lr_setup *setup)
+{
+       int err;
+
+       err = generic_lr_enable_disable(dev, uid, !!setup->RLE, !!setup->WLE,
+                                       0, 0);
+       if (err)
+               pr_err("Failed to create enable global lr command\n");
+       return err;
+}
+
+static int setup_locking_range(struct opal_dev *dev)
+{
+       u8 uid[OPAL_UID_LENGTH];
+       struct opal_user_lr_setup *setup;
+       u8 lr;
+       int err = 0;
+
+       clear_opal_cmd(dev);
+       set_comid(dev, dev->comid);
+
+       setup = dev->func_data[dev->state];
+       lr = setup->session.opal_key.lr;
+       err = build_locking_range(uid, sizeof(uid), lr);
+       if (err)
+               return err;
+
+       if (lr == 0)
+               err = enable_global_lr(dev, uid, setup);
+       else {
+               add_token_u8(&err, dev, OPAL_CALL);
+               add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+               add_token_bytestring(&err, dev, opalmethod[OPAL_SET],
+                                    OPAL_UID_LENGTH);
+
+               add_token_u8(&err, dev, OPAL_STARTLIST);
+               add_token_u8(&err, dev, OPAL_STARTNAME);
+               add_token_u8(&err, dev, OPAL_VALUES);
+               add_token_u8(&err, dev, OPAL_STARTLIST);
+
+               add_token_u8(&err, dev, OPAL_STARTNAME);
+               add_token_u8(&err, dev, 3); /* Ranges Start */
+               add_token_u64(&err, dev, setup->range_start);
+               add_token_u8(&err, dev, OPAL_ENDNAME);
+
+               add_token_u8(&err, dev, OPAL_STARTNAME);
+               add_token_u8(&err, dev, 4); /* Ranges length */
+               add_token_u64(&err, dev, setup->range_length);
+               add_token_u8(&err, dev, OPAL_ENDNAME);
+
+               add_token_u8(&err, dev, OPAL_STARTNAME);
+               add_token_u8(&err, dev, 5); /* ReadLockEnabled */
+               add_token_u64(&err, dev, !!setup->RLE);
+               add_token_u8(&err, dev, OPAL_ENDNAME);
+
+               add_token_u8(&err, dev, OPAL_STARTNAME);
+               add_token_u8(&err, dev, 6); /* WriteLockEnabled */
+               add_token_u64(&err, dev, !!setup->WLE);
+               add_token_u8(&err, dev, OPAL_ENDNAME);
+
+               add_token_u8(&err, dev, OPAL_ENDLIST);
+               add_token_u8(&err, dev, OPAL_ENDNAME);
+               add_token_u8(&err, dev, OPAL_ENDLIST);
+
+       }
+       if (err) {
+               pr_err("Error building Setup Locking range command.\n");
+               return err;
+
+       }
+
+       return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int start_generic_opal_session(struct opal_dev *dev,
+                                     enum opal_uid auth,
+                                     enum opal_uid sp_type,
+                                     const char *key,
+                                     u8 key_len)
+{
+       u32 hsn;
+       int err = 0;
+
+       if (key == NULL && auth != OPAL_ANYBODY_UID) {
+               pr_err("%s: Attempted to open ADMIN_SP Session without a Host"
+                      "Challenge, and not as the Anybody UID\n", __func__);
+               return OPAL_INVAL_PARAM;
+       }
+
+       clear_opal_cmd(dev);
+
+       set_comid(dev, dev->comid);
+       hsn = GENERIC_HOST_SESSION_NUM;
+
+       add_token_u8(&err, dev, OPAL_CALL);
+       add_token_bytestring(&err, dev, opaluid[OPAL_SMUID_UID],
+                            OPAL_UID_LENGTH);
+       add_token_bytestring(&err, dev, opalmethod[OPAL_STARTSESSION],
+                            OPAL_UID_LENGTH);
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u64(&err, dev, hsn);
+       add_token_bytestring(&err, dev, opaluid[sp_type], OPAL_UID_LENGTH);
+       add_token_u8(&err, dev, 1); /* Write */
+
+       switch (auth) {
+       case OPAL_ANYBODY_UID:
+               add_token_u8(&err, dev, OPAL_ENDLIST);
+               break;
+       case OPAL_ADMIN1_UID:
+       case OPAL_SID_UID:
+               add_token_u8(&err, dev, OPAL_STARTNAME);
+               add_token_u8(&err, dev, 0); /* HostChallenge */
+               add_token_bytestring(&err, dev, key, key_len);
+               add_token_u8(&err, dev, OPAL_ENDNAME);
+               add_token_u8(&err, dev, OPAL_STARTNAME);
+               add_token_u8(&err, dev, 3); /* HostSignAuth */
+               add_token_bytestring(&err, dev, opaluid[auth],
+                                    OPAL_UID_LENGTH);
+               add_token_u8(&err, dev, OPAL_ENDNAME);
+               add_token_u8(&err, dev, OPAL_ENDLIST);
+               break;
+       default:
+               pr_err("Cannot start Admin SP session with auth %d\n", auth);
+               return OPAL_INVAL_PARAM;
+       }
+
+       if (err) {
+               pr_err("Error building start adminsp session command.\n");
+               return err;
+       }
+
+       return finalize_and_send(dev, start_opal_session_cont);
+}
+
+static int start_anybodyASP_opal_session(struct opal_dev *dev)
+{
+       return start_generic_opal_session(dev, OPAL_ANYBODY_UID,
+                                         OPAL_ADMINSP_UID, NULL, 0);
+}
+
+static int start_SIDASP_opal_session(struct opal_dev *dev)
+{
+       int ret;
+       const u8 *key = dev->prev_data;
+       struct opal_key *okey;
+
+       if (!key) {
+               okey = dev->func_data[dev->state];
+               ret = start_generic_opal_session(dev, OPAL_SID_UID,
+                                                OPAL_ADMINSP_UID,
+                                                okey->key,
+                                                okey->key_len);
+       } else {
+               ret = start_generic_opal_session(dev, OPAL_SID_UID,
+                                                OPAL_ADMINSP_UID,
+                                                key, dev->prev_d_len);
+               kfree(key);
+               dev->prev_data = NULL;
+       }
+       return ret;
+}
+
+static inline int start_admin1LSP_opal_session(struct opal_dev *dev)
+{
+       struct opal_key *key = dev->func_data[dev->state];
+
+       return start_generic_opal_session(dev, OPAL_ADMIN1_UID,
+                                         OPAL_LOCKINGSP_UID,
+                                         key->key, key->key_len);
+}
+
+static int start_auth_opal_session(struct opal_dev *dev)
+{
+       u8 lk_ul_user[OPAL_UID_LENGTH];
+       int err = 0;
+
+       struct opal_session_info *session = dev->func_data[dev->state];
+       size_t keylen = session->opal_key.key_len;
+       u8 *key = session->opal_key.key;
+       u32 hsn = GENERIC_HOST_SESSION_NUM;
+
+       clear_opal_cmd(dev);
+       set_comid(dev, dev->comid);
+
+       if (session->sum) {
+               err = build_locking_user(lk_ul_user, sizeof(lk_ul_user),
+                                        session->opal_key.lr);
+               if (err)
+                       return err;
+
+       } else if (session->who != OPAL_ADMIN1 && !session->sum) {
+               err = build_locking_user(lk_ul_user, sizeof(lk_ul_user),
+                                        session->who - 1);
+               if (err)
+                       return err;
+       } else
+               memcpy(lk_ul_user, opaluid[OPAL_ADMIN1_UID], OPAL_UID_LENGTH);
+
+       add_token_u8(&err, dev, OPAL_CALL);
+       add_token_bytestring(&err, dev, opaluid[OPAL_SMUID_UID],
+                            OPAL_UID_LENGTH);
+       add_token_bytestring(&err, dev, opalmethod[OPAL_STARTSESSION],
+                            OPAL_UID_LENGTH);
+
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u64(&err, dev, hsn);
+       add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID],
+                            OPAL_UID_LENGTH);
+       add_token_u8(&err, dev, 1); /* Write */
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, 0); /* HostChallenge */
+       add_token_bytestring(&err, dev, key, keylen);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, 3); /* HostSignAuth */
+       add_token_bytestring(&err, dev, lk_ul_user, OPAL_UID_LENGTH);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+
+       if (err) {
+               pr_err("Error building STARTSESSION command.\n");
+               return err;
+       }
+
+       return finalize_and_send(dev, start_opal_session_cont);
+}
+
+static int revert_tper(struct opal_dev *dev)
+{
+       int err = 0;
+
+       clear_opal_cmd(dev);
+       set_comid(dev, dev->comid);
+
+       add_token_u8(&err, dev, OPAL_CALL);
+       add_token_bytestring(&err, dev, opaluid[OPAL_ADMINSP_UID],
+                            OPAL_UID_LENGTH);
+       add_token_bytestring(&err, dev, opalmethod[OPAL_REVERT],
+                            OPAL_UID_LENGTH);
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+       if (err) {
+               pr_err("Error building REVERT TPER command.\n");
+               return err;
+       }
+
+       return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int internal_activate_user(struct opal_dev *dev)
+{
+       struct opal_session_info *session = dev->func_data[dev->state];
+       u8 uid[OPAL_UID_LENGTH];
+       int err = 0;
+
+       clear_opal_cmd(dev);
+       set_comid(dev, dev->comid);
+
+       memcpy(uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH);
+       uid[7] = session->who;
+
+       add_token_u8(&err, dev, OPAL_CALL);
+       add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+       add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, OPAL_VALUES);
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, 5); /* Enabled */
+       add_token_u8(&err, dev, OPAL_TRUE);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+
+       if (err) {
+               pr_err("Error building Activate UserN command.\n");
+               return err;
+       }
+
+       return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int erase_locking_range(struct opal_dev *dev)
+{
+       struct opal_session_info *session;
+       u8 uid[OPAL_UID_LENGTH];
+       int err = 0;
+
+       clear_opal_cmd(dev);
+       set_comid(dev, dev->comid);
+       session = dev->func_data[dev->state];
+
+       if (build_locking_range(uid, sizeof(uid), session->opal_key.lr) < 0)
+               return -ERANGE;
+
+       add_token_u8(&err, dev, OPAL_CALL);
+       add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+       add_token_bytestring(&err, dev, opalmethod[OPAL_ERASE],
+                            OPAL_UID_LENGTH);
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+
+       if (err) {
+               pr_err("Error building Erase Locking Range Command.\n");
+               return err;
+       }
+       return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int set_mbr_done(struct opal_dev *dev)
+{
+       u8 mbr_done_tf = *(u8 *)dev->func_data[dev->state];
+       int err = 0;
+
+       clear_opal_cmd(dev);
+       set_comid(dev, dev->comid);
+
+       add_token_u8(&err, dev, OPAL_CALL);
+       add_token_bytestring(&err, dev, opaluid[OPAL_MBRCONTROL],
+                            OPAL_UID_LENGTH);
+       add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, OPAL_VALUES);
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, 2); /* Done */
+       add_token_u8(&err, dev, mbr_done_tf); /* Done T or F */
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+
+       if (err) {
+               pr_err("Error Building set MBR Done command\n");
+               return err;
+       }
+
+       return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int set_mbr_enable_disable(struct opal_dev *dev)
+{
+       u8 mbr_en_dis = *(u8 *)dev->func_data[dev->state];
+       int err = 0;
+
+       clear_opal_cmd(dev);
+       set_comid(dev, dev->comid);
+
+       add_token_u8(&err, dev, OPAL_CALL);
+       add_token_bytestring(&err, dev, opaluid[OPAL_MBRCONTROL],
+                            OPAL_UID_LENGTH);
+       add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, OPAL_VALUES);
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, 1);
+       add_token_u8(&err, dev, mbr_en_dis);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+
+       if (err) {
+               pr_err("Error Building set MBR done command\n");
+               return err;
+       }
+
+       return finalize_and_send(dev, parse_and_check_status);
+}
+
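+/*
+ * Build (but do not send) a Set call that writes the PIN column (column
+ * 3) of the given C_PIN table row. The callers below pick the C_PIN UID
+ * (SID, Admin1 or a UserN row with the trailing bytes patched) and then
+ * finalize and send the command themselves.
+ */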
+static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid,
+                         struct opal_dev *dev)
+{
+       int err = 0;
+
+       clear_opal_cmd(dev);
+       set_comid(dev, dev->comid);
+
+       add_token_u8(&err, dev, OPAL_CALL);
+       add_token_bytestring(&err, dev, cpin_uid, OPAL_UID_LENGTH);
+       add_token_bytestring(&err, dev, opalmethod[OPAL_SET],
+                            OPAL_UID_LENGTH);
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, OPAL_VALUES);
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, 3); /* PIN */
+       add_token_bytestring(&err, dev, key, key_len);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+
+       return err;
+}
+
+static int set_new_pw(struct opal_dev *dev)
+{
+       u8 cpin_uid[OPAL_UID_LENGTH];
+       struct opal_session_info *usr = dev->func_data[dev->state];
+
+       memcpy(cpin_uid, opaluid[OPAL_C_PIN_ADMIN1], OPAL_UID_LENGTH);
+
+       if (usr->who != OPAL_ADMIN1) {
+               cpin_uid[5] = 0x03;
+               if (usr->sum)
+                       cpin_uid[7] = usr->opal_key.lr + 1;
+               else
+                       cpin_uid[7] = usr->who;
+       }
+
+       if (generic_pw_cmd(usr->opal_key.key, usr->opal_key.key_len,
+                          cpin_uid, dev)) {
+               pr_err("Error building set password command.\n");
+               return -ERANGE;
+       }
+
+       return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int set_sid_cpin_pin(struct opal_dev *dev)
+{
+       u8 cpin_uid[OPAL_UID_LENGTH];
+       struct opal_key *key = dev->func_data[dev->state];
+
+       memcpy(cpin_uid, opaluid[OPAL_C_PIN_SID], OPAL_UID_LENGTH);
+
+       if (generic_pw_cmd(key->key, key->key_len, cpin_uid, dev)) {
+               pr_err("Error building Set SID cpin\n");
+               return -ERANGE;
+       }
+       return finalize_and_send(dev, parse_and_check_status);
+}
+
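+/*
+ * Give a user authority access to a locking range by rewriting the ACE
+ * that guards the range's ReadLocked column (or the WriteLocked one
+ * when OPAL_RW is requested). The ACE boolean expression lists the
+ * user's authority object reference twice, followed by a boolean ACE
+ * token.
+ */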
+static int add_user_to_lr(struct opal_dev *dev)
+{
+       u8 lr_buffer[OPAL_UID_LENGTH];
+       u8 user_uid[OPAL_UID_LENGTH];
+       struct opal_lock_unlock *lkul;
+       int err = 0;
+
+       clear_opal_cmd(dev);
+       set_comid(dev, dev->comid);
+
+       lkul = dev->func_data[dev->state];
+
+       memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_RDLOCKED],
+              OPAL_UID_LENGTH);
+
+       if (lkul->l_state == OPAL_RW)
+               memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_WRLOCKED],
+                      OPAL_UID_LENGTH);
+
+       lr_buffer[7] = lkul->session.opal_key.lr;
+
+       memcpy(user_uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH);
+
+       user_uid[7] = lkul->session.who;
+
+       add_token_u8(&err, dev, OPAL_CALL);
+       add_token_bytestring(&err, dev, lr_buffer, OPAL_UID_LENGTH);
+       add_token_bytestring(&err, dev, opalmethod[OPAL_SET],
+                            OPAL_UID_LENGTH);
+
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, OPAL_VALUES);
+
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, 3);
+
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_bytestring(&err, dev,
+                            opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF],
+                            OPAL_UID_LENGTH/2);
+       add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_bytestring(&err, dev,
+                            opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF],
+                            OPAL_UID_LENGTH/2);
+       add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_bytestring(&err, dev, opaluid[OPAL_HALF_UID_BOOLEAN_ACE],
+                            OPAL_UID_LENGTH/2);
+       add_token_u8(&err, dev, 1);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+
+       if (err) {
+               pr_err("Error building add user to locking range command.\n");
+               return err;
+       }
+
+       return finalize_and_send(dev, parse_and_check_status);
+}
+
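+/*
+ * Lock or unlock a locking range by setting its ReadLocked/WriteLocked
+ * columns: OPAL_RO clears ReadLocked only, OPAL_RW clears both, and
+ * OPAL_LK leaves both set (the local flags default to locked).
+ */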
+static int lock_unlock_locking_range(struct opal_dev *dev)
+{
+       u8 lr_buffer[OPAL_UID_LENGTH];
+       struct opal_lock_unlock *lkul;
+       u8 read_locked = 1, write_locked = 1;
+       int err = 0;
+
+       clear_opal_cmd(dev);
+       set_comid(dev, dev->comid);
+
+       lkul = dev->func_data[dev->state];
+       if (build_locking_range(lr_buffer, sizeof(lr_buffer),
+                               lkul->session.opal_key.lr) < 0)
+               return -ERANGE;
+
+       switch (lkul->l_state) {
+       case OPAL_RO:
+               read_locked = 0;
+               write_locked = 1;
+               break;
+       case OPAL_RW:
+               read_locked = 0;
+               write_locked = 0;
+               break;
+       case OPAL_LK:
+               /* vars are initialized to locked */
+               break;
+       default:
+               pr_err("Tried to set an invalid locking state... returning to userland\n");
+               return OPAL_INVAL_PARAM;
+       }
+
+       add_token_u8(&err, dev, OPAL_CALL);
+       add_token_bytestring(&err, dev, lr_buffer, OPAL_UID_LENGTH);
+       add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, OPAL_VALUES);
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, OPAL_READLOCKED);
+       add_token_u8(&err, dev, read_locked);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, OPAL_WRITELOCKED);
+       add_token_u8(&err, dev, write_locked);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+
+       if (err) {
+               pr_err("Error building SET command.\n");
+               return err;
+       }
+       return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int lock_unlock_locking_range_sum(struct opal_dev *dev)
+{
+       u8 lr_buffer[OPAL_UID_LENGTH];
+       u8 read_locked = 1, write_locked = 1;
+       struct opal_lock_unlock *lkul;
+       int ret;
+
+       clear_opal_cmd(dev);
+       set_comid(dev, dev->comid);
+
+       lkul = dev->func_data[dev->state];
+       if (build_locking_range(lr_buffer, sizeof(lr_buffer),
+                               lkul->session.opal_key.lr) < 0)
+               return -ERANGE;
+
+       switch (lkul->l_state) {
+       case OPAL_RO:
+               read_locked = 0;
+               write_locked = 1;
+               break;
+       case OPAL_RW:
+               read_locked = 0;
+               write_locked = 0;
+               break;
+       case OPAL_LK:
+               /* vars are initialized to locked */
+               break;
+       default:
+               pr_err("Tried to set an invalid locking state.\n");
+               return OPAL_INVAL_PARAM;
+       }
+       ret = generic_lr_enable_disable(dev, lr_buffer, 1, 1,
+                                       read_locked, write_locked);
+
+       if (ret < 0) {
+               pr_err("Error building SET command.\n");
+               return ret;
+       }
+       return finalize_and_send(dev, parse_and_check_status);
+}
+
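+/*
+ * Activate the Locking SP. In single user mode (sum) the locking-range
+ * UIDs supplied by userspace are passed to Activate as an optional
+ * named argument; otherwise the method is called with an empty
+ * argument list.
+ */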
+static int activate_lsp(struct opal_dev *dev)
+{
+       struct opal_lr_act *opal_act;
+       u8 user_lr[OPAL_UID_LENGTH];
+       u8 uint_3 = 0x83;
+       int err = 0, i;
+
+       clear_opal_cmd(dev);
+       set_comid(dev, dev->comid);
+
+       opal_act = dev->func_data[dev->state];
+
+       add_token_u8(&err, dev, OPAL_CALL);
+       add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID],
+                            OPAL_UID_LENGTH);
+       add_token_bytestring(&err, dev, opalmethod[OPAL_ACTIVATE],
+                            OPAL_UID_LENGTH);
+
+       if (opal_act->sum) {
+               err = build_locking_range(user_lr, sizeof(user_lr),
+                                         opal_act->lr[0]);
+               if (err)
+                       return err;
+
+               add_token_u8(&err, dev, OPAL_STARTLIST);
+               add_token_u8(&err, dev, OPAL_STARTNAME);
+               add_token_u8(&err, dev, uint_3);
+               add_token_u8(&err, dev, 6);
+               add_token_u8(&err, dev, 0);
+               add_token_u8(&err, dev, 0);
+
+               add_token_u8(&err, dev, OPAL_STARTLIST);
+               add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH);
+               for (i = 1; i < opal_act->num_lrs; i++) {
+                       user_lr[7] = opal_act->lr[i];
+                       add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH);
+               }
+               add_token_u8(&err, dev, OPAL_ENDLIST);
+               add_token_u8(&err, dev, OPAL_ENDNAME);
+               add_token_u8(&err, dev, OPAL_ENDLIST);
+
+       } else {
+               add_token_u8(&err, dev, OPAL_STARTLIST);
+               add_token_u8(&err, dev, OPAL_ENDLIST);
+       }
+
+       if (err) {
+               pr_err("Error building Activate LockingSP command.\n");
+               return err;
+       }
+
+       return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int get_lsp_lifecycle_cont(struct opal_dev *dev)
+{
+       u8 lc_status;
+       int error = 0;
+
+       error = parse_and_check_status(dev);
+       if (error)
+               return error;
+
+       lc_status = response_get_u64(&dev->parsed, 4);
+       /* 0x08 is Manufactured Inactive */
+       /* 0x09 is Manufactured */
+       if (lc_status != OPAL_MANUFACTURED_INACTIVE) {
+               pr_err("Couldn't determine the status of the Lifecycle state\n");
+               return -ENODEV;
+       }
+
+       return 0;
+}
+
+/* Determine if we're in the Manufactured Inactive or Active state */
+static int get_lsp_lifecycle(struct opal_dev *dev)
+{
+       int err = 0;
+
+       clear_opal_cmd(dev);
+       set_comid(dev, dev->comid);
+
+       add_token_u8(&err, dev, OPAL_CALL);
+       add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID],
+                            OPAL_UID_LENGTH);
+       add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH);
+
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, 3); /* Start Column */
+       add_token_u8(&err, dev, 6); /* Lifecycle Column */
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, 4); /* End Column */
+       add_token_u8(&err, dev, 6); /* Lifecycle Column */
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+
+       if (err) {
+               pr_err("Error building GET Lifecycle Status command\n");
+               return err;
+       }
+
+       return finalize_and_send(dev, get_lsp_lifecycle_cont);
+}
+
+static int get_msid_cpin_pin_cont(struct opal_dev *dev)
+{
+       const char *msid_pin;
+       size_t strlen;
+       int error = 0;
+
+       error = parse_and_check_status(dev);
+       if (error)
+               return error;
+
+       strlen = response_get_string(&dev->parsed, 4, &msid_pin);
+       if (!msid_pin) {
+               pr_err("%s: Couldn't extract PIN from response\n", __func__);
+               return OPAL_INVAL_PARAM;
+       }
+
+       dev->prev_data = kmemdup(msid_pin, strlen, GFP_KERNEL);
+       if (!dev->prev_data)
+               return -ENOMEM;
+
+       dev->prev_d_len = strlen;
+
+       return 0;
+}
+
+static int get_msid_cpin_pin(struct opal_dev *dev)
+{
+       int err = 0;
+
+       clear_opal_cmd(dev);
+       set_comid(dev, dev->comid);
+
+       add_token_u8(&err, dev, OPAL_CALL);
+       add_token_bytestring(&err, dev, opaluid[OPAL_C_PIN_MSID],
+                            OPAL_UID_LENGTH);
+       add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH);
+
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+       add_token_u8(&err, dev, OPAL_STARTLIST);
+
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, 3); /* Start Column */
+       add_token_u8(&err, dev, 3); /* PIN */
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+
+       add_token_u8(&err, dev, OPAL_STARTNAME);
+       add_token_u8(&err, dev, 4); /* End Column */
+       add_token_u8(&err, dev, 3); /* PIN */
+       add_token_u8(&err, dev, OPAL_ENDNAME);
+
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+       add_token_u8(&err, dev, OPAL_ENDLIST);
+
+       if (err) {
+               pr_err("Error building Get MSID CPIN PIN command.\n");
+               return err;
+       }
+
+       return finalize_and_send(dev, get_msid_cpin_pin_cont);
+}
+
+static int build_end_opal_session(struct opal_dev *dev)
+{
+       int err = 0;
+
+       clear_opal_cmd(dev);
+
+       set_comid(dev, dev->comid);
+       add_token_u8(&err, dev, OPAL_ENDOFSESSION);
+       return err;
+}
+
+static int end_opal_session(struct opal_dev *dev)
+{
+       int ret = build_end_opal_session(dev);
+
+       if (ret < 0)
+               return ret;
+       return finalize_and_send(dev, end_session_cont);
+}
+
+static int end_opal_session_error(struct opal_dev *dev)
+{
+       const opal_step error_end_session[] = {
+               end_opal_session,
+               NULL,
+       };
+       dev->funcs = error_end_session;
+       dev->state = 0;
+       return next(dev);
+}
+
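+/*
+ * The ioctl helpers below drive the device through a NULL-terminated
+ * array of opal_step functions. dev->state indexes both the current
+ * step in dev->funcs and its argument slot in dev->func_data, so each
+ * helper fills func_data[] with one entry per step before calling
+ * next().
+ */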
+static inline void setup_opal_dev(struct opal_dev *dev,
+                                 const opal_step *funcs)
+{
+       dev->state = 0;
+       dev->funcs = funcs;
+       dev->tsn = 0;
+       dev->hsn = 0;
+       dev->func_data = NULL;
+       dev->prev_data = NULL;
+}
+
+static int check_opal_support(struct opal_dev *dev)
+{
+       static const opal_step funcs[] = {
+               opal_discovery0,
+               NULL
+       };
+       int ret;
+
+       mutex_lock(&dev->dev_lock);
+       setup_opal_dev(dev, funcs);
+       ret = next(dev);
+       dev->supported = !ret;
+       mutex_unlock(&dev->dev_lock);
+       return ret;
+}
+
+struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv)
+{
+       struct opal_dev *dev;
+
+       dev = kmalloc(sizeof(*dev), GFP_KERNEL);
+       if (!dev)
+               return NULL;
+
+       INIT_LIST_HEAD(&dev->unlk_lst);
+       mutex_init(&dev->dev_lock);
+       dev->data = data;
+       dev->send_recv = send_recv;
+       if (check_opal_support(dev) != 0) {
+               pr_debug("Opal is not supported on this device\n");
+               kfree(dev);
+               return NULL;
+       }
+       return dev;
+}
+EXPORT_SYMBOL(init_opal_dev);
+
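+/*
+ * The helpers below that run a step array all start with
+ * opal_discovery0 as step 0, which takes no argument, so func_data[0]
+ * is left NULL and only the slots for steps that consume data are
+ * filled in.
+ */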
+static int opal_secure_erase_locking_range(struct opal_dev *dev,
+                                          struct opal_session_info *opal_session)
+{
+       void *data[3] = { NULL };
+       static const opal_step erase_funcs[] = {
+               opal_discovery0,
+               start_auth_opal_session,
+               get_active_key,
+               gen_key,
+               end_opal_session,
+               NULL,
+       };
+       int ret;
+
+       mutex_lock(&dev->dev_lock);
+       setup_opal_dev(dev, erase_funcs);
+
+       dev->func_data = data;
+       dev->func_data[1] = opal_session;
+       dev->func_data[2] = &opal_session->opal_key.lr;
+
+       ret = next(dev);
+       mutex_unlock(&dev->dev_lock);
+       return ret;
+}
+
+static int opal_erase_locking_range(struct opal_dev *dev,
+                                   struct opal_session_info *opal_session)
+{
+       void *data[3] = { NULL };
+       static const opal_step erase_funcs[] = {
+               opal_discovery0,
+               start_auth_opal_session,
+               erase_locking_range,
+               end_opal_session,
+               NULL,
+       };
+       int ret;
+
+       mutex_lock(&dev->dev_lock);
+       setup_opal_dev(dev, erase_funcs);
+
+       dev->func_data = data;
+       dev->func_data[1] = opal_session;
+       dev->func_data[2] = opal_session;
+
+       ret = next(dev);
+       mutex_unlock(&dev->dev_lock);
+       return ret;
+}
+
+static int opal_enable_disable_shadow_mbr(struct opal_dev *dev,
+                                         struct opal_mbr_data *opal_mbr)
+{
+       void *func_data[6] = { NULL };
+       static const opal_step mbr_funcs[] = {
+               opal_discovery0,
+               start_admin1LSP_opal_session,
+               set_mbr_done,
+               end_opal_session,
+               start_admin1LSP_opal_session,
+               set_mbr_enable_disable,
+               end_opal_session,
+               NULL,
+       };
+       int ret;
+
+       if (opal_mbr->enable_disable != OPAL_MBR_ENABLE &&
+           opal_mbr->enable_disable != OPAL_MBR_DISABLE)
+               return -EINVAL;
+
+       mutex_lock(&dev->dev_lock);
+       setup_opal_dev(dev, mbr_funcs);
+       dev->func_data = func_data;
+       dev->func_data[1] = &opal_mbr->key;
+       dev->func_data[2] = &opal_mbr->enable_disable;
+       dev->func_data[4] = &opal_mbr->key;
+       dev->func_data[5] = &opal_mbr->enable_disable;
+       ret = next(dev);
+       mutex_unlock(&dev->dev_lock);
+       return ret;
+}
+
+static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk)
+{
+       struct opal_suspend_data *suspend;
+
+       suspend = kzalloc(sizeof(*suspend), GFP_KERNEL);
+       if (!suspend)
+               return -ENOMEM;
+
+       suspend->unlk = *lk_unlk;
+       suspend->lr = lk_unlk->session.opal_key.lr;
+
+       mutex_lock(&dev->dev_lock);
+       setup_opal_dev(dev, NULL);
+       add_suspend_info(dev, suspend);
+       mutex_unlock(&dev->dev_lock);
+       return 0;
+}
+
+static int opal_add_user_to_lr(struct opal_dev *dev,
+                              struct opal_lock_unlock *lk_unlk)
+{
+       void *func_data[3] = { NULL };
+       static const opal_step funcs[] = {
+               opal_discovery0,
+               start_admin1LSP_opal_session,
+               add_user_to_lr,
+               end_opal_session,
+               NULL
+       };
+       int ret;
+
+       if (lk_unlk->l_state != OPAL_RO &&
+           lk_unlk->l_state != OPAL_RW) {
+               pr_err("Locking state was not RO or RW\n");
+               return -EINVAL;
+       }
+       if (lk_unlk->session.who < OPAL_USER1 ||
+           lk_unlk->session.who > OPAL_USER9) {
+               pr_err("Authority was not within the range of users: %d\n",
+                      lk_unlk->session.who);
+               return -EINVAL;
+       }
+       if (lk_unlk->session.sum) {
+               pr_err("%s not supported in sum. Use setup locking range\n",
+                      __func__);
+               return -EINVAL;
+       }
+
+       mutex_lock(&dev->dev_lock);
+       setup_opal_dev(dev, funcs);
+       dev->func_data = func_data;
+       dev->func_data[1] = &lk_unlk->session.opal_key;
+       dev->func_data[2] = lk_unlk;
+       ret = next(dev);
+       mutex_unlock(&dev->dev_lock);
+       return ret;
+}
+
+static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal)
+{
+       void *data[2] = { NULL };
+       static const opal_step revert_funcs[] = {
+               opal_discovery0,
+               start_SIDASP_opal_session,
+               revert_tper, /* controller will terminate session */
+               NULL,
+       };
+       int ret;
+
+       mutex_lock(&dev->dev_lock);
+       setup_opal_dev(dev, revert_funcs);
+       dev->func_data = data;
+       dev->func_data[1] = opal;
+       ret = next(dev);
+       mutex_unlock(&dev->dev_lock);
+       return ret;
+}
+
+static int __opal_lock_unlock_sum(struct opal_dev *dev)
+{
+       static const opal_step ulk_funcs_sum[] = {
+               opal_discovery0,
+               start_auth_opal_session,
+               lock_unlock_locking_range_sum,
+               end_opal_session,
+               NULL
+       };
+
+       dev->funcs = ulk_funcs_sum;
+       return next(dev);
+}
+
+static int __opal_lock_unlock(struct opal_dev *dev)
+{
+       static const opal_step _unlock_funcs[] = {
+               opal_discovery0,
+               start_auth_opal_session,
+               lock_unlock_locking_range,
+               end_opal_session,
+               NULL
+       };
+
+       dev->funcs = _unlock_funcs;
+       return next(dev);
+}
+
+static int opal_lock_unlock(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk)
+{
+       void *func_data[3] = { NULL };
+       int ret;
+
+       if (lk_unlk->session.who < OPAL_ADMIN1 ||
+           lk_unlk->session.who > OPAL_USER9)
+               return -EINVAL;
+
+       mutex_lock(&dev->dev_lock);
+       setup_opal_dev(dev, NULL);
+       dev->func_data = func_data;
+       dev->func_data[1] = &lk_unlk->session;
+       dev->func_data[2] = lk_unlk;
+
+       if (lk_unlk->session.sum)
+               ret = __opal_lock_unlock_sum(dev);
+       else
+               ret = __opal_lock_unlock(dev);
+
+       mutex_unlock(&dev->dev_lock);
+       return ret;
+}
+
+static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal)
+{
+       static const opal_step owner_funcs[] = {
+               opal_discovery0,
+               start_anybodyASP_opal_session,
+               get_msid_cpin_pin,
+               end_opal_session,
+               start_SIDASP_opal_session,
+               set_sid_cpin_pin,
+               end_opal_session,
+               NULL
+       };
+       void *data[6] = { NULL };
+       int ret;
+
+       if (!dev)
+               return -ENODEV;
+
+       mutex_lock(&dev->dev_lock);
+       setup_opal_dev(dev, owner_funcs);
+       dev->func_data = data;
+       dev->func_data[4] = opal;
+       dev->func_data[5] = opal;
+       ret = next(dev);
+       mutex_unlock(&dev->dev_lock);
+       return ret;
+}
+
+static int opal_activate_lsp(struct opal_dev *dev, struct opal_lr_act *opal_lr_act)
+{
+       void *data[4] = { NULL };
+       static const opal_step active_funcs[] = {
+               opal_discovery0,
+               start_SIDASP_opal_session, /* Open session as SID auth */
+               get_lsp_lifecycle,
+               activate_lsp,
+               end_opal_session,
+               NULL
+       };
+       int ret;
+
+       if (!opal_lr_act->num_lrs || opal_lr_act->num_lrs > OPAL_MAX_LRS)
+               return -EINVAL;
+
+       mutex_lock(&dev->dev_lock);
+       setup_opal_dev(dev, active_funcs);
+       dev->func_data = data;
+       dev->func_data[1] = &opal_lr_act->key;
+       dev->func_data[3] = opal_lr_act;
+       ret = next(dev);
+       mutex_unlock(&dev->dev_lock);
+       return ret;
+}
+
+static int opal_setup_locking_range(struct opal_dev *dev,
+                                   struct opal_user_lr_setup *opal_lrs)
+{
+       void *data[3] = { NULL };
+       static const opal_step lr_funcs[] = {
+               opal_discovery0,
+               start_auth_opal_session,
+               setup_locking_range,
+               end_opal_session,
+               NULL,
+       };
+       int ret;
+
+       mutex_lock(&dev->dev_lock);
+       setup_opal_dev(dev, lr_funcs);
+       dev->func_data = data;
+       dev->func_data[1] = &opal_lrs->session;
+       dev->func_data[2] = opal_lrs;
+       ret = next(dev);
+       mutex_unlock(&dev->dev_lock);
+       return ret;
+}
+
+static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
+{
+       static const opal_step pw_funcs[] = {
+               opal_discovery0,
+               start_auth_opal_session,
+               set_new_pw,
+               end_opal_session,
+               NULL
+       };
+       void *data[3] = { NULL };
+       int ret;
+
+       if (opal_pw->session.who < OPAL_ADMIN1 ||
+           opal_pw->session.who > OPAL_USER9  ||
+           opal_pw->new_user_pw.who < OPAL_ADMIN1 ||
+           opal_pw->new_user_pw.who > OPAL_USER9)
+               return -EINVAL;
+
+       mutex_lock(&dev->dev_lock);
+       setup_opal_dev(dev, pw_funcs);
+       dev->func_data = data;
+       dev->func_data[1] = (void *) &opal_pw->session;
+       dev->func_data[2] = (void *) &opal_pw->new_user_pw;
+
+       ret = next(dev);
+       mutex_unlock(&dev->dev_lock);
+       return ret;
+}
+
+static int opal_activate_user(struct opal_dev *dev,
+                             struct opal_session_info *opal_session)
+{
+       static const opal_step act_funcs[] = {
+               opal_discovery0,
+               start_admin1LSP_opal_session,
+               internal_activate_user,
+               end_opal_session,
+               NULL
+       };
+       void *data[3] = { NULL };
+       int ret;
+
+       /* We can't activate Admin1; it's active as manufactured */
+       if (opal_session->who < OPAL_USER1 ||
+           opal_session->who > OPAL_USER9) {
+               pr_err("Who was not a valid user: %d\n", opal_session->who);
+               return -EINVAL;
+       }
+
+       mutex_lock(&dev->dev_lock);
+       setup_opal_dev(dev, act_funcs);
+       dev->func_data = data;
+       dev->func_data[1] = &opal_session->opal_key;
+       dev->func_data[2] = opal_session;
+       ret = next(dev);
+       mutex_unlock(&dev->dev_lock);
+       return ret;
+}
+
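+/*
+ * Replay the lock/unlock requests stashed by IOC_OPAL_SAVE so that the
+ * saved locking ranges are unlocked again after a suspend/resume
+ * cycle. Returns true if any of the ranges failed to unlock.
+ */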
+bool opal_unlock_from_suspend(struct opal_dev *dev)
+{
+       struct opal_suspend_data *suspend;
+       void *func_data[3] = { NULL };
+       bool was_failure = false;
+       int ret = 0;
+
+       if (!dev)
+               return false;
+       if (!dev->supported)
+               return false;
+
+       mutex_lock(&dev->dev_lock);
+       setup_opal_dev(dev, NULL);
+       dev->func_data = func_data;
+
+       list_for_each_entry(suspend, &dev->unlk_lst, node) {
+               dev->state = 0;
+               dev->func_data[1] = &suspend->unlk.session;
+               dev->func_data[2] = &suspend->unlk;
+               dev->tsn = 0;
+               dev->hsn = 0;
+
+               if (suspend->unlk.session.sum)
+                       ret = __opal_lock_unlock_sum(dev);
+               else
+                       ret = __opal_lock_unlock(dev);
+               if (ret) {
+                       pr_warn("Failed to unlock LR %hhu with sum %d\n",
+                               suspend->unlk.session.opal_key.lr,
+                               suspend->unlk.session.sum);
+                       was_failure = true;
+               }
+       }
+       mutex_unlock(&dev->dev_lock);
+       return was_failure;
+}
+EXPORT_SYMBOL(opal_unlock_from_suspend);
+
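+/*
+ * Main ioctl entry point: requires CAP_SYS_ADMIN and a device that
+ * reported Opal support during discovery. The user buffer is copied in
+ * with memdup_user(), sized by _IOC_SIZE(cmd), and handed to the
+ * matching opal_* helper above.
+ */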
+int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
+{
+       void *p;
+       int ret = -ENOTTY;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EACCES;
+       if (!dev)
+               return -ENOTSUPP;
+       if (!dev->supported) {
+               pr_err("Not supported\n");
+               return -ENOTSUPP;
+       }
+
+       p = memdup_user(arg, _IOC_SIZE(cmd));
+       if (IS_ERR(p))
+               return PTR_ERR(p);
+
+       switch (cmd) {
+       case IOC_OPAL_SAVE:
+               ret = opal_save(dev, p);
+               break;
+       case IOC_OPAL_LOCK_UNLOCK:
+               ret = opal_lock_unlock(dev, p);
+               break;
+       case IOC_OPAL_TAKE_OWNERSHIP:
+               ret = opal_take_ownership(dev, p);
+               break;
+       case IOC_OPAL_ACTIVATE_LSP:
+               ret = opal_activate_lsp(dev, p);
+               break;
+       case IOC_OPAL_SET_PW:
+               ret = opal_set_new_pw(dev, p);
+               break;
+       case IOC_OPAL_ACTIVATE_USR:
+               ret = opal_activate_user(dev, p);
+               break;
+       case IOC_OPAL_REVERT_TPR:
+               ret = opal_reverttper(dev, p);
+               break;
+       case IOC_OPAL_LR_SETUP:
+               ret = opal_setup_locking_range(dev, p);
+               break;
+       case IOC_OPAL_ADD_USR_TO_LR:
+               ret = opal_add_user_to_lr(dev, p);
+               break;
+       case IOC_OPAL_ENABLE_DISABLE_MBR:
+               ret = opal_enable_disable_shadow_mbr(dev, p);
+               break;
+       case IOC_OPAL_ERASE_LR:
+               ret = opal_erase_locking_range(dev, p);
+               break;
+       case IOC_OPAL_SECURE_ERASE_LR:
+               ret = opal_secure_erase_locking_range(dev, p);
+               break;
+       default:
+               pr_warn("No such Opal Ioctl %u\n", cmd);
+       }
+
+       kfree(p);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(sed_ioctl);
index e5c5b8e..3a44438 100644 (file)
@@ -4074,41 +4074,27 @@ clean_up:
 
 static void cciss_interrupt_mode(ctlr_info_t *h)
 {
-#ifdef CONFIG_PCI_MSI
-       int err;
-       struct msix_entry cciss_msix_entries[4] = { {0, 0}, {0, 1},
-       {0, 2}, {0, 3}
-       };
+       int ret;
 
        /* Some boards advertise MSI but don't really support it */
        if ((h->board_id == 0x40700E11) || (h->board_id == 0x40800E11) ||
            (h->board_id == 0x40820E11) || (h->board_id == 0x40830E11))
                goto default_int_mode;
 
-       if (pci_find_capability(h->pdev, PCI_CAP_ID_MSIX)) {
-               err = pci_enable_msix_exact(h->pdev, cciss_msix_entries, 4);
-               if (!err) {
-                       h->intr[0] = cciss_msix_entries[0].vector;
-                       h->intr[1] = cciss_msix_entries[1].vector;
-                       h->intr[2] = cciss_msix_entries[2].vector;
-                       h->intr[3] = cciss_msix_entries[3].vector;
-                       h->msix_vector = 1;
-                       return;
-               } else {
-                       dev_warn(&h->pdev->dev,
-                               "MSI-X init failed %d\n", err);
-               }
-       }
-       if (pci_find_capability(h->pdev, PCI_CAP_ID_MSI)) {
-               if (!pci_enable_msi(h->pdev))
-                       h->msi_vector = 1;
-               else
-                       dev_warn(&h->pdev->dev, "MSI init failed\n");
+       ret = pci_alloc_irq_vectors(h->pdev, 4, 4, PCI_IRQ_MSIX);
+       if (ret >= 0) {
+               pr_err("Error building set MBR enable/disable command\n");
+               h->intr[1] = pci_irq_vector(h->pdev, 1);
+               h->intr[2] = pci_irq_vector(h->pdev, 2);
+               h->intr[3] = pci_irq_vector(h->pdev, 3);
+               return;
        }
+
+       ret = pci_alloc_irq_vectors(h->pdev, 1, 1, PCI_IRQ_MSI);
+
 default_int_mode:
-#endif                         /* CONFIG_PCI_MSI */
        /* if we get here we're going to use the default interrupt mode */
-       h->intr[h->intr_mode] = h->pdev->irq;
+       h->intr[h->intr_mode] = pci_irq_vector(h->pdev, 0);
        return;
 }
 
@@ -4888,7 +4874,7 @@ static int cciss_request_irq(ctlr_info_t *h,
        irqreturn_t (*msixhandler)(int, void *),
        irqreturn_t (*intxhandler)(int, void *))
 {
-       if (h->msix_vector || h->msi_vector) {
+       if (h->pdev->msi_enabled || h->pdev->msix_enabled) {
                if (!request_irq(h->intr[h->intr_mode], msixhandler,
                                0, h->devname, h))
                        return 0;
@@ -4934,12 +4920,7 @@ static void cciss_undo_allocations_after_kdump_soft_reset(ctlr_info_t *h)
        int ctlr = h->ctlr;
 
        free_irq(h->intr[h->intr_mode], h);
-#ifdef CONFIG_PCI_MSI
-       if (h->msix_vector)
-               pci_disable_msix(h->pdev);
-       else if (h->msi_vector)
-               pci_disable_msi(h->pdev);
-#endif /* CONFIG_PCI_MSI */
+       pci_free_irq_vectors(h->pdev);
        cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds);
        cciss_free_scatterlists(h);
        cciss_free_cmd_pool(h);
@@ -5295,12 +5276,7 @@ static void cciss_remove_one(struct pci_dev *pdev)
 
        cciss_shutdown(pdev);
 
-#ifdef CONFIG_PCI_MSI
-       if (h->msix_vector)
-               pci_disable_msix(h->pdev);
-       else if (h->msi_vector)
-               pci_disable_msi(h->pdev);
-#endif                         /* CONFIG_PCI_MSI */
+       pci_free_irq_vectors(h->pdev);
 
        iounmap(h->transtable);
        iounmap(h->cfgtable);
index 7fda30e..4affa94 100644 (file)
@@ -90,8 +90,6 @@ struct ctlr_info
 #      define SIMPLE_MODE_INT  2
 #      define MEMQ_MODE_INT    3
        unsigned int intr[4];
-       unsigned int msix_vector;
-       unsigned int msi_vector;
        int     intr_mode;
        int     cciss_max_sectors;
        BYTE    cciss_read;
@@ -333,7 +331,7 @@ static unsigned long SA5_performant_completed(ctlr_info_t *h)
         */
        register_value = readl(h->vaddr + SA5_OUTDB_STATUS);
        /* msi auto clears the interrupt pending bit. */
-       if (!(h->msi_vector || h->msix_vector)) {
+       if (!(h->pdev->msi_enabled || h->pdev->msix_enabled)) {
                writel(SA5_OUTDB_CLEAR_PERF_BIT, h->vaddr + SA5_OUTDB_CLEAR);
                /* Do a read in order to flush the write to the controller
                 * (as per spec.)
@@ -393,7 +391,7 @@ static bool SA5_performant_intr_pending(ctlr_info_t *h)
        if (!register_value)
                return false;
 
-       if (h->msi_vector || h->msix_vector)
+       if (h->pdev->msi_enabled || h->pdev->msix_enabled)
                return true;
 
        /* Read outbound doorbell to flush */
index a391a3c..184887a 100644 (file)
@@ -3119,7 +3119,7 @@ static int raw_cmd_copyin(int cmd, void __user *param,
        *rcmd = NULL;
 
 loop:
-       ptr = kmalloc(sizeof(struct floppy_raw_cmd), GFP_USER);
+       ptr = kmalloc(sizeof(struct floppy_raw_cmd), GFP_KERNEL);
        if (!ptr)
                return -ENOMEM;
        *rcmd = ptr;
index f347285..3043771 100644 (file)
@@ -1097,9 +1097,12 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
        if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE)
                return -EINVAL;
 
+       /* I/O need to be drained during transfer transition */
+       blk_mq_freeze_queue(lo->lo_queue);
+
        err = loop_release_xfer(lo);
        if (err)
-               return err;
+               goto exit;
 
        if (info->lo_encrypt_type) {
                unsigned int type = info->lo_encrypt_type;
@@ -1114,12 +1117,14 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 
        err = loop_init_xfer(lo, xfer, info);
        if (err)
-               return err;
+               goto exit;
 
        if (lo->lo_offset != info->lo_offset ||
            lo->lo_sizelimit != info->lo_sizelimit)
-               if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit))
-                       return -EFBIG;
+               if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
+                       err = -EFBIG;
+                       goto exit;
+               }
 
        loop_config_discard(lo);
 
@@ -1156,7 +1161,9 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
        /* update dio if lo_offset or transfer is changed */
        __loop_update_dio(lo, lo->use_dio);
 
-       return 0;
+ exit:
+       blk_mq_unfreeze_queue(lo->lo_queue);
+       return err;
 }
 
 static int
index c0e14e5..a67b7ea 100644 (file)
@@ -420,7 +420,8 @@ static void null_lnvm_end_io(struct request *rq, int error)
 {
        struct nvm_rq *rqd = rq->end_io_data;
 
-       nvm_end_io(rqd, error);
+       rqd->error = error;
+       nvm_end_io(rqd);
 
        blk_put_request(rq);
 }
@@ -460,7 +461,6 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
 
        id->ver_id = 0x1;
        id->vmnt = 0;
-       id->cgrps = 1;
        id->cap = 0x2;
        id->dom = 0x1;
 
@@ -479,7 +479,7 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
 
        sector_div(size, bs); /* convert size to pages */
        size >>= 8; /* concert size to pgs pr blk */
-       grp = &id->groups[0];
+       grp = &id->grp;
        grp->mtype = 0;
        grp->fmtype = 0;
        grp->num_ch = 1;
index 5fd2d0e..10aed84 100644 (file)
@@ -273,7 +273,7 @@ static const struct block_device_operations pcd_bdops = {
        .check_events   = pcd_block_check_events,
 };
 
-static struct cdrom_device_ops pcd_dops = {
+static const struct cdrom_device_ops pcd_dops = {
        .open           = pcd_open,
        .release        = pcd_release,
        .drive_status   = pcd_drive_status,
index 59cca72..bbbd3ca 100644 (file)
@@ -342,8 +342,8 @@ static void cdrom_sysctl_register(void);
 
 static LIST_HEAD(cdrom_list);
 
-static int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi,
-                                     struct packet_command *cgc)
+int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi,
+                              struct packet_command *cgc)
 {
        if (cgc->sense) {
                cgc->sense->sense_key = 0x05;
@@ -354,6 +354,7 @@ static int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi,
        cgc->stat = -EIO;
        return -EIO;
 }
+EXPORT_SYMBOL(cdrom_dummy_generic_packet);
 
 static int cdrom_flush_cache(struct cdrom_device_info *cdi)
 {
@@ -371,7 +372,7 @@ static int cdrom_flush_cache(struct cdrom_device_info *cdi)
 static int cdrom_get_disc_info(struct cdrom_device_info *cdi,
                               disc_information *di)
 {
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
        struct packet_command cgc;
        int ret, buflen;
 
@@ -586,7 +587,7 @@ static int cdrom_mrw_set_lba_space(struct cdrom_device_info *cdi, int space)
 int register_cdrom(struct cdrom_device_info *cdi)
 {
        static char banner_printed;
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
        int *change_capability = (int *)&cdo->capability; /* hack */
 
        cd_dbg(CD_OPEN, "entering register_cdrom\n");
@@ -610,7 +611,6 @@ int register_cdrom(struct cdrom_device_info *cdi)
        ENSURE(reset, CDC_RESET);
        ENSURE(generic_packet, CDC_GENERIC_PACKET);
        cdi->mc_flags = 0;
-       cdo->n_minors = 0;
        cdi->options = CDO_USE_FFLAGS;
 
        if (autoclose == 1 && CDROM_CAN(CDC_CLOSE_TRAY))
@@ -630,8 +630,7 @@ int register_cdrom(struct cdrom_device_info *cdi)
        else
                cdi->cdda_method = CDDA_OLD;
 
-       if (!cdo->generic_packet)
-               cdo->generic_packet = cdrom_dummy_generic_packet;
+       WARN_ON(!cdo->generic_packet);
 
        cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" registered\n", cdi->name);
        mutex_lock(&cdrom_mutex);
@@ -652,7 +651,6 @@ void unregister_cdrom(struct cdrom_device_info *cdi)
        if (cdi->exit)
                cdi->exit(cdi);
 
-       cdi->ops->n_minors--;
        cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" unregistered\n", cdi->name);
 }
 
@@ -1036,7 +1034,7 @@ static
 int open_for_data(struct cdrom_device_info *cdi)
 {
        int ret;
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
        tracktype tracks;
        cd_dbg(CD_OPEN, "entering open_for_data\n");
        /* Check if the driver can report drive status.  If it can, we
@@ -1198,8 +1196,8 @@ err:
 /* This code is similar to that in open_for_data. The routine is called
    whenever an audio play operation is requested.
 */
-static int check_for_audio_disc(struct cdrom_device_info * cdi,
-                               struct cdrom_device_ops * cdo)
+static int check_for_audio_disc(struct cdrom_device_info *cdi,
+                               const struct cdrom_device_ops *cdo)
 {
         int ret;
        tracktype tracks;
@@ -1254,7 +1252,7 @@ static int check_for_audio_disc(struct cdrom_device_info * cdi,
 
 void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode)
 {
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
        int opened_for_data;
 
        cd_dbg(CD_CLOSE, "entering cdrom_release\n");
@@ -1294,7 +1292,7 @@ static int cdrom_read_mech_status(struct cdrom_device_info *cdi,
                                  struct cdrom_changer_info *buf)
 {
        struct packet_command cgc;
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
        int length;
 
        /*
@@ -1643,7 +1641,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
        int ret;
        u_char buf[20];
        struct packet_command cgc;
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
        rpc_state_t rpc_state;
 
        memset(buf, 0, sizeof(buf));
@@ -1791,7 +1789,7 @@ static int dvd_read_physical(struct cdrom_device_info *cdi, dvd_struct *s,
 {
        unsigned char buf[21], *base;
        struct dvd_layer *layer;
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
        int ret, layer_num = s->physical.layer_num;
 
        if (layer_num >= DVD_LAYERS)
@@ -1842,7 +1840,7 @@ static int dvd_read_copyright(struct cdrom_device_info *cdi, dvd_struct *s,
 {
        int ret;
        u_char buf[8];
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
 
        init_cdrom_command(cgc, buf, sizeof(buf), CGC_DATA_READ);
        cgc->cmd[0] = GPCMD_READ_DVD_STRUCTURE;
@@ -1866,7 +1864,7 @@ static int dvd_read_disckey(struct cdrom_device_info *cdi, dvd_struct *s,
 {
        int ret, size;
        u_char *buf;
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
 
        size = sizeof(s->disckey.value) + 4;
 
@@ -1894,7 +1892,7 @@ static int dvd_read_bca(struct cdrom_device_info *cdi, dvd_struct *s,
 {
        int ret, size = 4 + 188;
        u_char *buf;
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
 
        buf = kmalloc(size, GFP_KERNEL);
        if (!buf)
@@ -1928,7 +1926,7 @@ static int dvd_read_manufact(struct cdrom_device_info *cdi, dvd_struct *s,
 {
        int ret = 0, size;
        u_char *buf;
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
 
        size = sizeof(s->manufact.value) + 4;
 
@@ -1995,7 +1993,7 @@ int cdrom_mode_sense(struct cdrom_device_info *cdi,
                     struct packet_command *cgc,
                     int page_code, int page_control)
 {
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
 
        memset(cgc->cmd, 0, sizeof(cgc->cmd));
 
@@ -2010,7 +2008,7 @@ int cdrom_mode_sense(struct cdrom_device_info *cdi,
 int cdrom_mode_select(struct cdrom_device_info *cdi,
                      struct packet_command *cgc)
 {
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
 
        memset(cgc->cmd, 0, sizeof(cgc->cmd));
        memset(cgc->buffer, 0, 2);
@@ -2025,7 +2023,7 @@ int cdrom_mode_select(struct cdrom_device_info *cdi,
 static int cdrom_read_subchannel(struct cdrom_device_info *cdi,
                                 struct cdrom_subchnl *subchnl, int mcn)
 {
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
        struct packet_command cgc;
        char buffer[32];
        int ret;
@@ -2073,7 +2071,7 @@ static int cdrom_read_cd(struct cdrom_device_info *cdi,
                         struct packet_command *cgc, int lba,
                         int blocksize, int nblocks)
 {
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
 
        memset(&cgc->cmd, 0, sizeof(cgc->cmd));
        cgc->cmd[0] = GPCMD_READ_10;
@@ -2093,7 +2091,7 @@ static int cdrom_read_block(struct cdrom_device_info *cdi,
                            struct packet_command *cgc,
                            int lba, int nblocks, int format, int blksize)
 {
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
 
        memset(&cgc->cmd, 0, sizeof(cgc->cmd));
        cgc->cmd[0] = GPCMD_READ_CD;
@@ -2764,7 +2762,7 @@ static int cdrom_ioctl_audioctl(struct cdrom_device_info *cdi,
  */
 static int cdrom_switch_blocksize(struct cdrom_device_info *cdi, int size)
 {
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
        struct packet_command cgc;
        struct modesel_head mh;
 
@@ -2790,7 +2788,7 @@ static int cdrom_switch_blocksize(struct cdrom_device_info *cdi, int size)
 static int cdrom_get_track_info(struct cdrom_device_info *cdi,
                                __u16 track, __u8 type, track_information *ti)
 {
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
        struct packet_command cgc;
        int ret, buflen;
 
@@ -3049,7 +3047,7 @@ static noinline int mmc_ioctl_cdrom_play_msf(struct cdrom_device_info *cdi,
                                             void __user *arg,
                                             struct packet_command *cgc)
 {
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
        struct cdrom_msf msf;
        cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYMSF\n");
        if (copy_from_user(&msf, (struct cdrom_msf __user *)arg, sizeof(msf)))
@@ -3069,7 +3067,7 @@ static noinline int mmc_ioctl_cdrom_play_blk(struct cdrom_device_info *cdi,
                                             void __user *arg,
                                             struct packet_command *cgc)
 {
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
        struct cdrom_blk blk;
        cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYBLK\n");
        if (copy_from_user(&blk, (struct cdrom_blk __user *)arg, sizeof(blk)))
@@ -3164,7 +3162,7 @@ static noinline int mmc_ioctl_cdrom_start_stop(struct cdrom_device_info *cdi,
                                               struct packet_command *cgc,
                                               int cmd)
 {
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
        cd_dbg(CD_DO_IOCTL, "entering CDROMSTART/CDROMSTOP\n");
        cgc->cmd[0] = GPCMD_START_STOP_UNIT;
        cgc->cmd[1] = 1;
@@ -3177,7 +3175,7 @@ static noinline int mmc_ioctl_cdrom_pause_resume(struct cdrom_device_info *cdi,
                                                 struct packet_command *cgc,
                                                 int cmd)
 {
-       struct cdrom_device_ops *cdo = cdi->ops;
+       const struct cdrom_device_ops *cdo = cdi->ops;
        cd_dbg(CD_DO_IOCTL, "entering CDROMPAUSE/CDROMRESUME\n");
        cgc->cmd[0] = GPCMD_PAUSE_RESUME;
        cgc->cmd[8] = (cmd == CDROMRESUME) ? 1 : 0;
index 584bc31..1afab65 100644 (file)
@@ -481,7 +481,7 @@ static int gdrom_audio_ioctl(struct cdrom_device_info *cdi, unsigned int cmd,
        return -EINVAL;
 }
 
-static struct cdrom_device_ops gdrom_ops = {
+static const struct cdrom_device_ops gdrom_ops = {
        .open                   = gdrom_open,
        .release                = gdrom_release,
        .drive_status           = gdrom_drivestatus,
@@ -489,9 +489,9 @@ static struct cdrom_device_ops gdrom_ops = {
        .get_last_session       = gdrom_get_last_session,
        .reset                  = gdrom_hardreset,
        .audio_ioctl            = gdrom_audio_ioctl,
+       .generic_packet         = cdrom_dummy_generic_packet,
        .capability             = CDC_MULTI_SESSION | CDC_MEDIA_CHANGED |
                                  CDC_RESET | CDC_DRIVE_STATUS | CDC_CD_R,
-       .n_minors               = 1,
 };
 
 static int gdrom_bdops_open(struct block_device *bdev, fmode_t mode)
@@ -807,16 +807,20 @@ static int probe_gdrom(struct platform_device *devptr)
        if (err)
                goto probe_fail_cmdirq_register;
        gd.gdrom_rq = blk_init_queue(gdrom_request, &gdrom_lock);
-       if (!gd.gdrom_rq)
+       if (!gd.gdrom_rq) {
+               err = -ENOMEM;
                goto probe_fail_requestq;
+       }
 
        err = probe_gdrom_setupqueue();
        if (err)
                goto probe_fail_toc;
 
        gd.toc = kzalloc(sizeof(struct gdromtoc), GFP_KERNEL);
-       if (!gd.toc)
+       if (!gd.toc) {
+               err = -ENOMEM;
                goto probe_fail_toc;
+       }
        add_disk(gd.disk);
        return 0;
 
index 9cbd217..ab9232e 100644 (file)
@@ -1166,7 +1166,7 @@ void ide_cdrom_update_speed(ide_drive_t *drive, u8 *buf)
         CDC_CD_RW | CDC_DVD | CDC_DVD_R | CDC_DVD_RAM | CDC_GENERIC_PACKET | \
         CDC_MO_DRIVE | CDC_MRW | CDC_MRW_W | CDC_RAM)
 
-static struct cdrom_device_ops ide_cdrom_dops = {
+static const struct cdrom_device_ops ide_cdrom_dops = {
        .open                   = ide_cdrom_open_real,
        .release                = ide_cdrom_release_real,
        .drive_status           = ide_cdrom_drive_status,
index 2f5d5f4..0527141 100644 (file)
@@ -26,15 +26,6 @@ config NVM_DEBUG
 
        It is required to create/remove targets without IOCTLs.
 
-config NVM_GENNVM
-       tristate "General Non-Volatile Memory Manager for Open-Channel SSDs"
-       ---help---
-       Non-volatile memory media manager for Open-Channel SSDs that implements
-       physical media metadata management and block provisioning API.
-
-       This is the standard media manager for using Open-Channel SSDs, and
-       required for targets to be instantiated.
-
 config NVM_RRPC
        tristate "Round-robin Hybrid Open-Channel SSD target"
        ---help---
index a7a0a22..b2a39e2 100644 (file)
@@ -2,6 +2,5 @@
 # Makefile for Open-Channel SSDs.
 #
 
-obj-$(CONFIG_NVM)              := core.o sysblk.o
-obj-$(CONFIG_NVM_GENNVM)       += gennvm.o
+obj-$(CONFIG_NVM)              := core.o
 obj-$(CONFIG_NVM_RRPC)         += rrpc.o
index 02240a0..5262ba6 100644 (file)
 
 static LIST_HEAD(nvm_tgt_types);
 static DECLARE_RWSEM(nvm_tgtt_lock);
-static LIST_HEAD(nvm_mgrs);
 static LIST_HEAD(nvm_devices);
 static DECLARE_RWSEM(nvm_lock);
 
+/* Map between virtual and physical channel and lun */
+struct nvm_ch_map {
+       int ch_off;
+       int nr_luns;
+       int *lun_offs;
+};
+
+struct nvm_dev_map {
+       struct nvm_ch_map *chnls;
+       int nr_chnls;
+};
+
+struct nvm_area {
+       struct list_head list;
+       sector_t begin;
+       sector_t end;   /* end is excluded */
+};
+
+static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name)
+{
+       struct nvm_target *tgt;
+
+       list_for_each_entry(tgt, &dev->targets, list)
+               if (!strcmp(name, tgt->disk->disk_name))
+                       return tgt;
+
+       return NULL;
+}
+
+static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end)
+{
+       int i;
+
+       for (i = lun_begin; i <= lun_end; i++) {
+               if (test_and_set_bit(i, dev->lun_map)) {
+                       pr_err("nvm: lun %d already allocated\n", i);
+                       goto err;
+               }
+       }
+
+       return 0;
+err:
+       while (--i >= lun_begin)
+               clear_bit(i, dev->lun_map);
+
+       return -EBUSY;
+}
+
+static void nvm_release_luns_err(struct nvm_dev *dev, int lun_begin,
+                                int lun_end)
+{
+       int i;
+
+       for (i = lun_begin; i <= lun_end; i++)
+               WARN_ON(!test_and_clear_bit(i, dev->lun_map));
+}
+
+static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev)
+{
+       struct nvm_dev *dev = tgt_dev->parent;
+       struct nvm_dev_map *dev_map = tgt_dev->map;
+       int i, j;
+
+       for (i = 0; i < dev_map->nr_chnls; i++) {
+               struct nvm_ch_map *ch_map = &dev_map->chnls[i];
+               int *lun_offs = ch_map->lun_offs;
+               int ch = i + ch_map->ch_off;
+
+               for (j = 0; j < ch_map->nr_luns; j++) {
+                       int lun = j + lun_offs[j];
+                       int lunid = (ch * dev->geo.luns_per_chnl) + lun;
+
+                       WARN_ON(!test_and_clear_bit(lunid, dev->lun_map));
+               }
+
+               kfree(ch_map->lun_offs);
+       }
+
+       kfree(dev_map->chnls);
+       kfree(dev_map);
+
+       kfree(tgt_dev->luns);
+       kfree(tgt_dev);
+}
+
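+/*
+ * Build the target's private view of the device: an nvm_dev_map that
+ * records which physical channels/luns back the target's virtual ones,
+ * a ppa_addr list for those luns, and an adjusted geometry
+ * (luns_per_chnl is set to -1 when the channels are unevenly
+ * populated).
+ */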
+static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
+                                             int lun_begin, int lun_end)
+{
+       struct nvm_tgt_dev *tgt_dev = NULL;
+       struct nvm_dev_map *dev_rmap = dev->rmap;
+       struct nvm_dev_map *dev_map;
+       struct ppa_addr *luns;
+       int nr_luns = lun_end - lun_begin + 1;
+       int luns_left = nr_luns;
+       int nr_chnls = nr_luns / dev->geo.luns_per_chnl;
+       int nr_chnls_mod = nr_luns % dev->geo.luns_per_chnl;
+       int bch = lun_begin / dev->geo.luns_per_chnl;
+       int blun = lun_begin % dev->geo.luns_per_chnl;
+       int lunid = 0;
+       int lun_balanced = 1;
+       int prev_nr_luns;
+       int i, j;
+
+       nr_chnls = nr_luns / dev->geo.luns_per_chnl;
+       nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1;
+
+       dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL);
+       if (!dev_map)
+               goto err_dev;
+
+       dev_map->chnls = kcalloc(nr_chnls, sizeof(struct nvm_ch_map),
+                                                               GFP_KERNEL);
+       if (!dev_map->chnls)
+               goto err_chnls;
+
+       luns = kcalloc(nr_luns, sizeof(struct ppa_addr), GFP_KERNEL);
+       if (!luns)
+               goto err_luns;
+
+       prev_nr_luns = (luns_left > dev->geo.luns_per_chnl) ?
+                                       dev->geo.luns_per_chnl : luns_left;
+       for (i = 0; i < nr_chnls; i++) {
+               struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch];
+               int *lun_roffs = ch_rmap->lun_offs;
+               struct nvm_ch_map *ch_map = &dev_map->chnls[i];
+               int *lun_offs;
+               int luns_in_chnl = (luns_left > dev->geo.luns_per_chnl) ?
+                                       dev->geo.luns_per_chnl : luns_left;
+
+               if (lun_balanced && prev_nr_luns != luns_in_chnl)
+                       lun_balanced = 0;
+
+               ch_map->ch_off = ch_rmap->ch_off = bch;
+               ch_map->nr_luns = luns_in_chnl;
+
+               lun_offs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL);
+               if (!lun_offs)
+                       goto err_ch;
+
+               for (j = 0; j < luns_in_chnl; j++) {
+                       luns[lunid].ppa = 0;
+                       luns[lunid].g.ch = i;
+                       luns[lunid++].g.lun = j;
+
+                       lun_offs[j] = blun;
+                       lun_roffs[j + blun] = blun;
+               }
+
+               ch_map->lun_offs = lun_offs;
+
+               /* when starting a new channel, lun offset is reset */
+               blun = 0;
+               luns_left -= luns_in_chnl;
+       }
+
+       dev_map->nr_chnls = nr_chnls;
+
+       tgt_dev = kmalloc(sizeof(struct nvm_tgt_dev), GFP_KERNEL);
+       if (!tgt_dev)
+               goto err_ch;
+
+       memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo));
+       /* Target device only owns a portion of the physical device */
+       tgt_dev->geo.nr_chnls = nr_chnls;
+       tgt_dev->geo.nr_luns = nr_luns;
+       tgt_dev->geo.luns_per_chnl = (lun_balanced) ? prev_nr_luns : -1;
+       tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun;
+       tgt_dev->q = dev->q;
+       tgt_dev->map = dev_map;
+       tgt_dev->luns = luns;
+       memcpy(&tgt_dev->identity, &dev->identity, sizeof(struct nvm_id));
+
+       tgt_dev->parent = dev;
+
+       return tgt_dev;
+err_ch:
+       while (--i >= 0)
+               kfree(dev_map->chnls[i].lun_offs);
+       kfree(luns);
+err_luns:
+       kfree(dev_map->chnls);
+err_chnls:
+       kfree(dev_map);
+err_dev:
+       return tgt_dev;
+}
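+
+/*
+ * nvm_create_tgt_dev() splits the LUN range [lun_begin, lun_end] into
+ * whole target channels: nr_chnls is the ceiling of nr_luns over
+ * luns_per_chnl. For example (illustrative geometry), 10 LUNs on a device
+ * with 8 LUNs per channel yield 2 target channels, and the per-channel
+ * ch_off/lun_offs entries record how each target channel maps back onto
+ * the parent device.
+ */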
+
+static const struct block_device_operations nvm_fops = {
+       .owner          = THIS_MODULE,
+};
+
+static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
+{
+       struct nvm_ioctl_create_simple *s = &create->conf.s;
+       struct request_queue *tqueue;
+       struct gendisk *tdisk;
+       struct nvm_tgt_type *tt;
+       struct nvm_target *t;
+       struct nvm_tgt_dev *tgt_dev;
+       void *targetdata;
+
+       tt = nvm_find_target_type(create->tgttype, 1);
+       if (!tt) {
+               pr_err("nvm: target type %s not found\n", create->tgttype);
+               return -EINVAL;
+       }
+
+       mutex_lock(&dev->mlock);
+       t = nvm_find_target(dev, create->tgtname);
+       if (t) {
+               pr_err("nvm: target name already exists.\n");
+               mutex_unlock(&dev->mlock);
+               return -EINVAL;
+       }
+       mutex_unlock(&dev->mlock);
+
+       if (nvm_reserve_luns(dev, s->lun_begin, s->lun_end))
+               return -ENOMEM;
+
+       t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL);
+       if (!t)
+               goto err_reserve;
+
+       tgt_dev = nvm_create_tgt_dev(dev, s->lun_begin, s->lun_end);
+       if (!tgt_dev) {
+               pr_err("nvm: could not create target device\n");
+               goto err_t;
+       }
+
+       tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
+       if (!tqueue)
+               goto err_dev;
+       blk_queue_make_request(tqueue, tt->make_rq);
+
+       tdisk = alloc_disk(0);
+       if (!tdisk)
+               goto err_queue;
+
+       sprintf(tdisk->disk_name, "%s", create->tgtname);
+       tdisk->flags = GENHD_FL_EXT_DEVT;
+       tdisk->major = 0;
+       tdisk->first_minor = 0;
+       tdisk->fops = &nvm_fops;
+       tdisk->queue = tqueue;
+
+       targetdata = tt->init(tgt_dev, tdisk);
+       if (IS_ERR(targetdata))
+               goto err_init;
+
+       tdisk->private_data = targetdata;
+       tqueue->queuedata = targetdata;
+
+       blk_queue_max_hw_sectors(tqueue, 8 * dev->ops->max_phys_sect);
+
+       set_capacity(tdisk, tt->capacity(targetdata));
+       add_disk(tdisk);
+
+       if (tt->sysfs_init && tt->sysfs_init(tdisk))
+               goto err_sysfs;
+
+       t->type = tt;
+       t->disk = tdisk;
+       t->dev = tgt_dev;
+
+       mutex_lock(&dev->mlock);
+       list_add_tail(&t->list, &dev->targets);
+       mutex_unlock(&dev->mlock);
+
+       return 0;
+err_sysfs:
+       if (tt->exit)
+               tt->exit(targetdata);
+err_init:
+       put_disk(tdisk);
+err_queue:
+       blk_cleanup_queue(tqueue);
+err_dev:
+       nvm_remove_tgt_dev(tgt_dev);
+err_t:
+       kfree(t);
+err_reserve:
+       nvm_release_luns_err(dev, s->lun_begin, s->lun_end);
+       return -ENOMEM;
+}
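+
+/*
+ * Note on the disk created in nvm_create_tgt() above: alloc_disk(0)
+ * together with GENHD_FL_EXT_DEVT means the target gets a dynamically
+ * allocated extended dev_t rather than a reserved minor range, and shows
+ * up as a regular block device named after create->tgtname.
+ */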
+
+static void __nvm_remove_target(struct nvm_target *t)
+{
+       struct nvm_tgt_type *tt = t->type;
+       struct gendisk *tdisk = t->disk;
+       struct request_queue *q = tdisk->queue;
+
+       del_gendisk(tdisk);
+       blk_cleanup_queue(q);
+
+       if (tt->sysfs_exit)
+               tt->sysfs_exit(tdisk);
+
+       if (tt->exit)
+               tt->exit(tdisk->private_data);
+
+       nvm_remove_tgt_dev(t->dev);
+       put_disk(tdisk);
+
+       list_del(&t->list);
+       kfree(t);
+}
+
+/**
+ * nvm_remove_tgt - Removes a target from the given device
+ * @dev:       device
+ * @remove:    ioctl structure with target name to remove.
+ *
+ * Returns:
+ * 0: on success
+ * 1: if the target with the given name is not found on this device
+ * <0: on error
+ */
+static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove)
+{
+       struct nvm_target *t;
+
+       mutex_lock(&dev->mlock);
+       t = nvm_find_target(dev, remove->tgtname);
+       if (!t) {
+               mutex_unlock(&dev->mlock);
+               return 1;
+       }
+       __nvm_remove_target(t);
+       mutex_unlock(&dev->mlock);
+
+       return 0;
+}
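+
+/*
+ * A return value of 1 from nvm_remove_tgt() simply means "no such target
+ * on this device"; the NVM_DEV_REMOVE ioctl path below walks every
+ * registered device, moving on whenever nvm_remove_tgt() returns
+ * non-zero, and stops at the first device that actually removes the
+ * target.
+ */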
+
+static int nvm_register_map(struct nvm_dev *dev)
+{
+       struct nvm_dev_map *rmap;
+       int i, j;
+
+       rmap = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL);
+       if (!rmap)
+               goto err_rmap;
+
+       rmap->chnls = kcalloc(dev->geo.nr_chnls, sizeof(struct nvm_ch_map),
+                                                               GFP_KERNEL);
+       if (!rmap->chnls)
+               goto err_chnls;
+
+       for (i = 0; i < dev->geo.nr_chnls; i++) {
+               struct nvm_ch_map *ch_rmap;
+               int *lun_roffs;
+               int luns_in_chnl = dev->geo.luns_per_chnl;
+
+               ch_rmap = &rmap->chnls[i];
+
+               ch_rmap->ch_off = -1;
+               ch_rmap->nr_luns = luns_in_chnl;
+
+               lun_roffs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL);
+               if (!lun_roffs)
+                       goto err_ch;
+
+               for (j = 0; j < luns_in_chnl; j++)
+                       lun_roffs[j] = -1;
+
+               ch_rmap->lun_offs = lun_roffs;
+       }
+
+       dev->rmap = rmap;
+
+       return 0;
+err_ch:
+       while (--i >= 0)
+               kfree(rmap->chnls[i].lun_offs);
+err_chnls:
+       kfree(rmap);
+err_rmap:
+       return -ENOMEM;
+}
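+
+/*
+ * In the reverse map built above, a ch_off/lun_offs entry of -1 marks a
+ * channel or LUN that no target currently owns; nvm_create_tgt_dev()
+ * overwrites these entries with real offsets once a target claims the
+ * corresponding LUN range.
+ */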
+
+static void nvm_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
+{
+       struct nvm_dev_map *dev_map = tgt_dev->map;
+       struct nvm_ch_map *ch_map = &dev_map->chnls[p->g.ch];
+       int lun_off = ch_map->lun_offs[p->g.lun];
+
+       p->g.ch += ch_map->ch_off;
+       p->g.lun += lun_off;
+}
+
+static void nvm_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
+{
+       struct nvm_dev *dev = tgt_dev->parent;
+       struct nvm_dev_map *dev_rmap = dev->rmap;
+       struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[p->g.ch];
+       int lun_roff = ch_rmap->lun_offs[p->g.lun];
+
+       p->g.ch -= ch_rmap->ch_off;
+       p->g.lun -= lun_roff;
+}
+
+static void nvm_ppa_tgt_to_dev(struct nvm_tgt_dev *tgt_dev,
+                               struct ppa_addr *ppa_list, int nr_ppas)
+{
+       int i;
+
+       for (i = 0; i < nr_ppas; i++) {
+               nvm_map_to_dev(tgt_dev, &ppa_list[i]);
+               ppa_list[i] = generic_to_dev_addr(tgt_dev, ppa_list[i]);
+       }
+}
+
+static void nvm_ppa_dev_to_tgt(struct nvm_tgt_dev *tgt_dev,
+                               struct ppa_addr *ppa_list, int nr_ppas)
+{
+       int i;
+
+       for (i = 0; i < nr_ppas; i++) {
+               ppa_list[i] = dev_to_generic_addr(tgt_dev, ppa_list[i]);
+               nvm_map_to_tgt(tgt_dev, &ppa_list[i]);
+       }
+}
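+
+/*
+ * The two helpers above implement the full address conversion used on the
+ * I/O path: on submission each PPA is first remapped from target-relative
+ * to device channel/LUN numbering and then packed into the device format
+ * via generic_to_dev_addr(); on completion the same steps are undone in
+ * reverse order.
+ */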
+
+static void nvm_rq_tgt_to_dev(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
+{
+       if (rqd->nr_ppas == 1) {
+               nvm_ppa_tgt_to_dev(tgt_dev, &rqd->ppa_addr, 1);
+               return;
+       }
+
+       nvm_ppa_tgt_to_dev(tgt_dev, rqd->ppa_list, rqd->nr_ppas);
+}
+
+static void nvm_rq_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
+{
+       if (rqd->nr_ppas == 1) {
+               nvm_ppa_dev_to_tgt(tgt_dev, &rqd->ppa_addr, 1);
+               return;
+       }
+
+       nvm_ppa_dev_to_tgt(tgt_dev, rqd->ppa_list, rqd->nr_ppas);
+}
+
+void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
+                    int len)
+{
+       struct nvm_geo *geo = &dev->geo;
+       struct nvm_dev_map *dev_rmap = dev->rmap;
+       u64 i;
+
+       for (i = 0; i < len; i++) {
+               struct nvm_ch_map *ch_rmap;
+               int *lun_roffs;
+               struct ppa_addr gaddr;
+               u64 pba = le64_to_cpu(entries[i]);
+               int off;
+               u64 diff;
+
+               if (!pba)
+                       continue;
+
+               gaddr = linear_to_generic_addr(geo, pba);
+               ch_rmap = &dev_rmap->chnls[gaddr.g.ch];
+               lun_roffs = ch_rmap->lun_offs;
+
+               off = gaddr.g.ch * geo->luns_per_chnl + gaddr.g.lun;
+
+               diff = ((ch_rmap->ch_off * geo->luns_per_chnl) +
+                               (lun_roffs[gaddr.g.lun])) * geo->sec_per_lun;
+
+               entries[i] -= cpu_to_le64(diff);
+       }
+}
+EXPORT_SYMBOL(nvm_part_to_tgt);
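+
+/*
+ * Illustrative example for nvm_part_to_tgt() (made-up geometry): with
+ * ch_off = 1, lun_roffs[lun] = 0, luns_per_chnl = 8 and sec_per_lun =
+ * 4096, every non-zero L2P entry is shifted down by (1 * 8 + 0) * 4096 =
+ * 32768 sectors, i.e. by the device sectors that precede the target's
+ * first LUN on the parent device.
+ */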
+
 struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock)
 {
        struct nvm_tgt_type *tmp, *tt = NULL;
@@ -92,78 +565,6 @@ void nvm_dev_dma_free(struct nvm_dev *dev, void *addr, dma_addr_t dma_handler)
 }
 EXPORT_SYMBOL(nvm_dev_dma_free);
 
-static struct nvmm_type *nvm_find_mgr_type(const char *name)
-{
-       struct nvmm_type *mt;
-
-       list_for_each_entry(mt, &nvm_mgrs, list)
-               if (!strcmp(name, mt->name))
-                       return mt;
-
-       return NULL;
-}
-
-static struct nvmm_type *nvm_init_mgr(struct nvm_dev *dev)
-{
-       struct nvmm_type *mt;
-       int ret;
-
-       lockdep_assert_held(&nvm_lock);
-
-       list_for_each_entry(mt, &nvm_mgrs, list) {
-               if (strncmp(dev->sb.mmtype, mt->name, NVM_MMTYPE_LEN))
-                       continue;
-
-               ret = mt->register_mgr(dev);
-               if (ret < 0) {
-                       pr_err("nvm: media mgr failed to init (%d) on dev %s\n",
-                                                               ret, dev->name);
-                       return NULL; /* initialization failed */
-               } else if (ret > 0)
-                       return mt;
-       }
-
-       return NULL;
-}
-
-int nvm_register_mgr(struct nvmm_type *mt)
-{
-       struct nvm_dev *dev;
-       int ret = 0;
-
-       down_write(&nvm_lock);
-       if (nvm_find_mgr_type(mt->name)) {
-               ret = -EEXIST;
-               goto finish;
-       } else {
-               list_add(&mt->list, &nvm_mgrs);
-       }
-
-       /* try to register media mgr if any device have none configured */
-       list_for_each_entry(dev, &nvm_devices, devices) {
-               if (dev->mt)
-                       continue;
-
-               dev->mt = nvm_init_mgr(dev);
-       }
-finish:
-       up_write(&nvm_lock);
-
-       return ret;
-}
-EXPORT_SYMBOL(nvm_register_mgr);
-
-void nvm_unregister_mgr(struct nvmm_type *mt)
-{
-       if (!mt)
-               return;
-
-       down_write(&nvm_lock);
-       list_del(&mt->list);
-       up_write(&nvm_lock);
-}
-EXPORT_SYMBOL(nvm_unregister_mgr);
-
 static struct nvm_dev *nvm_find_nvm_dev(const char *name)
 {
        struct nvm_dev *dev;
@@ -175,53 +576,6 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name)
        return NULL;
 }
 
-static void nvm_tgt_generic_to_addr_mode(struct nvm_tgt_dev *tgt_dev,
-                                        struct nvm_rq *rqd)
-{
-       struct nvm_dev *dev = tgt_dev->parent;
-       int i;
-
-       if (rqd->nr_ppas > 1) {
-               for (i = 0; i < rqd->nr_ppas; i++) {
-                       rqd->ppa_list[i] = dev->mt->trans_ppa(tgt_dev,
-                                       rqd->ppa_list[i], TRANS_TGT_TO_DEV);
-                       rqd->ppa_list[i] = generic_to_dev_addr(dev,
-                                                       rqd->ppa_list[i]);
-               }
-       } else {
-               rqd->ppa_addr = dev->mt->trans_ppa(tgt_dev, rqd->ppa_addr,
-                                               TRANS_TGT_TO_DEV);
-               rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr);
-       }
-}
-
-int nvm_set_bb_tbl(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas,
-                                                               int type)
-{
-       struct nvm_rq rqd;
-       int ret;
-
-       if (nr_ppas > dev->ops->max_phys_sect) {
-               pr_err("nvm: unable to update all sysblocks atomically\n");
-               return -EINVAL;
-       }
-
-       memset(&rqd, 0, sizeof(struct nvm_rq));
-
-       nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1);
-       nvm_generic_to_addr_mode(dev, &rqd);
-
-       ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type);
-       nvm_free_rqd_ppalist(dev, &rqd);
-       if (ret) {
-               pr_err("nvm: sysblk failed bb mark\n");
-               return -EINVAL;
-       }
-
-       return 0;
-}
-EXPORT_SYMBOL(nvm_set_bb_tbl);
-
 int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
                       int nr_ppas, int type)
 {
@@ -237,12 +591,12 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
        memset(&rqd, 0, sizeof(struct nvm_rq));
 
        nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1);
-       nvm_tgt_generic_to_addr_mode(tgt_dev, &rqd);
+       nvm_rq_tgt_to_dev(tgt_dev, &rqd);
 
        ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type);
        nvm_free_rqd_ppalist(dev, &rqd);
        if (ret) {
-               pr_err("nvm: sysblk failed bb mark\n");
+               pr_err("nvm: failed bb mark\n");
                return -EINVAL;
        }
 
@@ -262,15 +616,42 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
 {
        struct nvm_dev *dev = tgt_dev->parent;
 
-       return dev->mt->submit_io(tgt_dev, rqd);
+       if (!dev->ops->submit_io)
+               return -ENODEV;
+
+       nvm_rq_tgt_to_dev(tgt_dev, rqd);
+
+       rqd->dev = tgt_dev;
+       return dev->ops->submit_io(dev, rqd);
 }
 EXPORT_SYMBOL(nvm_submit_io);
 
-int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p, int flags)
+int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int flags)
 {
        struct nvm_dev *dev = tgt_dev->parent;
+       struct nvm_rq rqd;
+       int ret;
+
+       if (!dev->ops->erase_block)
+               return 0;
+
+       nvm_map_to_dev(tgt_dev, ppas);
+
+       memset(&rqd, 0, sizeof(struct nvm_rq));
+
+       ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, 1, 1);
+       if (ret)
+               return ret;
+
+       nvm_rq_tgt_to_dev(tgt_dev, &rqd);
+
+       rqd.flags = flags;
+
+       ret = dev->ops->erase_block(dev, &rqd);
 
-       return dev->mt->erase_blk(tgt_dev, p, flags);
+       nvm_free_rqd_ppalist(dev, &rqd);
+
+       return ret;
 }
 EXPORT_SYMBOL(nvm_erase_blk);
 
@@ -289,46 +670,67 @@ EXPORT_SYMBOL(nvm_get_l2p_tbl);
 int nvm_get_area(struct nvm_tgt_dev *tgt_dev, sector_t *lba, sector_t len)
 {
        struct nvm_dev *dev = tgt_dev->parent;
+       struct nvm_geo *geo = &dev->geo;
+       struct nvm_area *area, *prev, *next;
+       sector_t begin = 0;
+       sector_t max_sectors = (geo->sec_size * dev->total_secs) >> 9;
 
-       return dev->mt->get_area(dev, lba, len);
-}
-EXPORT_SYMBOL(nvm_get_area);
+       if (len > max_sectors)
+               return -EINVAL;
 
-void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t lba)
-{
-       struct nvm_dev *dev = tgt_dev->parent;
+       area = kmalloc(sizeof(struct nvm_area), GFP_KERNEL);
+       if (!area)
+               return -ENOMEM;
 
-       dev->mt->put_area(dev, lba);
-}
-EXPORT_SYMBOL(nvm_put_area);
+       prev = NULL;
 
-void nvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
-{
-       int i;
+       spin_lock(&dev->lock);
+       list_for_each_entry(next, &dev->area_list, list) {
+               if (begin + len > next->begin) {
+                       begin = next->end;
+                       prev = next;
+                       continue;
+               }
+               break;
+       }
 
-       if (rqd->nr_ppas > 1) {
-               for (i = 0; i < rqd->nr_ppas; i++)
-                       rqd->ppa_list[i] = dev_to_generic_addr(dev,
-                                                       rqd->ppa_list[i]);
-       } else {
-               rqd->ppa_addr = dev_to_generic_addr(dev, rqd->ppa_addr);
+       if ((begin + len) > max_sectors) {
+               spin_unlock(&dev->lock);
+               kfree(area);
+               return -EINVAL;
        }
+
+       area->begin = *lba = begin;
+       area->end = begin + len;
+
+       if (prev) /* insert into sorted order */
+               list_add(&area->list, &prev->list);
+       else
+               list_add(&area->list, &dev->area_list);
+       spin_unlock(&dev->lock);
+
+       return 0;
 }
-EXPORT_SYMBOL(nvm_addr_to_generic_mode);
+EXPORT_SYMBOL(nvm_get_area);
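+
+/*
+ * nvm_get_area() is a simple first-fit allocator over the sorted
+ * area_list. Illustrative example (made-up numbers): with existing areas
+ * [0, 100) and [150, 400), a request for 50 sectors is placed at
+ * begin = 100, the first gap large enough, and the new area is linked in
+ * right after its predecessor so the list stays sorted.
+ */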
 
-void nvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
+void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin)
 {
-       int i;
+       struct nvm_dev *dev = tgt_dev->parent;
+       struct nvm_area *area;
 
-       if (rqd->nr_ppas > 1) {
-               for (i = 0; i < rqd->nr_ppas; i++)
-                       rqd->ppa_list[i] = generic_to_dev_addr(dev,
-                                                       rqd->ppa_list[i]);
-       } else {
-               rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr);
+       spin_lock(&dev->lock);
+       list_for_each_entry(area, &dev->area_list, list) {
+               if (area->begin != begin)
+                       continue;
+
+               list_del(&area->list);
+               spin_unlock(&dev->lock);
+               kfree(area);
+               return;
        }
+       spin_unlock(&dev->lock);
 }
-EXPORT_SYMBOL(nvm_generic_to_addr_mode);
+EXPORT_SYMBOL(nvm_put_area);
 
 int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
                        const struct ppa_addr *ppas, int nr_ppas, int vblk)
@@ -380,149 +782,19 @@ void nvm_free_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd)
 }
 EXPORT_SYMBOL(nvm_free_rqd_ppalist);
 
-int nvm_erase_ppa(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas,
-                                                               int flags)
+void nvm_end_io(struct nvm_rq *rqd)
 {
-       struct nvm_rq rqd;
-       int ret;
+       struct nvm_tgt_dev *tgt_dev = rqd->dev;
 
-       if (!dev->ops->erase_block)
-               return 0;
+       /* Convert address space */
+       if (tgt_dev)
+               nvm_rq_dev_to_tgt(tgt_dev, rqd);
 
-       memset(&rqd, 0, sizeof(struct nvm_rq));
-
-       ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1);
-       if (ret)
-               return ret;
-
-       nvm_generic_to_addr_mode(dev, &rqd);
-
-       rqd.flags = flags;
-
-       ret = dev->ops->erase_block(dev, &rqd);
-
-       nvm_free_rqd_ppalist(dev, &rqd);
-
-       return ret;
-}
-EXPORT_SYMBOL(nvm_erase_ppa);
-
-void nvm_end_io(struct nvm_rq *rqd, int error)
-{
-       rqd->error = error;
-       rqd->end_io(rqd);
+       if (rqd->end_io)
+               rqd->end_io(rqd);
 }
 EXPORT_SYMBOL(nvm_end_io);
 
-static void nvm_end_io_sync(struct nvm_rq *rqd)
-{
-       struct completion *waiting = rqd->wait;
-
-       rqd->wait = NULL;
-
-       complete(waiting);
-}
-
-static int __nvm_submit_ppa(struct nvm_dev *dev, struct nvm_rq *rqd, int opcode,
-                                               int flags, void *buf, int len)
-{
-       DECLARE_COMPLETION_ONSTACK(wait);
-       struct bio *bio;
-       int ret;
-       unsigned long hang_check;
-
-       bio = bio_map_kern(dev->q, buf, len, GFP_KERNEL);
-       if (IS_ERR_OR_NULL(bio))
-               return -ENOMEM;
-
-       nvm_generic_to_addr_mode(dev, rqd);
-
-       rqd->dev = NULL;
-       rqd->opcode = opcode;
-       rqd->flags = flags;
-       rqd->bio = bio;
-       rqd->wait = &wait;
-       rqd->end_io = nvm_end_io_sync;
-
-       ret = dev->ops->submit_io(dev, rqd);
-       if (ret) {
-               bio_put(bio);
-               return ret;
-       }
-
-       /* Prevent hang_check timer from firing at us during very long I/O */
-       hang_check = sysctl_hung_task_timeout_secs;
-       if (hang_check)
-               while (!wait_for_completion_io_timeout(&wait,
-                                                       hang_check * (HZ/2)))
-                       ;
-       else
-               wait_for_completion_io(&wait);
-
-       return rqd->error;
-}
-
-/**
- * nvm_submit_ppa_list - submit user-defined ppa list to device. The user must
- *                      take to free ppa list if necessary.
- * @dev:       device
- * @ppa_list:  user created ppa_list
- * @nr_ppas:   length of ppa_list
- * @opcode:    device opcode
- * @flags:     device flags
- * @buf:       data buffer
- * @len:       data buffer length
- */
-int nvm_submit_ppa_list(struct nvm_dev *dev, struct ppa_addr *ppa_list,
-                       int nr_ppas, int opcode, int flags, void *buf, int len)
-{
-       struct nvm_rq rqd;
-
-       if (dev->ops->max_phys_sect < nr_ppas)
-               return -EINVAL;
-
-       memset(&rqd, 0, sizeof(struct nvm_rq));
-
-       rqd.nr_ppas = nr_ppas;
-       if (nr_ppas > 1)
-               rqd.ppa_list = ppa_list;
-       else
-               rqd.ppa_addr = ppa_list[0];
-
-       return __nvm_submit_ppa(dev, &rqd, opcode, flags, buf, len);
-}
-EXPORT_SYMBOL(nvm_submit_ppa_list);
-
-/**
- * nvm_submit_ppa - submit PPAs to device. PPAs will automatically be unfolded
- *                 as single, dual, quad plane PPAs depending on device type.
- * @dev:       device
- * @ppa:       user created ppa_list
- * @nr_ppas:   length of ppa_list
- * @opcode:    device opcode
- * @flags:     device flags
- * @buf:       data buffer
- * @len:       data buffer length
- */
-int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas,
-                               int opcode, int flags, void *buf, int len)
-{
-       struct nvm_rq rqd;
-       int ret;
-
-       memset(&rqd, 0, sizeof(struct nvm_rq));
-       ret = nvm_set_rqd_ppalist(dev, &rqd, ppa, nr_ppas, 1);
-       if (ret)
-               return ret;
-
-       ret = __nvm_submit_ppa(dev, &rqd, opcode, flags, buf, len);
-
-       nvm_free_rqd_ppalist(dev, &rqd);
-
-       return ret;
-}
-EXPORT_SYMBOL(nvm_submit_ppa);
-
 /*
  * folds a bad block list from its plane representation to its virtual
  * block representation. The fold is done in place and reduced size is
@@ -559,21 +831,14 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks)
 }
 EXPORT_SYMBOL(nvm_bb_tbl_fold);
 
-int nvm_get_bb_tbl(struct nvm_dev *dev, struct ppa_addr ppa, u8 *blks)
-{
-       ppa = generic_to_dev_addr(dev, ppa);
-
-       return dev->ops->get_bb_tbl(dev, ppa, blks);
-}
-EXPORT_SYMBOL(nvm_get_bb_tbl);
-
 int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa,
                       u8 *blks)
 {
        struct nvm_dev *dev = tgt_dev->parent;
 
-       ppa = dev->mt->trans_ppa(tgt_dev, ppa, TRANS_TGT_TO_DEV);
-       return nvm_get_bb_tbl(dev, ppa, blks);
+       nvm_ppa_tgt_to_dev(tgt_dev, &ppa, 1);
+
+       return dev->ops->get_bb_tbl(dev, ppa, blks);
 }
 EXPORT_SYMBOL(nvm_get_tgt_bb_tbl);
 
@@ -627,7 +892,7 @@ static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
 static int nvm_core_init(struct nvm_dev *dev)
 {
        struct nvm_id *id = &dev->identity;
-       struct nvm_id_group *grp = &id->groups[0];
+       struct nvm_id_group *grp = &id->grp;
        struct nvm_geo *geo = &dev->geo;
        int ret;
 
@@ -691,36 +956,31 @@ static int nvm_core_init(struct nvm_dev *dev)
                goto err_fmtype;
        }
 
+       INIT_LIST_HEAD(&dev->area_list);
+       INIT_LIST_HEAD(&dev->targets);
        mutex_init(&dev->mlock);
        spin_lock_init(&dev->lock);
 
-       blk_queue_logical_block_size(dev->q, geo->sec_size);
+       ret = nvm_register_map(dev);
+       if (ret)
+               goto err_fmtype;
 
+       blk_queue_logical_block_size(dev->q, geo->sec_size);
        return 0;
 err_fmtype:
        kfree(dev->lun_map);
        return ret;
 }
 
-static void nvm_free_mgr(struct nvm_dev *dev)
-{
-       if (!dev->mt)
-               return;
-
-       dev->mt->unregister_mgr(dev);
-       dev->mt = NULL;
-}
-
 void nvm_free(struct nvm_dev *dev)
 {
        if (!dev)
                return;
 
-       nvm_free_mgr(dev);
-
        if (dev->dma_pool)
                dev->ops->destroy_dma_pool(dev->dma_pool);
 
+       kfree(dev->rmap);
        kfree(dev->lptbl);
        kfree(dev->lun_map);
        kfree(dev);
@@ -731,28 +991,19 @@ static int nvm_init(struct nvm_dev *dev)
        struct nvm_geo *geo = &dev->geo;
        int ret = -EINVAL;
 
-       if (!dev->q || !dev->ops)
-               return ret;
-
        if (dev->ops->identity(dev, &dev->identity)) {
                pr_err("nvm: device could not be identified\n");
                goto err;
        }
 
-       pr_debug("nvm: ver:%x nvm_vendor:%x groups:%u\n",
-                       dev->identity.ver_id, dev->identity.vmnt,
-                                                       dev->identity.cgrps);
+       pr_debug("nvm: ver:%x nvm_vendor:%x\n",
+                       dev->identity.ver_id, dev->identity.vmnt);
 
        if (dev->identity.ver_id != 1) {
                pr_err("nvm: device not supported by kernel.");
                goto err;
        }
 
-       if (dev->identity.cgrps != 1) {
-               pr_err("nvm: only one group configuration supported.");
-               goto err;
-       }
-
        ret = nvm_core_init(dev);
        if (ret) {
                pr_err("nvm: could not initialize core structures.\n");
@@ -779,49 +1030,50 @@ int nvm_register(struct nvm_dev *dev)
 {
        int ret;
 
-       ret = nvm_init(dev);
-       if (ret)
-               goto err_init;
+       if (!dev->q || !dev->ops)
+               return -EINVAL;
 
        if (dev->ops->max_phys_sect > 256) {
                pr_info("nvm: max sectors supported is 256.\n");
-               ret = -EINVAL;
-               goto err_init;
+               return -EINVAL;
        }
 
        if (dev->ops->max_phys_sect > 1) {
                dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist");
                if (!dev->dma_pool) {
                        pr_err("nvm: could not create dma pool\n");
-                       ret = -ENOMEM;
-                       goto err_init;
+                       return -ENOMEM;
                }
        }
 
-       if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) {
-               ret = nvm_get_sysblock(dev, &dev->sb);
-               if (!ret)
-                       pr_err("nvm: device not initialized.\n");
-               else if (ret < 0)
-                       pr_err("nvm: err (%d) on device initialization\n", ret);
-       }
+       ret = nvm_init(dev);
+       if (ret)
+               goto err_init;
 
        /* register device with a supported media manager */
        down_write(&nvm_lock);
-       if (ret > 0)
-               dev->mt = nvm_init_mgr(dev);
        list_add(&dev->devices, &nvm_devices);
        up_write(&nvm_lock);
 
        return 0;
 err_init:
-       kfree(dev->lun_map);
+       dev->ops->destroy_dma_pool(dev->dma_pool);
        return ret;
 }
 EXPORT_SYMBOL(nvm_register);
 
 void nvm_unregister(struct nvm_dev *dev)
 {
+       struct nvm_target *t, *tmp;
+
+       mutex_lock(&dev->mlock);
+       list_for_each_entry_safe(t, tmp, &dev->targets, list) {
+               if (t->dev->parent != dev)
+                       continue;
+               __nvm_remove_target(t);
+       }
+       mutex_unlock(&dev->mlock);
+
        down_write(&nvm_lock);
        list_del(&dev->devices);
        up_write(&nvm_lock);
@@ -844,24 +1096,24 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create)
                return -EINVAL;
        }
 
-       if (!dev->mt) {
-               pr_info("nvm: device has no media manager registered.\n");
-               return -ENODEV;
-       }
-
        if (create->conf.type != NVM_CONFIG_TYPE_SIMPLE) {
                pr_err("nvm: config type not valid\n");
                return -EINVAL;
        }
        s = &create->conf.s;
 
-       if (s->lun_begin > s->lun_end || s->lun_end > dev->geo.nr_luns) {
+       if (s->lun_begin == -1 && s->lun_end == -1) {
+               s->lun_begin = 0;
+               s->lun_end = dev->geo.nr_luns - 1;
+       }
+
+       if (s->lun_begin > s->lun_end || s->lun_end >= dev->geo.nr_luns) {
                pr_err("nvm: lun out of bound (%u:%u > %u)\n",
-                       s->lun_begin, s->lun_end, dev->geo.nr_luns);
+                       s->lun_begin, s->lun_end, dev->geo.nr_luns - 1);
                return -EINVAL;
        }
 
-       return dev->mt->create_tgt(dev, create);
+       return nvm_create_tgt(dev, create);
 }
 
 static long nvm_ioctl_info(struct file *file, void __user *arg)
@@ -923,16 +1175,14 @@ static long nvm_ioctl_get_devices(struct file *file, void __user *arg)
                struct nvm_ioctl_device_info *info = &devices->info[i];
 
                sprintf(info->devname, "%s", dev->name);
-               if (dev->mt) {
-                       info->bmversion[0] = dev->mt->version[0];
-                       info->bmversion[1] = dev->mt->version[1];
-                       info->bmversion[2] = dev->mt->version[2];
-                       sprintf(info->bmname, "%s", dev->mt->name);
-               } else {
-                       sprintf(info->bmname, "none");
-               }
 
+               /* kept for compatibility */
+               info->bmversion[0] = 1;
+               info->bmversion[1] = 0;
+               info->bmversion[2] = 0;
+               sprintf(info->bmname, "%s", "gennvm");
                i++;
+
                if (i > 31) {
                        pr_err("nvm: max 31 devices can be reported.\n");
                        break;
@@ -994,7 +1244,7 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg)
        }
 
        list_for_each_entry(dev, &nvm_devices, devices) {
-               ret = dev->mt->remove_tgt(dev, &remove);
+               ret = nvm_remove_tgt(dev, &remove);
                if (!ret)
                        break;
        }
@@ -1002,47 +1252,7 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg)
        return ret;
 }
 
-static void nvm_setup_nvm_sb_info(struct nvm_sb_info *info)
-{
-       info->seqnr = 1;
-       info->erase_cnt = 0;
-       info->version = 1;
-}
-
-static long __nvm_ioctl_dev_init(struct nvm_ioctl_dev_init *init)
-{
-       struct nvm_dev *dev;
-       struct nvm_sb_info info;
-       int ret;
-
-       down_write(&nvm_lock);
-       dev = nvm_find_nvm_dev(init->dev);
-       up_write(&nvm_lock);
-       if (!dev) {
-               pr_err("nvm: device not found\n");
-               return -EINVAL;
-       }
-
-       nvm_setup_nvm_sb_info(&info);
-
-       strncpy(info.mmtype, init->mmtype, NVM_MMTYPE_LEN);
-       info.fs_ppa.ppa = -1;
-
-       if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) {
-               ret = nvm_init_sysblock(dev, &info);
-               if (ret)
-                       return ret;
-       }
-
-       memcpy(&dev->sb, &info, sizeof(struct nvm_sb_info));
-
-       down_write(&nvm_lock);
-       dev->mt = nvm_init_mgr(dev);
-       up_write(&nvm_lock);
-
-       return 0;
-}
-
+/* kept for compatibility reasons */
 static long nvm_ioctl_dev_init(struct file *file, void __user *arg)
 {
        struct nvm_ioctl_dev_init init;
@@ -1058,15 +1268,13 @@ static long nvm_ioctl_dev_init(struct file *file, void __user *arg)
                return -EINVAL;
        }
 
-       init.dev[DISK_NAME_LEN - 1] = '\0';
-
-       return __nvm_ioctl_dev_init(&init);
+       return 0;
 }
 
+/* Kept for compatibility reasons */
 static long nvm_ioctl_dev_factory(struct file *file, void __user *arg)
 {
        struct nvm_ioctl_dev_factory fact;
-       struct nvm_dev *dev;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
@@ -1079,19 +1287,6 @@ static long nvm_ioctl_dev_factory(struct file *file, void __user *arg)
        if (fact.flags & ~(NVM_FACTORY_NR_BITS - 1))
                return -EINVAL;
 
-       down_write(&nvm_lock);
-       dev = nvm_find_nvm_dev(fact.dev);
-       up_write(&nvm_lock);
-       if (!dev) {
-               pr_err("nvm: device not found\n");
-               return -EINVAL;
-       }
-
-       nvm_free_mgr(dev);
-
-       if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT)
-               return nvm_dev_factory(dev, fact.flags);
-
        return 0;
 }
 
diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
deleted file mode 100644 (file)
index ca78800..0000000
+++ /dev/null
@@ -1,657 +0,0 @@
-/*
- * Copyright (C) 2015 Matias Bjorling <m@bjorling.me>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; see the file COPYING.  If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
- * USA.
- *
- * Implementation of a general nvm manager for Open-Channel SSDs.
- */
-
-#include "gennvm.h"
-
-static struct nvm_target *gen_find_target(struct gen_dev *gn, const char *name)
-{
-       struct nvm_target *tgt;
-
-       list_for_each_entry(tgt, &gn->targets, list)
-               if (!strcmp(name, tgt->disk->disk_name))
-                       return tgt;
-
-       return NULL;
-}
-
-static const struct block_device_operations gen_fops = {
-       .owner          = THIS_MODULE,
-};
-
-static int gen_reserve_luns(struct nvm_dev *dev, struct nvm_target *t,
-                           int lun_begin, int lun_end)
-{
-       int i;
-
-       for (i = lun_begin; i <= lun_end; i++) {
-               if (test_and_set_bit(i, dev->lun_map)) {
-                       pr_err("nvm: lun %d already allocated\n", i);
-                       goto err;
-               }
-       }
-
-       return 0;
-
-err:
-       while (--i > lun_begin)
-               clear_bit(i, dev->lun_map);
-
-       return -EBUSY;
-}
-
-static void gen_release_luns_err(struct nvm_dev *dev, int lun_begin,
-                                int lun_end)
-{
-       int i;
-
-       for (i = lun_begin; i <= lun_end; i++)
-               WARN_ON(!test_and_clear_bit(i, dev->lun_map));
-}
-
-static void gen_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev)
-{
-       struct nvm_dev *dev = tgt_dev->parent;
-       struct gen_dev_map *dev_map = tgt_dev->map;
-       int i, j;
-
-       for (i = 0; i < dev_map->nr_chnls; i++) {
-               struct gen_ch_map *ch_map = &dev_map->chnls[i];
-               int *lun_offs = ch_map->lun_offs;
-               int ch = i + ch_map->ch_off;
-
-               for (j = 0; j < ch_map->nr_luns; j++) {
-                       int lun = j + lun_offs[j];
-                       int lunid = (ch * dev->geo.luns_per_chnl) + lun;
-
-                       WARN_ON(!test_and_clear_bit(lunid, dev->lun_map));
-               }
-
-               kfree(ch_map->lun_offs);
-       }
-
-       kfree(dev_map->chnls);
-       kfree(dev_map);
-       kfree(tgt_dev->luns);
-       kfree(tgt_dev);
-}
-
-static struct nvm_tgt_dev *gen_create_tgt_dev(struct nvm_dev *dev,
-                                             int lun_begin, int lun_end)
-{
-       struct nvm_tgt_dev *tgt_dev = NULL;
-       struct gen_dev_map *dev_rmap = dev->rmap;
-       struct gen_dev_map *dev_map;
-       struct ppa_addr *luns;
-       int nr_luns = lun_end - lun_begin + 1;
-       int luns_left = nr_luns;
-       int nr_chnls = nr_luns / dev->geo.luns_per_chnl;
-       int nr_chnls_mod = nr_luns % dev->geo.luns_per_chnl;
-       int bch = lun_begin / dev->geo.luns_per_chnl;
-       int blun = lun_begin % dev->geo.luns_per_chnl;
-       int lunid = 0;
-       int lun_balanced = 1;
-       int prev_nr_luns;
-       int i, j;
-
-       nr_chnls = nr_luns / dev->geo.luns_per_chnl;
-       nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1;
-
-       dev_map = kmalloc(sizeof(struct gen_dev_map), GFP_KERNEL);
-       if (!dev_map)
-               goto err_dev;
-
-       dev_map->chnls = kcalloc(nr_chnls, sizeof(struct gen_ch_map),
-                                                               GFP_KERNEL);
-       if (!dev_map->chnls)
-               goto err_chnls;
-
-       luns = kcalloc(nr_luns, sizeof(struct ppa_addr), GFP_KERNEL);
-       if (!luns)
-               goto err_luns;
-
-       prev_nr_luns = (luns_left > dev->geo.luns_per_chnl) ?
-                                       dev->geo.luns_per_chnl : luns_left;
-       for (i = 0; i < nr_chnls; i++) {
-               struct gen_ch_map *ch_rmap = &dev_rmap->chnls[i + bch];
-               int *lun_roffs = ch_rmap->lun_offs;
-               struct gen_ch_map *ch_map = &dev_map->chnls[i];
-               int *lun_offs;
-               int luns_in_chnl = (luns_left > dev->geo.luns_per_chnl) ?
-                                       dev->geo.luns_per_chnl : luns_left;
-
-               if (lun_balanced && prev_nr_luns != luns_in_chnl)
-                       lun_balanced = 0;
-
-               ch_map->ch_off = ch_rmap->ch_off = bch;
-               ch_map->nr_luns = luns_in_chnl;
-
-               lun_offs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL);
-               if (!lun_offs)
-                       goto err_ch;
-
-               for (j = 0; j < luns_in_chnl; j++) {
-                       luns[lunid].ppa = 0;
-                       luns[lunid].g.ch = i;
-                       luns[lunid++].g.lun = j;
-
-                       lun_offs[j] = blun;
-                       lun_roffs[j + blun] = blun;
-               }
-
-               ch_map->lun_offs = lun_offs;
-
-               /* when starting a new channel, lun offset is reset */
-               blun = 0;
-               luns_left -= luns_in_chnl;
-       }
-
-       dev_map->nr_chnls = nr_chnls;
-
-       tgt_dev = kmalloc(sizeof(struct nvm_tgt_dev), GFP_KERNEL);
-       if (!tgt_dev)
-               goto err_ch;
-
-       memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo));
-       /* Target device only owns a portion of the physical device */
-       tgt_dev->geo.nr_chnls = nr_chnls;
-       tgt_dev->geo.nr_luns = nr_luns;
-       tgt_dev->geo.luns_per_chnl = (lun_balanced) ? prev_nr_luns : -1;
-       tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun;
-       tgt_dev->q = dev->q;
-       tgt_dev->map = dev_map;
-       tgt_dev->luns = luns;
-       memcpy(&tgt_dev->identity, &dev->identity, sizeof(struct nvm_id));
-
-       tgt_dev->parent = dev;
-
-       return tgt_dev;
-err_ch:
-       while (--i > 0)
-               kfree(dev_map->chnls[i].lun_offs);
-       kfree(luns);
-err_luns:
-       kfree(dev_map->chnls);
-err_chnls:
-       kfree(dev_map);
-err_dev:
-       return tgt_dev;
-}
-
-static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
-{
-       struct gen_dev *gn = dev->mp;
-       struct nvm_ioctl_create_simple *s = &create->conf.s;
-       struct request_queue *tqueue;
-       struct gendisk *tdisk;
-       struct nvm_tgt_type *tt;
-       struct nvm_target *t;
-       struct nvm_tgt_dev *tgt_dev;
-       void *targetdata;
-
-       tt = nvm_find_target_type(create->tgttype, 1);
-       if (!tt) {
-               pr_err("nvm: target type %s not found\n", create->tgttype);
-               return -EINVAL;
-       }
-
-       mutex_lock(&gn->lock);
-       t = gen_find_target(gn, create->tgtname);
-       if (t) {
-               pr_err("nvm: target name already exists.\n");
-               mutex_unlock(&gn->lock);
-               return -EINVAL;
-       }
-       mutex_unlock(&gn->lock);
-
-       t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL);
-       if (!t)
-               return -ENOMEM;
-
-       if (gen_reserve_luns(dev, t, s->lun_begin, s->lun_end))
-               goto err_t;
-
-       tgt_dev = gen_create_tgt_dev(dev, s->lun_begin, s->lun_end);
-       if (!tgt_dev) {
-               pr_err("nvm: could not create target device\n");
-               goto err_reserve;
-       }
-
-       tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
-       if (!tqueue)
-               goto err_dev;
-       blk_queue_make_request(tqueue, tt->make_rq);
-
-       tdisk = alloc_disk(0);
-       if (!tdisk)
-               goto err_queue;
-
-       sprintf(tdisk->disk_name, "%s", create->tgtname);
-       tdisk->flags = GENHD_FL_EXT_DEVT;
-       tdisk->major = 0;
-       tdisk->first_minor = 0;
-       tdisk->fops = &gen_fops;
-       tdisk->queue = tqueue;
-
-       targetdata = tt->init(tgt_dev, tdisk);
-       if (IS_ERR(targetdata))
-               goto err_init;
-
-       tdisk->private_data = targetdata;
-       tqueue->queuedata = targetdata;
-
-       blk_queue_max_hw_sectors(tqueue, 8 * dev->ops->max_phys_sect);
-
-       set_capacity(tdisk, tt->capacity(targetdata));
-       add_disk(tdisk);
-
-       t->type = tt;
-       t->disk = tdisk;
-       t->dev = tgt_dev;
-
-       mutex_lock(&gn->lock);
-       list_add_tail(&t->list, &gn->targets);
-       mutex_unlock(&gn->lock);
-
-       return 0;
-err_init:
-       put_disk(tdisk);
-err_queue:
-       blk_cleanup_queue(tqueue);
-err_dev:
-       kfree(tgt_dev);
-err_reserve:
-       gen_release_luns_err(dev, s->lun_begin, s->lun_end);
-err_t:
-       kfree(t);
-       return -ENOMEM;
-}
-
-static void __gen_remove_target(struct nvm_target *t)
-{
-       struct nvm_tgt_type *tt = t->type;
-       struct gendisk *tdisk = t->disk;
-       struct request_queue *q = tdisk->queue;
-
-       del_gendisk(tdisk);
-       blk_cleanup_queue(q);
-
-       if (tt->exit)
-               tt->exit(tdisk->private_data);
-
-       gen_remove_tgt_dev(t->dev);
-       put_disk(tdisk);
-
-       list_del(&t->list);
-       kfree(t);
-}
-
-/**
- * gen_remove_tgt - Removes a target from the media manager
- * @dev:       device
- * @remove:    ioctl structure with target name to remove.
- *
- * Returns:
- * 0: on success
- * 1: on not found
- * <0: on error
- */
-static int gen_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove)
-{
-       struct gen_dev *gn = dev->mp;
-       struct nvm_target *t;
-
-       if (!gn)
-               return 1;
-
-       mutex_lock(&gn->lock);
-       t = gen_find_target(gn, remove->tgtname);
-       if (!t) {
-               mutex_unlock(&gn->lock);
-               return 1;
-       }
-       __gen_remove_target(t);
-       mutex_unlock(&gn->lock);
-
-       return 0;
-}
-
-static int gen_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len)
-{
-       struct nvm_geo *geo = &dev->geo;
-       struct gen_dev *gn = dev->mp;
-       struct gen_area *area, *prev, *next;
-       sector_t begin = 0;
-       sector_t max_sectors = (geo->sec_size * dev->total_secs) >> 9;
-
-       if (len > max_sectors)
-               return -EINVAL;
-
-       area = kmalloc(sizeof(struct gen_area), GFP_KERNEL);
-       if (!area)
-               return -ENOMEM;
-
-       prev = NULL;
-
-       spin_lock(&dev->lock);
-       list_for_each_entry(next, &gn->area_list, list) {
-               if (begin + len > next->begin) {
-                       begin = next->end;
-                       prev = next;
-                       continue;
-               }
-               break;
-       }
-
-       if ((begin + len) > max_sectors) {
-               spin_unlock(&dev->lock);
-               kfree(area);
-               return -EINVAL;
-       }
-
-       area->begin = *lba = begin;
-       area->end = begin + len;
-
-       if (prev) /* insert into sorted order */
-               list_add(&area->list, &prev->list);
-       else
-               list_add(&area->list, &gn->area_list);
-       spin_unlock(&dev->lock);
-
-       return 0;
-}
-
-static void gen_put_area(struct nvm_dev *dev, sector_t begin)
-{
-       struct gen_dev *gn = dev->mp;
-       struct gen_area *area;
-
-       spin_lock(&dev->lock);
-       list_for_each_entry(area, &gn->area_list, list) {
-               if (area->begin != begin)
-                       continue;
-
-               list_del(&area->list);
-               spin_unlock(&dev->lock);
-               kfree(area);
-               return;
-       }
-       spin_unlock(&dev->lock);
-}
-
-static void gen_free(struct nvm_dev *dev)
-{
-       kfree(dev->mp);
-       kfree(dev->rmap);
-       dev->mp = NULL;
-}
-
-static int gen_register(struct nvm_dev *dev)
-{
-       struct gen_dev *gn;
-       struct gen_dev_map *dev_rmap;
-       int i, j;
-
-       if (!try_module_get(THIS_MODULE))
-               return -ENODEV;
-
-       gn = kzalloc(sizeof(struct gen_dev), GFP_KERNEL);
-       if (!gn)
-               goto err_gn;
-
-       dev_rmap = kmalloc(sizeof(struct gen_dev_map), GFP_KERNEL);
-       if (!dev_rmap)
-               goto err_rmap;
-
-       dev_rmap->chnls = kcalloc(dev->geo.nr_chnls, sizeof(struct gen_ch_map),
-                                                               GFP_KERNEL);
-       if (!dev_rmap->chnls)
-               goto err_chnls;
-
-       for (i = 0; i < dev->geo.nr_chnls; i++) {
-               struct gen_ch_map *ch_rmap;
-               int *lun_roffs;
-               int luns_in_chnl = dev->geo.luns_per_chnl;
-
-               ch_rmap = &dev_rmap->chnls[i];
-
-               ch_rmap->ch_off = -1;
-               ch_rmap->nr_luns = luns_in_chnl;
-
-               lun_roffs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL);
-               if (!lun_roffs)
-                       goto err_ch;
-
-               for (j = 0; j < luns_in_chnl; j++)
-                       lun_roffs[j] = -1;
-
-               ch_rmap->lun_offs = lun_roffs;
-       }
-
-       gn->dev = dev;
-       gn->nr_luns = dev->geo.nr_luns;
-       INIT_LIST_HEAD(&gn->area_list);
-       mutex_init(&gn->lock);
-       INIT_LIST_HEAD(&gn->targets);
-       dev->mp = gn;
-       dev->rmap = dev_rmap;
-
-       return 1;
-err_ch:
-       while (--i >= 0)
-               kfree(dev_rmap->chnls[i].lun_offs);
-err_chnls:
-       kfree(dev_rmap);
-err_rmap:
-       gen_free(dev);
-err_gn:
-       module_put(THIS_MODULE);
-       return -ENOMEM;
-}
-
-static void gen_unregister(struct nvm_dev *dev)
-{
-       struct gen_dev *gn = dev->mp;
-       struct nvm_target *t, *tmp;
-
-       mutex_lock(&gn->lock);
-       list_for_each_entry_safe(t, tmp, &gn->targets, list) {
-               if (t->dev->parent != dev)
-                       continue;
-               __gen_remove_target(t);
-       }
-       mutex_unlock(&gn->lock);
-
-       gen_free(dev);
-       module_put(THIS_MODULE);
-}
-
-static int gen_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
-{
-       struct gen_dev_map *dev_map = tgt_dev->map;
-       struct gen_ch_map *ch_map = &dev_map->chnls[p->g.ch];
-       int lun_off = ch_map->lun_offs[p->g.lun];
-       struct nvm_dev *dev = tgt_dev->parent;
-       struct gen_dev_map *dev_rmap = dev->rmap;
-       struct gen_ch_map *ch_rmap;
-       int lun_roff;
-
-       p->g.ch += ch_map->ch_off;
-       p->g.lun += lun_off;
-
-       ch_rmap = &dev_rmap->chnls[p->g.ch];
-       lun_roff = ch_rmap->lun_offs[p->g.lun];
-
-       if (unlikely(ch_rmap->ch_off < 0 || lun_roff < 0)) {
-               pr_err("nvm: corrupted device partition table\n");
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-static int gen_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
-{
-       struct nvm_dev *dev = tgt_dev->parent;
-       struct gen_dev_map *dev_rmap = dev->rmap;
-       struct gen_ch_map *ch_rmap = &dev_rmap->chnls[p->g.ch];
-       int lun_roff = ch_rmap->lun_offs[p->g.lun];
-
-       p->g.ch -= ch_rmap->ch_off;
-       p->g.lun -= lun_roff;
-
-       return 0;
-}
-
-static int gen_trans_rq(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
-                       int flag)
-{
-       gen_trans_fn *f;
-       int i;
-       int ret = 0;
-
-       f = (flag == TRANS_TGT_TO_DEV) ? gen_map_to_dev : gen_map_to_tgt;
-
-       if (rqd->nr_ppas == 1)
-               return f(tgt_dev, &rqd->ppa_addr);
-
-       for (i = 0; i < rqd->nr_ppas; i++) {
-               ret = f(tgt_dev, &rqd->ppa_list[i]);
-               if (ret)
-                       goto out;
-       }
-
-out:
-       return ret;
-}
-
-static void gen_end_io(struct nvm_rq *rqd)
-{
-       struct nvm_tgt_dev *tgt_dev = rqd->dev;
-       struct nvm_tgt_instance *ins = rqd->ins;
-
-       /* Convert address space */
-       if (tgt_dev)
-               gen_trans_rq(tgt_dev, rqd, TRANS_DEV_TO_TGT);
-
-       ins->tt->end_io(rqd);
-}
-
-static int gen_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
-{
-       struct nvm_dev *dev = tgt_dev->parent;
-
-       if (!dev->ops->submit_io)
-               return -ENODEV;
-
-       /* Convert address space */
-       gen_trans_rq(tgt_dev, rqd, TRANS_TGT_TO_DEV);
-       nvm_generic_to_addr_mode(dev, rqd);
-
-       rqd->dev = tgt_dev;
-       rqd->end_io = gen_end_io;
-       return dev->ops->submit_io(dev, rqd);
-}
-
-static int gen_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p,
-                        int flags)
-{
-       /* Convert address space */
-       gen_map_to_dev(tgt_dev, p);
-
-       return nvm_erase_ppa(tgt_dev->parent, p, 1, flags);
-}
-
-static struct ppa_addr gen_trans_ppa(struct nvm_tgt_dev *tgt_dev,
-                                    struct ppa_addr p, int direction)
-{
-       gen_trans_fn *f;
-       struct ppa_addr ppa = p;
-
-       f = (direction == TRANS_TGT_TO_DEV) ? gen_map_to_dev : gen_map_to_tgt;
-       f(tgt_dev, &ppa);
-
-       return ppa;
-}
-
-static void gen_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
-                              int len)
-{
-       struct nvm_geo *geo = &dev->geo;
-       struct gen_dev_map *dev_rmap = dev->rmap;
-       u64 i;
-
-       for (i = 0; i < len; i++) {
-               struct gen_ch_map *ch_rmap;
-               int *lun_roffs;
-               struct ppa_addr gaddr;
-               u64 pba = le64_to_cpu(entries[i]);
-               int off;
-               u64 diff;
-
-               if (!pba)
-                       continue;
-
-               gaddr = linear_to_generic_addr(geo, pba);
-               ch_rmap = &dev_rmap->chnls[gaddr.g.ch];
-               lun_roffs = ch_rmap->lun_offs;
-
-               off = gaddr.g.ch * geo->luns_per_chnl + gaddr.g.lun;
-
-               diff = ((ch_rmap->ch_off * geo->luns_per_chnl) +
-                               (lun_roffs[gaddr.g.lun])) * geo->sec_per_lun;
-
-               entries[i] -= cpu_to_le64(diff);
-       }
-}
-
-static struct nvmm_type gen = {
-       .name                   = "gennvm",
-       .version                = {0, 1, 0},
-
-       .register_mgr           = gen_register,
-       .unregister_mgr         = gen_unregister,
-
-       .create_tgt             = gen_create_tgt,
-       .remove_tgt             = gen_remove_tgt,
-
-       .submit_io              = gen_submit_io,
-       .erase_blk              = gen_erase_blk,
-
-       .get_area               = gen_get_area,
-       .put_area               = gen_put_area,
-
-       .trans_ppa              = gen_trans_ppa,
-       .part_to_tgt            = gen_part_to_tgt,
-};
-
-static int __init gen_module_init(void)
-{
-       return nvm_register_mgr(&gen);
-}
-
-static void gen_module_exit(void)
-{
-       nvm_unregister_mgr(&gen);
-}
-
-module_init(gen_module_init);
-module_exit(gen_module_exit);
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("General media manager for Open-Channel SSDs");
diff --git a/drivers/lightnvm/gennvm.h b/drivers/lightnvm/gennvm.h
deleted file mode 100644 (file)
index 6a4b3f3..0000000
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright: Matias Bjorling <mb@bjorling.me>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- */
-
-#ifndef GENNVM_H_
-#define GENNVM_H_
-
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-
-#include <linux/lightnvm.h>
-
-struct gen_dev {
-       struct nvm_dev *dev;
-
-       int nr_luns;
-       struct list_head area_list;
-
-       struct mutex lock;
-       struct list_head targets;
-};
-
-/* Map between virtual and physical channel and lun */
-struct gen_ch_map {
-       int ch_off;
-       int nr_luns;
-       int *lun_offs;
-};
-
-struct gen_dev_map {
-       struct gen_ch_map *chnls;
-       int nr_chnls;
-};
-
-struct gen_area {
-       struct list_head list;
-       sector_t begin;
-       sector_t end;   /* end is excluded */
-};
-
-static inline void *ch_map_to_lun_offs(struct gen_ch_map *ch_map)
-{
-       return ch_map + 1;
-}
-
-typedef int (gen_trans_fn)(struct nvm_tgt_dev *, struct ppa_addr *);
-
-#define gen_for_each_lun(bm, lun, i) \
-               for ((i) = 0, lun = &(bm)->luns[0]; \
-                       (i) < (bm)->nr_luns; (i)++, lun = &(bm)->luns[(i)])
-
-#endif /* GENNVM_H_ */
index 9fb7de3..e00b1d7 100644 (file)
@@ -779,7 +779,7 @@ static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd,
 
 static void rrpc_end_io(struct nvm_rq *rqd)
 {
-       struct rrpc *rrpc = container_of(rqd->ins, struct rrpc, instance);
+       struct rrpc *rrpc = rqd->private;
        struct nvm_tgt_dev *dev = rrpc->dev;
        struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
        uint8_t npages = rqd->nr_ppas;
@@ -972,8 +972,9 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
 
        bio_get(bio);
        rqd->bio = bio;
-       rqd->ins = &rrpc->instance;
+       rqd->private = rrpc;
        rqd->nr_ppas = nr_pages;
+       rqd->end_io = rrpc_end_io;
        rrq->flags = flags;
 
        err = nvm_submit_io(dev, rqd);
@@ -1532,7 +1533,6 @@ static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk)
        if (!rrpc)
                return ERR_PTR(-ENOMEM);
 
-       rrpc->instance.tt = &tt_rrpc;
        rrpc->dev = dev;
        rrpc->disk = tdisk;
 
@@ -1611,7 +1611,6 @@ static struct nvm_tgt_type tt_rrpc = {
 
        .make_rq        = rrpc_make_rq,
        .capacity       = rrpc_capacity,
-       .end_io         = rrpc_end_io,
 
        .init           = rrpc_init,
        .exit           = rrpc_exit,
index 94e4d73..fdb6ff9 100644 (file)
@@ -102,9 +102,6 @@ struct rrpc_lun {
 };
 
 struct rrpc {
-       /* instance must be kept in top to resolve rrpc in unprep */
-       struct nvm_tgt_instance instance;
-
        struct nvm_tgt_dev *dev;
        struct gendisk *disk;
 
diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c
deleted file mode 100644 (file)
index 12002bf..0000000
+++ /dev/null
@@ -1,733 +0,0 @@
-/*
- * Copyright (C) 2015 Matias Bjorling. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; see the file COPYING.  If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
- * USA.
- *
- */
-
-#include <linux/lightnvm.h>
-
-#define MAX_SYSBLKS 3  /* remember to update mapping scheme on change */
-#define MAX_BLKS_PR_SYSBLK 2 /* 2 blks with 256 pages and 3000 erases
-                             * enables ~1.5M updates per sysblk unit
-                             */
-
-struct sysblk_scan {
-       /* A row is a collection of flash blocks for a system block. */
-       int nr_rows;
-       int row;
-       int act_blk[MAX_SYSBLKS];
-
-       int nr_ppas;
-       struct ppa_addr ppas[MAX_SYSBLKS * MAX_BLKS_PR_SYSBLK];/* all sysblks */
-};
-
-static inline int scan_ppa_idx(int row, int blkid)
-{
-       return (row * MAX_BLKS_PR_SYSBLK) + blkid;
-}
-
-static void nvm_sysblk_to_cpu(struct nvm_sb_info *info,
-                             struct nvm_system_block *sb)
-{
-       info->seqnr = be32_to_cpu(sb->seqnr);
-       info->erase_cnt = be32_to_cpu(sb->erase_cnt);
-       info->version = be16_to_cpu(sb->version);
-       strncpy(info->mmtype, sb->mmtype, NVM_MMTYPE_LEN);
-       info->fs_ppa.ppa = be64_to_cpu(sb->fs_ppa);
-}
-
-static void nvm_cpu_to_sysblk(struct nvm_system_block *sb,
-                             struct nvm_sb_info *info)
-{
-       sb->magic = cpu_to_be32(NVM_SYSBLK_MAGIC);
-       sb->seqnr = cpu_to_be32(info->seqnr);
-       sb->erase_cnt = cpu_to_be32(info->erase_cnt);
-       sb->version = cpu_to_be16(info->version);
-       strncpy(sb->mmtype, info->mmtype, NVM_MMTYPE_LEN);
-       sb->fs_ppa = cpu_to_be64(info->fs_ppa.ppa);
-}
-
-static int nvm_setup_sysblks(struct nvm_dev *dev, struct ppa_addr *sysblk_ppas)
-{
-       struct nvm_geo *geo = &dev->geo;
-       int nr_rows = min_t(int, MAX_SYSBLKS, geo->nr_chnls);
-       int i;
-
-       for (i = 0; i < nr_rows; i++)
-               sysblk_ppas[i].ppa = 0;
-
-       /* if possible, place sysblk at first channel, middle channel and last
-        * channel of the device. If not, create only one or two sys blocks
-        */
-       switch (geo->nr_chnls) {
-       case 2:
-               sysblk_ppas[1].g.ch = 1;
-               /* fall-through */
-       case 1:
-               sysblk_ppas[0].g.ch = 0;
-               break;
-       default:
-               sysblk_ppas[0].g.ch = 0;
-               sysblk_ppas[1].g.ch = geo->nr_chnls / 2;
-               sysblk_ppas[2].g.ch = geo->nr_chnls - 1;
-               break;
-       }
-
-       return nr_rows;
-}
-
-static void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s,
-                                               struct ppa_addr *sysblk_ppas)
-{
-       memset(s, 0, sizeof(struct sysblk_scan));
-       s->nr_rows = nvm_setup_sysblks(dev, sysblk_ppas);
-}
-
-static int sysblk_get_free_blks(struct nvm_dev *dev, struct ppa_addr ppa,
-                                       u8 *blks, int nr_blks,
-                                       struct sysblk_scan *s)
-{
-       struct ppa_addr *sppa;
-       int i, blkid = 0;
-
-       nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks);
-       if (nr_blks < 0)
-               return nr_blks;
-
-       for (i = 0; i < nr_blks; i++) {
-               if (blks[i] == NVM_BLK_T_HOST)
-                       return -EEXIST;
-
-               if (blks[i] != NVM_BLK_T_FREE)
-                       continue;
-
-               sppa = &s->ppas[scan_ppa_idx(s->row, blkid)];
-               sppa->g.ch = ppa.g.ch;
-               sppa->g.lun = ppa.g.lun;
-               sppa->g.blk = i;
-               s->nr_ppas++;
-               blkid++;
-
-               pr_debug("nvm: use (%u %u %u) as sysblk\n",
-                                       sppa->g.ch, sppa->g.lun, sppa->g.blk);
-               if (blkid > MAX_BLKS_PR_SYSBLK - 1)
-                       return 0;
-       }
-
-       pr_err("nvm: sysblk failed get sysblk\n");
-       return -EINVAL;
-}
-
-static int sysblk_get_host_blks(struct nvm_dev *dev, struct ppa_addr ppa,
-                                       u8 *blks, int nr_blks,
-                                       struct sysblk_scan *s)
-{
-       int i, nr_sysblk = 0;
-
-       nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks);
-       if (nr_blks < 0)
-               return nr_blks;
-
-       for (i = 0; i < nr_blks; i++) {
-               if (blks[i] != NVM_BLK_T_HOST)
-                       continue;
-
-               if (s->nr_ppas == MAX_BLKS_PR_SYSBLK * MAX_SYSBLKS) {
-                       pr_err("nvm: too many host blks\n");
-                       return -EINVAL;
-               }
-
-               ppa.g.blk = i;
-
-               s->ppas[scan_ppa_idx(s->row, nr_sysblk)] = ppa;
-               s->nr_ppas++;
-               nr_sysblk++;
-       }
-
-       return 0;
-}
-
-static int nvm_get_all_sysblks(struct nvm_dev *dev, struct sysblk_scan *s,
-                               struct ppa_addr *ppas, int get_free)
-{
-       struct nvm_geo *geo = &dev->geo;
-       int i, nr_blks, ret = 0;
-       u8 *blks;
-
-       s->nr_ppas = 0;
-       nr_blks = geo->blks_per_lun * geo->plane_mode;
-
-       blks = kmalloc(nr_blks, GFP_KERNEL);
-       if (!blks)
-               return -ENOMEM;
-
-       for (i = 0; i < s->nr_rows; i++) {
-               s->row = i;
-
-               ret = nvm_get_bb_tbl(dev, ppas[i], blks);
-               if (ret) {
-                       pr_err("nvm: failed bb tbl for ppa (%u %u)\n",
-                                                       ppas[i].g.ch,
-                                                       ppas[i].g.blk);
-                       goto err_get;
-               }
-
-               if (get_free)
-                       ret = sysblk_get_free_blks(dev, ppas[i], blks, nr_blks,
-                                                                       s);
-               else
-                       ret = sysblk_get_host_blks(dev, ppas[i], blks, nr_blks,
-                                                                       s);
-
-               if (ret)
-                       goto err_get;
-       }
-
-err_get:
-       kfree(blks);
-       return ret;
-}
-
-/*
- * scans a block for latest sysblk.
- * Returns:
- *     0 - newer sysblk not found. PPA is updated to latest page.
- *     1 - newer sysblk found and stored in *cur. PPA is updated to
- *         next valid page.
- *     <0- error.
- */
-static int nvm_scan_block(struct nvm_dev *dev, struct ppa_addr *ppa,
-                                               struct nvm_system_block *sblk)
-{
-       struct nvm_geo *geo = &dev->geo;
-       struct nvm_system_block *cur;
-       int pg, ret, found = 0;
-
-       /* the full buffer for a flash page is allocated. Only the first of it
-        * contains the system block information
-        */
-       cur = kmalloc(geo->pfpg_size, GFP_KERNEL);
-       if (!cur)
-               return -ENOMEM;
-
-       /* perform linear scan through the block */
-       for (pg = 0; pg < dev->lps_per_blk; pg++) {
-               ppa->g.pg = ppa_to_slc(dev, pg);
-
-               ret = nvm_submit_ppa(dev, ppa, 1, NVM_OP_PREAD, NVM_IO_SLC_MODE,
-                                                       cur, geo->pfpg_size);
-               if (ret) {
-                       if (ret == NVM_RSP_ERR_EMPTYPAGE) {
-                               pr_debug("nvm: sysblk scan empty ppa (%u %u %u %u)\n",
-                                                       ppa->g.ch,
-                                                       ppa->g.lun,
-                                                       ppa->g.blk,
-                                                       ppa->g.pg);
-                               break;
-                       }
-                       pr_err("nvm: read failed (%x) for ppa (%u %u %u %u)",
-                                                       ret,
-                                                       ppa->g.ch,
-                                                       ppa->g.lun,
-                                                       ppa->g.blk,
-                                                       ppa->g.pg);
-                       break; /* if we can't read a page, continue to the
-                               * next blk
-                               */
-               }
-
-               if (be32_to_cpu(cur->magic) != NVM_SYSBLK_MAGIC) {
-                       pr_debug("nvm: scan break for ppa (%u %u %u %u)\n",
-                                                       ppa->g.ch,
-                                                       ppa->g.lun,
-                                                       ppa->g.blk,
-                                                       ppa->g.pg);
-                       break; /* last valid page already found */
-               }
-
-               if (be32_to_cpu(cur->seqnr) < be32_to_cpu(sblk->seqnr))
-                       continue;
-
-               memcpy(sblk, cur, sizeof(struct nvm_system_block));
-               found = 1;
-       }
-
-       kfree(cur);
-
-       return found;
-}
-
-static int nvm_sysblk_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s,
-                                                               int type)
-{
-       return nvm_set_bb_tbl(dev, s->ppas, s->nr_ppas, type);
-}
-
-static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info,
-                                                       struct sysblk_scan *s)
-{
-       struct nvm_geo *geo = &dev->geo;
-       struct nvm_system_block nvmsb;
-       void *buf;
-       int i, sect, ret = 0;
-       struct ppa_addr *ppas;
-
-       nvm_cpu_to_sysblk(&nvmsb, info);
-
-       buf = kzalloc(geo->pfpg_size, GFP_KERNEL);
-       if (!buf)
-               return -ENOMEM;
-       memcpy(buf, &nvmsb, sizeof(struct nvm_system_block));
-
-       ppas = kcalloc(geo->sec_per_pg, sizeof(struct ppa_addr), GFP_KERNEL);
-       if (!ppas) {
-               ret = -ENOMEM;
-               goto err;
-       }
-
-       /* Write and verify */
-       for (i = 0; i < s->nr_rows; i++) {
-               ppas[0] = s->ppas[scan_ppa_idx(i, s->act_blk[i])];
-
-               pr_debug("nvm: writing sysblk to ppa (%u %u %u %u)\n",
-                                                       ppas[0].g.ch,
-                                                       ppas[0].g.lun,
-                                                       ppas[0].g.blk,
-                                                       ppas[0].g.pg);
-
-               /* Expand to all sectors within a flash page */
-               if (geo->sec_per_pg > 1) {
-                       for (sect = 1; sect < geo->sec_per_pg; sect++) {
-                               ppas[sect].ppa = ppas[0].ppa;
-                               ppas[sect].g.sec = sect;
-                       }
-               }
-
-               ret = nvm_submit_ppa(dev, ppas, geo->sec_per_pg, NVM_OP_PWRITE,
-                                       NVM_IO_SLC_MODE, buf, geo->pfpg_size);
-               if (ret) {
-                       pr_err("nvm: sysblk failed program (%u %u %u)\n",
-                                                       ppas[0].g.ch,
-                                                       ppas[0].g.lun,
-                                                       ppas[0].g.blk);
-                       break;
-               }
-
-               ret = nvm_submit_ppa(dev, ppas, geo->sec_per_pg, NVM_OP_PREAD,
-                                       NVM_IO_SLC_MODE, buf, geo->pfpg_size);
-               if (ret) {
-                       pr_err("nvm: sysblk failed read (%u %u %u)\n",
-                                                       ppas[0].g.ch,
-                                                       ppas[0].g.lun,
-                                                       ppas[0].g.blk);
-                       break;
-               }
-
-               if (memcmp(buf, &nvmsb, sizeof(struct nvm_system_block))) {
-                       pr_err("nvm: sysblk failed verify (%u %u %u)\n",
-                                                       ppas[0].g.ch,
-                                                       ppas[0].g.lun,
-                                                       ppas[0].g.blk);
-                       ret = -EINVAL;
-                       break;
-               }
-       }
-
-       kfree(ppas);
-err:
-       kfree(buf);
-
-       return ret;
-}
-
-static int nvm_prepare_new_sysblks(struct nvm_dev *dev, struct sysblk_scan *s)
-{
-       int i, ret;
-       unsigned long nxt_blk;
-       struct ppa_addr *ppa;
-
-       for (i = 0; i < s->nr_rows; i++) {
-               nxt_blk = (s->act_blk[i] + 1) % MAX_BLKS_PR_SYSBLK;
-               ppa = &s->ppas[scan_ppa_idx(i, nxt_blk)];
-               ppa->g.pg = ppa_to_slc(dev, 0);
-
-               ret = nvm_erase_ppa(dev, ppa, 1, 0);
-               if (ret)
-                       return ret;
-
-               s->act_blk[i] = nxt_blk;
-       }
-
-       return 0;
-}
-
-int nvm_get_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info)
-{
-       struct ppa_addr sysblk_ppas[MAX_SYSBLKS];
-       struct sysblk_scan s;
-       struct nvm_system_block *cur;
-       int i, j, found = 0;
-       int ret = -ENOMEM;
-
-       /*
-        * 1. setup sysblk locations
-        * 2. get bad block list
-        * 3. filter on host-specific (type 3)
-        * 4. iterate through all and find the highest seq nr.
-        * 5. return superblock information
-        */
-
-       if (!dev->ops->get_bb_tbl)
-               return -EINVAL;
-
-       nvm_setup_sysblk_scan(dev, &s, sysblk_ppas);
-
-       mutex_lock(&dev->mlock);
-       ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 0);
-       if (ret)
-               goto err_sysblk;
-
-       /* no sysblocks initialized */
-       if (!s.nr_ppas)
-               goto err_sysblk;
-
-       cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL);
-       if (!cur)
-               goto err_sysblk;
-
-       /* find the latest block across all sysblocks */
-       for (i = 0; i < s.nr_rows; i++) {
-               for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) {
-                       struct ppa_addr ppa = s.ppas[scan_ppa_idx(i, j)];
-
-                       ret = nvm_scan_block(dev, &ppa, cur);
-                       if (ret > 0)
-                               found = 1;
-                       else if (ret < 0)
-                               break;
-               }
-       }
-
-       nvm_sysblk_to_cpu(info, cur);
-
-       kfree(cur);
-err_sysblk:
-       mutex_unlock(&dev->mlock);
-
-       if (found)
-               return 1;
-       return ret;
-}
-
-int nvm_update_sysblock(struct nvm_dev *dev, struct nvm_sb_info *new)
-{
-       /* 1. for each latest superblock
-        * 2. if room
-        *    a. write new flash page entry with the updated information
-        * 3. if no room
-        *    a. find next available block on lun (linear search)
-        *       if none, continue to next lun
-        *       if none at all, report error. also report that it wasn't
-        *       possible to write to all superblocks.
-        *    c. write data to block.
-        */
-       struct ppa_addr sysblk_ppas[MAX_SYSBLKS];
-       struct sysblk_scan s;
-       struct nvm_system_block *cur;
-       int i, j, ppaidx, found = 0;
-       int ret = -ENOMEM;
-
-       if (!dev->ops->get_bb_tbl)
-               return -EINVAL;
-
-       nvm_setup_sysblk_scan(dev, &s, sysblk_ppas);
-
-       mutex_lock(&dev->mlock);
-       ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 0);
-       if (ret)
-               goto err_sysblk;
-
-       cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL);
-       if (!cur)
-               goto err_sysblk;
-
-       /* Get the latest sysblk for each sysblk row */
-       for (i = 0; i < s.nr_rows; i++) {
-               found = 0;
-               for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) {
-                       ppaidx = scan_ppa_idx(i, j);
-                       ret = nvm_scan_block(dev, &s.ppas[ppaidx], cur);
-                       if (ret > 0) {
-                               s.act_blk[i] = j;
-                               found = 1;
-                       } else if (ret < 0)
-                               break;
-               }
-       }
-
-       if (!found) {
-               pr_err("nvm: no valid sysblks found to update\n");
-               ret = -EINVAL;
-               goto err_cur;
-       }
-
-       /*
-        * All sysblocks found. Check that they have same page id in their flash
-        * blocks
-        */
-       for (i = 1; i < s.nr_rows; i++) {
-               struct ppa_addr l = s.ppas[scan_ppa_idx(0, s.act_blk[0])];
-               struct ppa_addr r = s.ppas[scan_ppa_idx(i, s.act_blk[i])];
-
-               if (l.g.pg != r.g.pg) {
-                       pr_err("nvm: sysblks not on same page. Previous update failed.\n");
-                       ret = -EINVAL;
-                       goto err_cur;
-               }
-       }
-
-       /*
-        * Check that there haven't been another update to the seqnr since we
-        * began
-        */
-       if ((new->seqnr - 1) != be32_to_cpu(cur->seqnr)) {
-               pr_err("nvm: seq is not sequential\n");
-               ret = -EINVAL;
-               goto err_cur;
-       }
-
-       /*
-        * When all pages in a block has been written, a new block is selected
-        * and writing is performed on the new block.
-        */
-       if (s.ppas[scan_ppa_idx(0, s.act_blk[0])].g.pg ==
-                                               dev->lps_per_blk - 1) {
-               ret = nvm_prepare_new_sysblks(dev, &s);
-               if (ret)
-                       goto err_cur;
-       }
-
-       ret = nvm_write_and_verify(dev, new, &s);
-err_cur:
-       kfree(cur);
-err_sysblk:
-       mutex_unlock(&dev->mlock);
-
-       return ret;
-}
-
-int nvm_init_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info)
-{
-       struct nvm_geo *geo = &dev->geo;
-       struct ppa_addr sysblk_ppas[MAX_SYSBLKS];
-       struct sysblk_scan s;
-       int ret;
-
-       /*
-        * 1. select master blocks and select first available blks
-        * 2. get bad block list
-        * 3. mark MAX_SYSBLKS block as host-based device allocated.
-        * 4. write and verify data to block
-        */
-
-       if (!dev->ops->get_bb_tbl || !dev->ops->set_bb_tbl)
-               return -EINVAL;
-
-       if (!(geo->mccap & NVM_ID_CAP_SLC) || !dev->lps_per_blk) {
-               pr_err("nvm: memory does not support SLC access\n");
-               return -EINVAL;
-       }
-
-       /* Index all sysblocks and mark them as host-driven */
-       nvm_setup_sysblk_scan(dev, &s, sysblk_ppas);
-
-       mutex_lock(&dev->mlock);
-       ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 1);
-       if (ret)
-               goto err_mark;
-
-       ret = nvm_sysblk_set_bb_tbl(dev, &s, NVM_BLK_T_HOST);
-       if (ret)
-               goto err_mark;
-
-       /* Write to the first block of each row */
-       ret = nvm_write_and_verify(dev, info, &s);
-err_mark:
-       mutex_unlock(&dev->mlock);
-       return ret;
-}
-
-static int factory_nblks(int nblks)
-{
-       /* Round up to nearest BITS_PER_LONG */
-       return (nblks + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
-}
-
-static unsigned int factory_blk_offset(struct nvm_geo *geo, struct ppa_addr ppa)
-{
-       int nblks = factory_nblks(geo->blks_per_lun);
-
-       return ((ppa.g.ch * geo->luns_per_chnl * nblks) + (ppa.g.lun * nblks)) /
-                                                               BITS_PER_LONG;
-}
-
-static int nvm_factory_blks(struct nvm_dev *dev, struct ppa_addr ppa,
-                                       u8 *blks, int nr_blks,
-                                       unsigned long *blk_bitmap, int flags)
-{
-       int i, lunoff;
-
-       nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks);
-       if (nr_blks < 0)
-               return nr_blks;
-
-       lunoff = factory_blk_offset(&dev->geo, ppa);
-
-       /* non-set bits correspond to the block must be erased */
-       for (i = 0; i < nr_blks; i++) {
-               switch (blks[i]) {
-               case NVM_BLK_T_FREE:
-                       if (flags & NVM_FACTORY_ERASE_ONLY_USER)
-                               set_bit(i, &blk_bitmap[lunoff]);
-                       break;
-               case NVM_BLK_T_HOST:
-                       if (!(flags & NVM_FACTORY_RESET_HOST_BLKS))
-                               set_bit(i, &blk_bitmap[lunoff]);
-                       break;
-               case NVM_BLK_T_GRWN_BAD:
-                       if (!(flags & NVM_FACTORY_RESET_GRWN_BBLKS))
-                               set_bit(i, &blk_bitmap[lunoff]);
-                       break;
-               default:
-                       set_bit(i, &blk_bitmap[lunoff]);
-                       break;
-               }
-       }
-
-       return 0;
-}
-
-static int nvm_fact_get_blks(struct nvm_dev *dev, struct ppa_addr *erase_list,
-                                       int max_ppas, unsigned long *blk_bitmap)
-{
-       struct nvm_geo *geo = &dev->geo;
-       struct ppa_addr ppa;
-       int ch, lun, blkid, idx, done = 0, ppa_cnt = 0;
-       unsigned long *offset;
-
-       while (!done) {
-               done = 1;
-               nvm_for_each_lun_ppa(geo, ppa, ch, lun) {
-                       idx = factory_blk_offset(geo, ppa);
-                       offset = &blk_bitmap[idx];
-
-                       blkid = find_first_zero_bit(offset, geo->blks_per_lun);
-                       if (blkid >= geo->blks_per_lun)
-                               continue;
-                       set_bit(blkid, offset);
-
-                       ppa.g.blk = blkid;
-                       pr_debug("nvm: erase ppa (%u %u %u)\n",
-                                                       ppa.g.ch,
-                                                       ppa.g.lun,
-                                                       ppa.g.blk);
-
-                       erase_list[ppa_cnt] = ppa;
-                       ppa_cnt++;
-                       done = 0;
-
-                       if (ppa_cnt == max_ppas)
-                               return ppa_cnt;
-               }
-       }
-
-       return ppa_cnt;
-}
-
-static int nvm_fact_select_blks(struct nvm_dev *dev, unsigned long *blk_bitmap,
-                                                               int flags)
-{
-       struct nvm_geo *geo = &dev->geo;
-       struct ppa_addr ppa;
-       int ch, lun, nr_blks, ret = 0;
-       u8 *blks;
-
-       nr_blks = geo->blks_per_lun * geo->plane_mode;
-       blks = kmalloc(nr_blks, GFP_KERNEL);
-       if (!blks)
-               return -ENOMEM;
-
-       nvm_for_each_lun_ppa(geo, ppa, ch, lun) {
-               ret = nvm_get_bb_tbl(dev, ppa, blks);
-               if (ret)
-                       pr_err("nvm: failed bb tbl for ch%u lun%u\n",
-                                                       ppa.g.ch, ppa.g.blk);
-
-               ret = nvm_factory_blks(dev, ppa, blks, nr_blks, blk_bitmap,
-                                                                       flags);
-               if (ret)
-                       break;
-       }
-
-       kfree(blks);
-       return ret;
-}
-
-int nvm_dev_factory(struct nvm_dev *dev, int flags)
-{
-       struct nvm_geo *geo = &dev->geo;
-       struct ppa_addr *ppas;
-       int ppa_cnt, ret = -ENOMEM;
-       int max_ppas = dev->ops->max_phys_sect / geo->nr_planes;
-       struct ppa_addr sysblk_ppas[MAX_SYSBLKS];
-       struct sysblk_scan s;
-       unsigned long *blk_bitmap;
-
-       blk_bitmap = kzalloc(factory_nblks(geo->blks_per_lun) * geo->nr_luns,
-                                                               GFP_KERNEL);
-       if (!blk_bitmap)
-               return ret;
-
-       ppas = kcalloc(max_ppas, sizeof(struct ppa_addr), GFP_KERNEL);
-       if (!ppas)
-               goto err_blks;
-
-       /* create list of blks to be erased */
-       ret = nvm_fact_select_blks(dev, blk_bitmap, flags);
-       if (ret)
-               goto err_ppas;
-
-       /* continue to erase until list of blks until empty */
-       while ((ppa_cnt =
-                       nvm_fact_get_blks(dev, ppas, max_ppas, blk_bitmap)) > 0)
-               nvm_erase_ppa(dev, ppas, ppa_cnt, 0);
-
-       /* mark host reserved blocks free */
-       if (flags & NVM_FACTORY_RESET_HOST_BLKS) {
-               nvm_setup_sysblk_scan(dev, &s, sysblk_ppas);
-               mutex_lock(&dev->mlock);
-               ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 0);
-               if (!ret)
-                       ret = nvm_sysblk_set_bb_tbl(dev, &s, NVM_BLK_T_FREE);
-               mutex_unlock(&dev->mlock);
-       }
-err_ppas:
-       kfree(ppas);
-err_blks:
-       kfree(blk_bitmap);
-       return ret;
-}
-EXPORT_SYMBOL(nvm_dev_factory);
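
In the removed factory-reset code, factory_nblks() rounds the per-LUN block count up to a multiple of BITS_PER_LONG so that each LUN's slice of blk_bitmap starts on a long boundary, and factory_blk_offset() then divides by BITS_PER_LONG to get a word index. A stand-alone check of that rounding (BITS_PER_LONG fixed to 64 here for illustration; the kernel macro is architecture-dependent):

#include <assert.h>
#include <stdio.h>

#define BITS_PER_LONG 64        /* assumption for this sketch */

/* Same expression as the removed factory_nblks(): round up to a multiple of BITS_PER_LONG. */
static int factory_nblks(int nblks)
{
        return (nblks + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
}

int main(void)
{
        assert(factory_nblks(1)    == 64);
        assert(factory_nblks(64)   == 64);
        assert(factory_nblks(65)   == 128);
        assert(factory_nblks(1020) == 1024);
        printf("rounded sizes are always whole longs\n");
        return 0;
}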
index 76d2087..01035e7 100644 (file)
@@ -666,7 +666,7 @@ static inline struct search *search_alloc(struct bio *bio,
        s->iop.write_prio       = 0;
        s->iop.error            = 0;
        s->iop.flags            = 0;
-       s->iop.flush_journal    = (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA)) != 0;
+       s->iop.flush_journal    = op_is_flush(bio->bi_opf);
        s->iop.wq               = bcache_wq;
 
        return s;
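
This hunk, like the dm-cache and dm-thin hunks below, replaces the open-coded bio->bi_opf & (REQ_PREFLUSH|REQ_FUA) test with the op_is_flush() helper added in the blk_types.h hunk later in this diff. A tiny stand-alone equivalence check, with the flag values mocked purely for illustration (the real values live in blk_types.h):

#include <assert.h>
#include <stdbool.h>

/* Mock flag values for illustration only. */
#define REQ_FUA      (1u << 0)
#define REQ_PREFLUSH (1u << 1)
#define REQ_SYNC     (1u << 2)

static bool op_is_flush(unsigned int op)
{
        return op & (REQ_FUA | REQ_PREFLUSH);
}

int main(void)
{
        unsigned int opf;

        for (opf = 0; opf < 8; opf++)   /* every combination of the three mock flags */
                assert(op_is_flush(opf) == ((opf & (REQ_PREFLUSH | REQ_FUA)) != 0));
        return 0;
}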
index e04c61e..5b9cf56 100644 (file)
@@ -787,8 +787,7 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
        struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 
        spin_lock_irqsave(&cache->lock, flags);
-       if (cache->need_tick_bio &&
-           !(bio->bi_opf & (REQ_FUA | REQ_PREFLUSH)) &&
+       if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
            bio_op(bio) != REQ_OP_DISCARD) {
                pb->tick = true;
                cache->need_tick_bio = false;
@@ -828,11 +827,6 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
        return to_oblock(block_nr);
 }
 
-static int bio_triggers_commit(struct cache *cache, struct bio *bio)
-{
-       return bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
-}
-
 /*
  * You must increment the deferred set whilst the prison cell is held.  To
  * encourage this, we ask for 'cell' to be passed in.
@@ -884,7 +878,7 @@ static void issue(struct cache *cache, struct bio *bio)
 {
        unsigned long flags;
 
-       if (!bio_triggers_commit(cache, bio)) {
+       if (!op_is_flush(bio->bi_opf)) {
                accounted_request(cache, bio);
                return;
        }
@@ -1069,8 +1063,7 @@ static void dec_io_migrations(struct cache *cache)
 
 static bool discard_or_flush(struct bio *bio)
 {
-       return bio_op(bio) == REQ_OP_DISCARD ||
-              bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
+       return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
 }
 
 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
index d1c05c1..110982d 100644 (file)
@@ -699,7 +699,7 @@ static void remap_to_origin(struct thin_c *tc, struct bio *bio)
 
 static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
 {
-       return (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) &&
+       return op_is_flush(bio->bi_opf) &&
                dm_thin_changed_this_transaction(tc->td);
 }
 
@@ -870,8 +870,7 @@ static void __inc_remap_and_issue_cell(void *context,
        struct bio *bio;
 
        while ((bio = bio_list_pop(&cell->bios))) {
-               if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) ||
-                   bio_op(bio) == REQ_OP_DISCARD)
+               if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD)
                        bio_list_add(&info->defer_bios, bio);
                else {
                        inc_all_io_entry(info->tc->pool, bio);
@@ -1716,9 +1715,8 @@ static void __remap_and_issue_shared_cell(void *context,
        struct bio *bio;
 
        while ((bio = bio_list_pop(&cell->bios))) {
-               if ((bio_data_dir(bio) == WRITE) ||
-                   (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) ||
-                    bio_op(bio) == REQ_OP_DISCARD))
+               if (bio_data_dir(bio) == WRITE || op_is_flush(bio->bi_opf) ||
+                   bio_op(bio) == REQ_OP_DISCARD)
                        bio_list_add(&info->defer_bios, bio);
                else {
                        struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));;
@@ -2635,8 +2633,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
                return DM_MAPIO_SUBMITTED;
        }
 
-       if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) ||
-           bio_op(bio) == REQ_OP_DISCARD) {
+       if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) {
                thin_defer_bio_with_throttle(tc, bio);
                return DM_MAPIO_SUBMITTED;
        }
index 8a3c3e3..138c6fa 100644 (file)
@@ -784,6 +784,13 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
                return nvme_sg_io(ns, (void __user *)arg);
 #endif
        default:
+#ifdef CONFIG_NVM
+               if (ns->ndev)
+                       return nvme_nvm_ioctl(ns, cmd, arg);
+#endif
+               if (is_sed_ioctl(cmd))
+                       return sed_ioctl(ns->ctrl->opal_dev, cmd,
+                                        (void __user *) arg);
                return -ENOTTY;
        }
 }
@@ -1051,6 +1058,28 @@ static const struct pr_ops nvme_pr_ops = {
        .pr_clear       = nvme_pr_clear,
 };
 
+#ifdef CONFIG_BLK_SED_OPAL
+int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
+               bool send)
+{
+       struct nvme_ctrl *ctrl = data;
+       struct nvme_command cmd;
+
+       memset(&cmd, 0, sizeof(cmd));
+       if (send)
+               cmd.common.opcode = nvme_admin_security_send;
+       else
+               cmd.common.opcode = nvme_admin_security_recv;
+       cmd.common.nsid = 0;
+       cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
+       cmd.common.cdw10[1] = cpu_to_le32(len);
+
+       return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
+                                     ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0);
+}
+EXPORT_SYMBOL_GPL(nvme_sec_submit);
+#endif /* CONFIG_BLK_SED_OPAL */
+
 static const struct block_device_operations nvme_fops = {
        .owner          = THIS_MODULE,
        .ioctl          = nvme_ioctl,
@@ -1230,6 +1259,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
                return -EIO;
        }
 
+       ctrl->oacs = le16_to_cpu(id->oacs);
        ctrl->vid = le16_to_cpu(id->vid);
        ctrl->oncs = le16_to_cpup(&id->oncs);
        atomic_set(&ctrl->abort_limit, id->acl + 1);
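
The new nvme_sec_submit() above packs the Security Protocol (secp) into bits 31:24 and the protocol-specific field (spsp) into bits 23:8 of CDW10, with the transfer length in the following dword. A stand-alone sketch of just that dword packing; the secp/spsp values below are illustrative placeholders, not values mandated by the patch:

#include <stdint.h>
#include <stdio.h>

/* Same packing as nvme_sec_submit(): secp in bits 31:24, spsp in bits 23:8. */
static uint32_t sec_cdw10(uint8_t secp, uint16_t spsp)
{
        return ((uint32_t)secp << 24) | ((uint32_t)spsp << 8);
}

int main(void)
{
        /* Illustrative values only. */
        printf("cdw10 = 0x%08x\n", sec_cdw10(0x01, 0x0001));    /* prints 0x01000100 */
        printf("cdw11 = %u (transfer length in bytes)\n", 512u);
        return 0;
}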
index 588d4a3..21cac85 100644 (file)
@@ -26,6 +26,8 @@
 #include <linux/bitops.h>
 #include <linux/lightnvm.h>
 #include <linux/vmalloc.h>
+#include <linux/sched/sysctl.h>
+#include <uapi/linux/lightnvm.h>
 
 enum nvme_nvm_admin_opcode {
        nvme_nvm_admin_identity         = 0xe2,
@@ -248,50 +250,48 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id)
 {
        struct nvme_nvm_id_group *src;
        struct nvm_id_group *dst;
-       int i, end;
-
-       end = min_t(u32, 4, nvm_id->cgrps);
-
-       for (i = 0; i < end; i++) {
-               src = &nvme_nvm_id->groups[i];
-               dst = &nvm_id->groups[i];
-
-               dst->mtype = src->mtype;
-               dst->fmtype = src->fmtype;
-               dst->num_ch = src->num_ch;
-               dst->num_lun = src->num_lun;
-               dst->num_pln = src->num_pln;
-
-               dst->num_pg = le16_to_cpu(src->num_pg);
-               dst->num_blk = le16_to_cpu(src->num_blk);
-               dst->fpg_sz = le16_to_cpu(src->fpg_sz);
-               dst->csecs = le16_to_cpu(src->csecs);
-               dst->sos = le16_to_cpu(src->sos);
-
-               dst->trdt = le32_to_cpu(src->trdt);
-               dst->trdm = le32_to_cpu(src->trdm);
-               dst->tprt = le32_to_cpu(src->tprt);
-               dst->tprm = le32_to_cpu(src->tprm);
-               dst->tbet = le32_to_cpu(src->tbet);
-               dst->tbem = le32_to_cpu(src->tbem);
-               dst->mpos = le32_to_cpu(src->mpos);
-               dst->mccap = le32_to_cpu(src->mccap);
-
-               dst->cpar = le16_to_cpu(src->cpar);
-
-               if (dst->fmtype == NVM_ID_FMTYPE_MLC) {
-                       memcpy(dst->lptbl.id, src->lptbl.id, 8);
-                       dst->lptbl.mlc.num_pairs =
-                                       le16_to_cpu(src->lptbl.mlc.num_pairs);
-
-                       if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) {
-                               pr_err("nvm: number of MLC pairs not supported\n");
-                               return -EINVAL;
-                       }
 
-                       memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs,
-                                               dst->lptbl.mlc.num_pairs);
+       if (nvme_nvm_id->cgrps != 1)
+               return -EINVAL;
+
+       src = &nvme_nvm_id->groups[0];
+       dst = &nvm_id->grp;
+
+       dst->mtype = src->mtype;
+       dst->fmtype = src->fmtype;
+       dst->num_ch = src->num_ch;
+       dst->num_lun = src->num_lun;
+       dst->num_pln = src->num_pln;
+
+       dst->num_pg = le16_to_cpu(src->num_pg);
+       dst->num_blk = le16_to_cpu(src->num_blk);
+       dst->fpg_sz = le16_to_cpu(src->fpg_sz);
+       dst->csecs = le16_to_cpu(src->csecs);
+       dst->sos = le16_to_cpu(src->sos);
+
+       dst->trdt = le32_to_cpu(src->trdt);
+       dst->trdm = le32_to_cpu(src->trdm);
+       dst->tprt = le32_to_cpu(src->tprt);
+       dst->tprm = le32_to_cpu(src->tprm);
+       dst->tbet = le32_to_cpu(src->tbet);
+       dst->tbem = le32_to_cpu(src->tbem);
+       dst->mpos = le32_to_cpu(src->mpos);
+       dst->mccap = le32_to_cpu(src->mccap);
+
+       dst->cpar = le16_to_cpu(src->cpar);
+
+       if (dst->fmtype == NVM_ID_FMTYPE_MLC) {
+               memcpy(dst->lptbl.id, src->lptbl.id, 8);
+               dst->lptbl.mlc.num_pairs =
+                               le16_to_cpu(src->lptbl.mlc.num_pairs);
+
+               if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) {
+                       pr_err("nvm: number of MLC pairs not supported\n");
+                       return -EINVAL;
                }
+
+               memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs,
+                                       dst->lptbl.mlc.num_pairs);
        }
 
        return 0;
@@ -321,7 +321,6 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id)
 
        nvm_id->ver_id = nvme_nvm_id->ver_id;
        nvm_id->vmnt = nvme_nvm_id->vmnt;
-       nvm_id->cgrps = nvme_nvm_id->cgrps;
        nvm_id->cap = le32_to_cpu(nvme_nvm_id->cap);
        nvm_id->dom = le32_to_cpu(nvme_nvm_id->dom);
        memcpy(&nvm_id->ppaf, &nvme_nvm_id->ppaf,
@@ -372,7 +371,7 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb,
                }
 
                /* Transform physical address to target address space */
-               nvmdev->mt->part_to_tgt(nvmdev, entries, cmd_nlb);
+               nvm_part_to_tgt(nvmdev, entries, cmd_nlb);
 
                if (update_l2p(cmd_slba, cmd_nlb, entries, priv)) {
                        ret = -EINTR;
@@ -485,7 +484,8 @@ static void nvme_nvm_end_io(struct request *rq, int error)
        struct nvm_rq *rqd = rq->end_io_data;
 
        rqd->ppa_status = nvme_req(rq)->result.u64;
-       nvm_end_io(rqd, error);
+       rqd->error = error;
+       nvm_end_io(rqd);
 
        kfree(nvme_req(rq)->cmd);
        blk_mq_free_request(rq);
@@ -586,6 +586,224 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = {
        .max_phys_sect          = 64,
 };
 
+static void nvme_nvm_end_user_vio(struct request *rq, int error)
+{
+       struct completion *waiting = rq->end_io_data;
+
+       complete(waiting);
+}
+
+static int nvme_nvm_submit_user_cmd(struct request_queue *q,
+                               struct nvme_ns *ns,
+                               struct nvme_nvm_command *vcmd,
+                               void __user *ubuf, unsigned int bufflen,
+                               void __user *meta_buf, unsigned int meta_len,
+                               void __user *ppa_buf, unsigned int ppa_len,
+                               u32 *result, u64 *status, unsigned int timeout)
+{
+       bool write = nvme_is_write((struct nvme_command *)vcmd);
+       struct nvm_dev *dev = ns->ndev;
+       struct gendisk *disk = ns->disk;
+       struct request *rq;
+       struct bio *bio = NULL;
+       __le64 *ppa_list = NULL;
+       dma_addr_t ppa_dma;
+       __le64 *metadata = NULL;
+       dma_addr_t metadata_dma;
+       DECLARE_COMPLETION_ONSTACK(wait);
+       int ret;
+
+       rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0,
+                       NVME_QID_ANY);
+       if (IS_ERR(rq)) {
+               ret = -ENOMEM;
+               goto err_cmd;
+       }
+
+       rq->timeout = timeout ? timeout : ADMIN_TIMEOUT;
+
+       rq->cmd_flags &= ~REQ_FAILFAST_DRIVER;
+       rq->end_io_data = &wait;
+
+       if (ppa_buf && ppa_len) {
+               ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma);
+               if (!ppa_list) {
+                       ret = -ENOMEM;
+                       goto err_rq;
+               }
+               if (copy_from_user(ppa_list, (void __user *)ppa_buf,
+                                               sizeof(u64) * (ppa_len + 1))) {
+                       ret = -EFAULT;
+                       goto err_ppa;
+               }
+               vcmd->ph_rw.spba = cpu_to_le64(ppa_dma);
+       } else {
+               vcmd->ph_rw.spba = cpu_to_le64((uintptr_t)ppa_buf);
+       }
+
+       if (ubuf && bufflen) {
+               ret = blk_rq_map_user(q, rq, NULL, ubuf, bufflen, GFP_KERNEL);
+               if (ret)
+                       goto err_ppa;
+               bio = rq->bio;
+
+               if (meta_buf && meta_len) {
+                       metadata = dma_pool_alloc(dev->dma_pool, GFP_KERNEL,
+                                                               &metadata_dma);
+                       if (!metadata) {
+                               ret = -ENOMEM;
+                               goto err_map;
+                       }
+
+                       if (write) {
+                               if (copy_from_user(metadata,
+                                               (void __user *)meta_buf,
+                                               meta_len)) {
+                                       ret = -EFAULT;
+                                       goto err_meta;
+                               }
+                       }
+                       vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma);
+               }
+
+               if (!disk)
+                       goto submit;
+
+               bio->bi_bdev = bdget_disk(disk, 0);
+               if (!bio->bi_bdev) {
+                       ret = -ENODEV;
+                       goto err_meta;
+               }
+       }
+
+submit:
+       blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_user_vio);
+
+       wait_for_completion_io(&wait);
+
+       ret = nvme_error_status(rq->errors);
+       if (result)
+               *result = rq->errors & 0x7ff;
+       if (status)
+               *status = le64_to_cpu(nvme_req(rq)->result.u64);
+
+       if (metadata && !ret && !write) {
+               if (copy_to_user(meta_buf, (void *)metadata, meta_len))
+                       ret = -EFAULT;
+       }
+err_meta:
+       if (meta_buf && meta_len)
+               dma_pool_free(dev->dma_pool, metadata, metadata_dma);
+err_map:
+       if (bio) {
+               if (disk && bio->bi_bdev)
+                       bdput(bio->bi_bdev);
+               blk_rq_unmap_user(bio);
+       }
+err_ppa:
+       if (ppa_buf && ppa_len)
+               dma_pool_free(dev->dma_pool, ppa_list, ppa_dma);
+err_rq:
+       blk_mq_free_request(rq);
+err_cmd:
+       return ret;
+}
+
+static int nvme_nvm_submit_vio(struct nvme_ns *ns,
+                                       struct nvm_user_vio __user *uvio)
+{
+       struct nvm_user_vio vio;
+       struct nvme_nvm_command c;
+       unsigned int length;
+       int ret;
+
+       if (copy_from_user(&vio, uvio, sizeof(vio)))
+               return -EFAULT;
+       if (vio.flags)
+               return -EINVAL;
+
+       memset(&c, 0, sizeof(c));
+       c.ph_rw.opcode = vio.opcode;
+       c.ph_rw.nsid = cpu_to_le32(ns->ns_id);
+       c.ph_rw.control = cpu_to_le16(vio.control);
+       c.ph_rw.length = cpu_to_le16(vio.nppas);
+
+       length = (vio.nppas + 1) << ns->lba_shift;
+
+       ret = nvme_nvm_submit_user_cmd(ns->queue, ns, &c,
+                       (void __user *)(uintptr_t)vio.addr, length,
+                       (void __user *)(uintptr_t)vio.metadata,
+                                                       vio.metadata_len,
+                       (void __user *)(uintptr_t)vio.ppa_list, vio.nppas,
+                       &vio.result, &vio.status, 0);
+
+       if (ret && copy_to_user(uvio, &vio, sizeof(vio)))
+               return -EFAULT;
+
+       return ret;
+}
+
+static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin,
+                                       struct nvm_passthru_vio __user *uvcmd)
+{
+       struct nvm_passthru_vio vcmd;
+       struct nvme_nvm_command c;
+       struct request_queue *q;
+       unsigned int timeout = 0;
+       int ret;
+
+       if (copy_from_user(&vcmd, uvcmd, sizeof(vcmd)))
+               return -EFAULT;
+       if ((vcmd.opcode != 0xF2) && (!capable(CAP_SYS_ADMIN)))
+               return -EACCES;
+       if (vcmd.flags)
+               return -EINVAL;
+
+       memset(&c, 0, sizeof(c));
+       c.common.opcode = vcmd.opcode;
+       c.common.nsid = cpu_to_le32(ns->ns_id);
+       c.common.cdw2[0] = cpu_to_le32(vcmd.cdw2);
+       c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3);
+       /* cdw11-12 */
+       c.ph_rw.length = cpu_to_le16(vcmd.nppas);
+       c.ph_rw.control  = cpu_to_le32(vcmd.control);
+       c.common.cdw10[3] = cpu_to_le32(vcmd.cdw13);
+       c.common.cdw10[4] = cpu_to_le32(vcmd.cdw14);
+       c.common.cdw10[5] = cpu_to_le32(vcmd.cdw15);
+
+       if (vcmd.timeout_ms)
+               timeout = msecs_to_jiffies(vcmd.timeout_ms);
+
+       q = admin ? ns->ctrl->admin_q : ns->queue;
+
+       ret = nvme_nvm_submit_user_cmd(q, ns,
+                       (struct nvme_nvm_command *)&c,
+                       (void __user *)(uintptr_t)vcmd.addr, vcmd.data_len,
+                       (void __user *)(uintptr_t)vcmd.metadata,
+                                                       vcmd.metadata_len,
+                       (void __user *)(uintptr_t)vcmd.ppa_list, vcmd.nppas,
+                       &vcmd.result, &vcmd.status, timeout);
+
+       if (ret && copy_to_user(uvcmd, &vcmd, sizeof(vcmd)))
+               return -EFAULT;
+
+       return ret;
+}
+
+int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg)
+{
+       switch (cmd) {
+       case NVME_NVM_IOCTL_ADMIN_VIO:
+               return nvme_nvm_user_vcmd(ns, 1, (void __user *)arg);
+       case NVME_NVM_IOCTL_IO_VIO:
+               return nvme_nvm_user_vcmd(ns, 0, (void __user *)arg);
+       case NVME_NVM_IOCTL_SUBMIT_VIO:
+               return nvme_nvm_submit_vio(ns, (void __user *)arg);
+       default:
+               return -ENOTTY;
+       }
+}
+
 int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
 {
        struct request_queue *q = ns->queue;
@@ -622,7 +840,7 @@ static ssize_t nvm_dev_attr_show(struct device *dev,
                return 0;
 
        id = &ndev->identity;
-       grp = &id->groups[0];
+       grp = &id->grp;
        attr = &dattr->attr;
 
        if (strcmp(attr->name, "version") == 0) {
@@ -633,10 +851,9 @@ static ssize_t nvm_dev_attr_show(struct device *dev,
                return scnprintf(page, PAGE_SIZE, "%u\n", id->cap);
        } else if (strcmp(attr->name, "device_mode") == 0) {
                return scnprintf(page, PAGE_SIZE, "%u\n", id->dom);
+       /* kept for compatibility */
        } else if (strcmp(attr->name, "media_manager") == 0) {
-               if (!ndev->mt)
-                       return scnprintf(page, PAGE_SIZE, "%s\n", "none");
-               return scnprintf(page, PAGE_SIZE, "%s\n", ndev->mt->name);
+               return scnprintf(page, PAGE_SIZE, "%s\n", "gennvm");
        } else if (strcmp(attr->name, "ppa_format") == 0) {
                return scnprintf(page, PAGE_SIZE,
                        "0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
index aead6d0..14cfc6f 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/kref.h>
 #include <linux/blk-mq.h>
 #include <linux/lightnvm.h>
+#include <linux/sed-opal.h>
 
 enum {
        /*
@@ -125,6 +126,8 @@ struct nvme_ctrl {
        struct list_head node;
        struct ida ns_ida;
 
+       struct opal_dev *opal_dev;
+
        char name[12];
        char serial[20];
        char model[40];
@@ -137,6 +140,7 @@ struct nvme_ctrl {
        u32 max_hw_sectors;
        u16 oncs;
        u16 vid;
+       u16 oacs;
        atomic_t abort_limit;
        u8 event_limit;
        u8 vwc;
@@ -267,6 +271,9 @@ int nvme_init_identify(struct nvme_ctrl *ctrl);
 void nvme_queue_scan(struct nvme_ctrl *ctrl);
 void nvme_remove_namespaces(struct nvme_ctrl *ctrl);
 
+int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
+               bool send);
+
 #define NVME_NR_AERS   1
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
                union nvme_result *res);
@@ -318,6 +325,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
 void nvme_nvm_unregister(struct nvme_ns *ns);
 int nvme_nvm_register_sysfs(struct nvme_ns *ns);
 void nvme_nvm_unregister_sysfs(struct nvme_ns *ns);
+int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg);
 #else
 static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name,
                                    int node)
@@ -335,6 +343,11 @@ static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *i
 {
        return 0;
 }
+static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd,
+                                                       unsigned long arg)
+{
+       return -ENOTTY;
+}
 #endif /* CONFIG_NVM */
 
 static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
index 3faefab..d67d0d0 100644 (file)
@@ -43,6 +43,7 @@
 #include <linux/types.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <asm/unaligned.h>
+#include <linux/sed-opal.h>
 
 #include "nvme.h"
 
@@ -895,12 +896,11 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
                return BLK_EH_HANDLED;
        }
 
-       iod->aborted = 1;
-
        if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
                atomic_inc(&dev->ctrl.abort_limit);
                return BLK_EH_RESET_TIMER;
        }
+       iod->aborted = 1;
 
        memset(&cmd, 0, sizeof(cmd));
        cmd.abort.opcode = nvme_admin_abort_cmd;
@@ -1178,6 +1178,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
                dev->admin_tagset.timeout = ADMIN_TIMEOUT;
                dev->admin_tagset.numa_node = dev_to_node(dev->dev);
                dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
+               dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
                dev->admin_tagset.driver_data = dev;
 
                if (blk_mq_alloc_tag_set(&dev->admin_tagset))
@@ -1738,6 +1739,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
        if (dev->ctrl.admin_q)
                blk_put_queue(dev->ctrl.admin_q);
        kfree(dev->queues);
+       kfree(dev->ctrl.opal_dev);
        kfree(dev);
 }
 
@@ -1754,6 +1756,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
 static void nvme_reset_work(struct work_struct *work)
 {
        struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
+       bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
        int result = -ENODEV;
 
        if (WARN_ON(dev->ctrl.state == NVME_CTRL_RESETTING))
@@ -1786,6 +1789,14 @@ static void nvme_reset_work(struct work_struct *work)
        if (result)
                goto out;
 
+       if ((dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) && !dev->ctrl.opal_dev) {
+               dev->ctrl.opal_dev =
+                       init_opal_dev(&dev->ctrl, &nvme_sec_submit);
+       }
+
+       if (was_suspend)
+               opal_unlock_from_suspend(dev->ctrl.opal_dev);
+
        result = nvme_setup_io_queues(dev);
        if (result)
                goto out;
index 94352e4..013bfe0 100644 (file)
@@ -117,7 +117,7 @@ static unsigned int sr_check_events(struct cdrom_device_info *cdi,
                                    unsigned int clearing, int slot);
 static int sr_packet(struct cdrom_device_info *, struct packet_command *);
 
-static struct cdrom_device_ops sr_dops = {
+static const struct cdrom_device_ops sr_dops = {
        .open                   = sr_open,
        .release                = sr_release,
        .drive_status           = sr_drive_status,
index 4a2ab5d..8e4df3d 100644 (file)
@@ -22,6 +22,7 @@ struct blk_mq_hw_ctx {
 
        unsigned long           flags;          /* BLK_MQ_F_* flags */
 
+       void                    *sched_data;
        struct request_queue    *queue;
        struct blk_flush_queue  *fq;
 
@@ -35,6 +36,7 @@ struct blk_mq_hw_ctx {
        atomic_t                wait_index;
 
        struct blk_mq_tags      *tags;
+       struct blk_mq_tags      *sched_tags;
 
        struct srcu_struct      queue_rq_srcu;
 
@@ -60,7 +62,7 @@ struct blk_mq_hw_ctx {
 
 struct blk_mq_tag_set {
        unsigned int            *mq_map;
-       struct blk_mq_ops       *ops;
+       const struct blk_mq_ops *ops;
        unsigned int            nr_hw_queues;
        unsigned int            queue_depth;    /* max hw supported */
        unsigned int            reserved_tags;
@@ -151,11 +153,13 @@ enum {
        BLK_MQ_F_SG_MERGE       = 1 << 2,
        BLK_MQ_F_DEFER_ISSUE    = 1 << 4,
        BLK_MQ_F_BLOCKING       = 1 << 5,
+       BLK_MQ_F_NO_SCHED       = 1 << 6,
        BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
        BLK_MQ_F_ALLOC_POLICY_BITS = 1,
 
        BLK_MQ_S_STOPPED        = 0,
        BLK_MQ_S_TAG_ACTIVE     = 1,
+       BLK_MQ_S_SCHED_RESTART  = 2,
 
        BLK_MQ_MAX_DEPTH        = 10240,
 
@@ -179,14 +183,13 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
 
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
-void blk_mq_insert_request(struct request *, bool, bool, bool);
 void blk_mq_free_request(struct request *rq);
-void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 
 enum {
        BLK_MQ_REQ_NOWAIT       = (1 << 0), /* return when out of requests */
        BLK_MQ_REQ_RESERVED     = (1 << 1), /* allocate from reserved pool */
+       BLK_MQ_REQ_INTERNAL     = (1 << 2), /* allocate internal/sched tag */
 };
 
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
index 519ea2c..37c9a43 100644 (file)
@@ -221,6 +221,15 @@ static inline bool op_is_write(unsigned int op)
 }
 
 /*
+ * Check if the bio or request is one that needs special treatment in the
+ * flush state machine.
+ */
+static inline bool op_is_flush(unsigned int op)
+{
+       return op & (REQ_FUA | REQ_PREFLUSH);
+}
+
+/*
  * Reads are always treated as synchronous, as are requests with the FUA or
  * PREFLUSH flag.  Other operations may be marked as synchronous using the
  * REQ_SYNC flag.
@@ -232,22 +241,29 @@ static inline bool op_is_sync(unsigned int op)
 }
 
 typedef unsigned int blk_qc_t;
-#define BLK_QC_T_NONE  -1U
-#define BLK_QC_T_SHIFT 16
+#define BLK_QC_T_NONE          -1U
+#define BLK_QC_T_SHIFT         16
+#define BLK_QC_T_INTERNAL      (1U << 31)
 
 static inline bool blk_qc_t_valid(blk_qc_t cookie)
 {
        return cookie != BLK_QC_T_NONE;
 }
 
-static inline blk_qc_t blk_tag_to_qc_t(unsigned int tag, unsigned int queue_num)
+static inline blk_qc_t blk_tag_to_qc_t(unsigned int tag, unsigned int queue_num,
+                                      bool internal)
 {
-       return tag | (queue_num << BLK_QC_T_SHIFT);
+       blk_qc_t ret = tag | (queue_num << BLK_QC_T_SHIFT);
+
+       if (internal)
+               ret |= BLK_QC_T_INTERNAL;
+
+       return ret;
 }
 
 static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)
 {
-       return cookie >> BLK_QC_T_SHIFT;
+       return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT;
 }
 
 static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
@@ -255,6 +271,11 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
        return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
 }
 
+static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
+{
+       return (cookie & BLK_QC_T_INTERNAL) != 0;
+}
+
 struct blk_issue_stat {
        u64 time;
 };
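
The widened cookie helpers above reserve bit 31 to flag an internal (scheduler) tag, while the queue number and driver tag keep their fields split at BLK_QC_T_SHIFT. A stand-alone round-trip check built from the same definitions, copied here so the sketch compiles on its own:

#include <assert.h>
#include <stdbool.h>

typedef unsigned int blk_qc_t;
#define BLK_QC_T_SHIFT    16
#define BLK_QC_T_INTERNAL (1U << 31)

/* Same logic as the blk_types.h helpers above. */
static blk_qc_t blk_tag_to_qc_t(unsigned int tag, unsigned int queue_num, bool internal)
{
        blk_qc_t ret = tag | (queue_num << BLK_QC_T_SHIFT);

        if (internal)
                ret |= BLK_QC_T_INTERNAL;
        return ret;
}

static unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)
{
        return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT;
}

static unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
{
        return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
}

static bool blk_qc_t_is_internal(blk_qc_t cookie)
{
        return (cookie & BLK_QC_T_INTERNAL) != 0;
}

int main(void)
{
        blk_qc_t c = blk_tag_to_qc_t(42, 3, true);

        assert(blk_qc_t_to_tag(c) == 42);
        assert(blk_qc_t_to_queue_num(c) == 3);
        assert(blk_qc_t_is_internal(c));
        return 0;
}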
index 1ca8e8f..05675b1 100644 (file)
@@ -154,6 +154,7 @@ struct request {
 
        /* the following two fields are internal, NEVER access directly */
        unsigned int __data_len;        /* total data len */
+       int tag;
        sector_t __sector;              /* sector cursor */
 
        struct bio *bio;
@@ -220,9 +221,10 @@ struct request {
 
        unsigned short ioprio;
 
+       int internal_tag;
+
        void *special;          /* opaque pointer available for LLD use */
 
-       int tag;
        int errors;
 
        /*
@@ -407,7 +409,7 @@ struct request_queue {
        dma_drain_needed_fn     *dma_drain_needed;
        lld_busy_fn             *lld_busy_fn;
 
-       struct blk_mq_ops       *mq_ops;
+       const struct blk_mq_ops *mq_ops;
 
        unsigned int            *mq_map;
 
@@ -569,6 +571,11 @@ struct request_queue {
        struct list_head        tag_set_list;
        struct bio_set          *bio_split;
 
+#ifdef CONFIG_DEBUG_FS
+       struct dentry           *debugfs_dir;
+       struct dentry           *mq_debugfs_dir;
+#endif
+
        bool                    mq_sysfs_init_done;
 };
 
@@ -600,6 +607,7 @@ struct request_queue {
 #define QUEUE_FLAG_FLUSH_NQ    25      /* flush not queueuable */
 #define QUEUE_FLAG_DAX         26      /* device supports DAX */
 #define QUEUE_FLAG_STATS       27      /* track rq completion times */
+#define QUEUE_FLAG_RESTART     28      /* queue needs restart at completion */
 
 #define QUEUE_FLAG_DEFAULT     ((1 << QUEUE_FLAG_IO_STAT) |            \
                                 (1 << QUEUE_FLAG_STACKABLE)    |       \
@@ -1620,6 +1628,25 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
        return __bvec_gap_to_prev(q, bprv, offset);
 }
 
+/*
+ * Check if the two bvecs from two bios can be merged to one segment.
+ * If yes, no need to check gap between the two bios since the 1st bio
+ * and the 1st bvec in the 2nd bio can be handled in one segment.
+ */
+static inline bool bios_segs_mergeable(struct request_queue *q,
+               struct bio *prev, struct bio_vec *prev_last_bv,
+               struct bio_vec *next_first_bv)
+{
+       if (!BIOVEC_PHYS_MERGEABLE(prev_last_bv, next_first_bv))
+               return false;
+       if (!BIOVEC_SEG_BOUNDARY(q, prev_last_bv, next_first_bv))
+               return false;
+       if (prev->bi_seg_back_size + next_first_bv->bv_len >
+                       queue_max_segment_size(q))
+               return false;
+       return true;
+}
+
 static inline bool bio_will_gap(struct request_queue *q, struct bio *prev,
                         struct bio *next)
 {
@@ -1629,7 +1656,8 @@ static inline bool bio_will_gap(struct request_queue *q, struct bio *prev,
                bio_get_last_bvec(prev, &pb);
                bio_get_first_bvec(next, &nb);
 
-               return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
+               if (!bios_segs_mergeable(q, prev, &pb, &nb))
+                       return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
        }
 
        return false;
index 8609d57..6e8f209 100644 (file)
@@ -36,7 +36,7 @@ struct packet_command
 
 /* Uniform cdrom data structures for cdrom.c */
 struct cdrom_device_info {
-       struct cdrom_device_ops  *ops;  /* link to device_ops */
+       const struct cdrom_device_ops *ops; /* link to device_ops */
        struct list_head list;          /* linked list of all device_info */
        struct gendisk *disk;           /* matching block layer disk */
        void *handle;                   /* driver-dependent data */
@@ -87,7 +87,6 @@ struct cdrom_device_ops {
 
 /* driver specifications */
        const int capability;   /* capability flags */
-       int n_minors;           /* number of active minor devices */
        /* handle uniform packets for scsi type devices (scsi,atapi) */
        int (*generic_packet) (struct cdrom_device_info *,
                               struct packet_command *);
@@ -123,6 +122,8 @@ extern int cdrom_mode_sense(struct cdrom_device_info *cdi,
                            int page_code, int page_control);
 extern void init_cdrom_command(struct packet_command *cgc,
                               void *buffer, int len, int type);
+extern int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi,
+                                     struct packet_command *cgc);
 
 /* The SCSI spec says there could be 256 slots. */
 #define CDROM_MAX_SLOTS        256
index b276e9e..b5825c4 100644 (file)
@@ -77,6 +77,34 @@ struct elevator_ops
        elevator_registered_fn *elevator_registered_fn;
 };
 
+struct blk_mq_alloc_data;
+struct blk_mq_hw_ctx;
+
+struct elevator_mq_ops {
+       int (*init_sched)(struct request_queue *, struct elevator_type *);
+       void (*exit_sched)(struct elevator_queue *);
+
+       bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
+       bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *);
+       int (*request_merge)(struct request_queue *q, struct request **, struct bio *);
+       void (*request_merged)(struct request_queue *, struct request *, int);
+       void (*requests_merged)(struct request_queue *, struct request *, struct request *);
+       struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *);
+       void (*put_request)(struct request *);
+       void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool);
+       struct request *(*dispatch_request)(struct blk_mq_hw_ctx *);
+       bool (*has_work)(struct blk_mq_hw_ctx *);
+       void (*completed_request)(struct blk_mq_hw_ctx *, struct request *);
+       void (*started_request)(struct request *);
+       void (*requeue_request)(struct request *);
+       struct request *(*former_request)(struct request_queue *, struct request *);
+       struct request *(*next_request)(struct request_queue *, struct request *);
+       int (*get_rq_priv)(struct request_queue *, struct request *);
+       void (*put_rq_priv)(struct request_queue *, struct request *);
+       void (*init_icq)(struct io_cq *);
+       void (*exit_icq)(struct io_cq *);
+};
+
 #define ELV_NAME_MAX   (16)
 
 struct elv_fs_entry {
@@ -94,12 +122,16 @@ struct elevator_type
        struct kmem_cache *icq_cache;
 
        /* fields provided by elevator implementation */
-       struct elevator_ops ops;
+       union {
+               struct elevator_ops sq;
+               struct elevator_mq_ops mq;
+       } ops;
        size_t icq_size;        /* see iocontext.h */
        size_t icq_align;       /* ditto */
        struct elv_fs_entry *elevator_attrs;
        char elevator_name[ELV_NAME_MAX];
        struct module *elevator_owner;
+       bool uses_mq;
 
        /* managed by elevator core */
        char icq_cache_name[ELV_NAME_MAX + 5];  /* elvname + "_io_cq" */
@@ -123,6 +155,7 @@ struct elevator_queue
        struct kobject kobj;
        struct mutex sysfs_lock;
        unsigned int registered:1;
+       unsigned int uses_mq:1;
        DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
 };
 
@@ -139,6 +172,7 @@ extern void elv_merge_requests(struct request_queue *, struct request *,
 extern void elv_merged_request(struct request_queue *, struct request *, int);
 extern void elv_bio_merged(struct request_queue *q, struct request *,
                                struct bio *);
+extern bool elv_attempt_insert_merge(struct request_queue *, struct request *);
 extern void elv_requeue_request(struct request_queue *, struct request *);
 extern struct request *elv_former_request(struct request_queue *, struct request *);
 extern struct request *elv_latter_request(struct request_queue *, struct request *);
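
The new ops union and uses_mq flag let a blk-mq I/O scheduler register through
the same elevator_type as the legacy schedulers, filling in ops.mq instead of
ops.sq (block/mq-deadline.c added in this series is the in-tree user). A
minimal registration sketch; the foo_* callbacks are hypothetical placeholders
and only a few of the hooks are shown:

    #include <linux/elevator.h>
    #include <linux/module.h>

    static int foo_init_sched(struct request_queue *q, struct elevator_type *e);
    static void foo_exit_sched(struct elevator_queue *eq);
    static void foo_insert_requests(struct blk_mq_hw_ctx *hctx,
                                    struct list_head *list, bool at_head);
    static struct request *foo_dispatch_request(struct blk_mq_hw_ctx *hctx);
    static bool foo_has_work(struct blk_mq_hw_ctx *hctx);

    static struct elevator_type foo_mq_sched = {
            .ops.mq = {
                    .init_sched       = foo_init_sched,
                    .exit_sched       = foo_exit_sched,
                    .insert_requests  = foo_insert_requests,
                    .dispatch_request = foo_dispatch_request,
                    .has_work         = foo_has_work,
            },
            .uses_mq        = true,
            .elevator_name  = "foo-mq",
            .elevator_owner = THIS_MODULE,
    };

    static int __init foo_sched_init(void)
    {
            return elv_register(&foo_mq_sched);
    }
    module_init(foo_sched_init);
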
index 7c273bb..ca45e4a 100644 (file)
@@ -80,8 +80,6 @@ struct nvm_dev_ops {
        unsigned int            max_phys_sect;
 };
 
-
-
 #ifdef CONFIG_NVM
 
 #include <linux/blkdev.h>
@@ -109,6 +107,7 @@ enum {
        NVM_RSP_ERR_FAILWRITE   = 0x40ff,
        NVM_RSP_ERR_EMPTYPAGE   = 0x42ff,
        NVM_RSP_ERR_FAILECC     = 0x4281,
+       NVM_RSP_ERR_FAILCRC     = 0x4004,
        NVM_RSP_WARN_HIGHECC    = 0x4700,
 
        /* Device opcodes */
@@ -202,11 +201,10 @@ struct nvm_addr_format {
 struct nvm_id {
        u8      ver_id;
        u8      vmnt;
-       u8      cgrps;
        u32     cap;
        u32     dom;
        struct nvm_addr_format ppaf;
-       struct nvm_id_group groups[4];
+       struct nvm_id_group grp;
 } __packed;
 
 struct nvm_target {
@@ -216,10 +214,6 @@ struct nvm_target {
        struct gendisk *disk;
 };
 
-struct nvm_tgt_instance {
-       struct nvm_tgt_type *tt;
-};
-
 #define ADDR_EMPTY (~0ULL)
 
 #define NVM_VERSION_MAJOR 1
@@ -230,7 +224,6 @@ struct nvm_rq;
 typedef void (nvm_end_io_fn)(struct nvm_rq *);
 
 struct nvm_rq {
-       struct nvm_tgt_instance *ins;
        struct nvm_tgt_dev *dev;
 
        struct bio *bio;
@@ -254,6 +247,8 @@ struct nvm_rq {
 
        u64 ppa_status; /* ppa media status */
        int error;
+
+       void *private;
 };
 
 static inline struct nvm_rq *nvm_rq_from_pdu(void *pdu)
@@ -272,15 +267,6 @@ enum {
        NVM_BLK_ST_BAD =        0x8,    /* Bad block */
 };
 
-/* system block cpu representation */
-struct nvm_sb_info {
-       unsigned long           seqnr;
-       unsigned long           erase_cnt;
-       unsigned int            version;
-       char                    mmtype[NVM_MMTYPE_LEN];
-       struct ppa_addr         fs_ppa;
-};
-
 /* Device generic information */
 struct nvm_geo {
        int nr_chnls;
@@ -308,6 +294,7 @@ struct nvm_geo {
        int sec_per_lun;
 };
 
+/* sub-device structure */
 struct nvm_tgt_dev {
        /* Device information */
        struct nvm_geo geo;
@@ -329,17 +316,10 @@ struct nvm_dev {
 
        struct list_head devices;
 
-       /* Media manager */
-       struct nvmm_type *mt;
-       void *mp;
-
-       /* System blocks */
-       struct nvm_sb_info sb;
-
        /* Device information */
        struct nvm_geo geo;
 
        /* lower page table */
        int lps_per_blk;
        int *lptbl;
 
@@ -359,6 +339,10 @@ struct nvm_dev {
 
        struct mutex mlock;
        spinlock_t lock;
+
+       /* target management */
+       struct list_head area_list;
+       struct list_head targets;
 };
 
 static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo,
@@ -391,10 +375,10 @@ static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo,
        return l;
 }
 
-static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev,
-                                               struct ppa_addr r)
+static inline struct ppa_addr generic_to_dev_addr(struct nvm_tgt_dev *tgt_dev,
+                                                 struct ppa_addr r)
 {
-       struct nvm_geo *geo = &dev->geo;
+       struct nvm_geo *geo = &tgt_dev->geo;
        struct ppa_addr l;
 
        l.ppa = ((u64)r.g.blk) << geo->ppaf.blk_offset;
@@ -407,10 +391,10 @@ static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev,
        return l;
 }
 
-static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev,
-                                               struct ppa_addr r)
+static inline struct ppa_addr dev_to_generic_addr(struct nvm_tgt_dev *tgt_dev,
+                                                 struct ppa_addr r)
 {
-       struct nvm_geo *geo = &dev->geo;
+       struct nvm_geo *geo = &tgt_dev->geo;
        struct ppa_addr l;
 
        l.ppa = 0;
@@ -452,15 +436,12 @@ static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2)
                                        (ppa1.g.blk == ppa2.g.blk));
 }
 
-static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg)
-{
-       return dev->lptbl[slc_pg];
-}
-
 typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *);
 typedef sector_t (nvm_tgt_capacity_fn)(void *);
 typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *);
 typedef void (nvm_tgt_exit_fn)(void *);
+typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *);
+typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *);
 
 struct nvm_tgt_type {
        const char *name;
@@ -469,12 +450,15 @@ struct nvm_tgt_type {
        /* target entry points */
        nvm_tgt_make_rq_fn *make_rq;
        nvm_tgt_capacity_fn *capacity;
-       nvm_end_io_fn *end_io;
 
        /* module-specific init/teardown */
        nvm_tgt_init_fn *init;
        nvm_tgt_exit_fn *exit;
 
+       /* sysfs */
+       nvm_tgt_sysfs_init_fn *sysfs_init;
+       nvm_tgt_sysfs_exit_fn *sysfs_exit;
+
        /* For internal use */
        struct list_head list;
 };
@@ -487,103 +471,29 @@ extern void nvm_unregister_tgt_type(struct nvm_tgt_type *);
 extern void *nvm_dev_dma_alloc(struct nvm_dev *, gfp_t, dma_addr_t *);
 extern void nvm_dev_dma_free(struct nvm_dev *, void *, dma_addr_t);
 
-typedef int (nvmm_register_fn)(struct nvm_dev *);
-typedef void (nvmm_unregister_fn)(struct nvm_dev *);
-
-typedef int (nvmm_create_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_create *);
-typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *);
-typedef int (nvmm_submit_io_fn)(struct nvm_tgt_dev *, struct nvm_rq *);
-typedef int (nvmm_erase_blk_fn)(struct nvm_tgt_dev *, struct ppa_addr *, int);
-typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t);
-typedef void (nvmm_put_area_fn)(struct nvm_dev *, sector_t);
-typedef struct ppa_addr (nvmm_trans_ppa_fn)(struct nvm_tgt_dev *,
-                                           struct ppa_addr, int);
-typedef void (nvmm_part_to_tgt_fn)(struct nvm_dev *, sector_t*, int);
-
-enum {
-       TRANS_TGT_TO_DEV =      0x0,
-       TRANS_DEV_TO_TGT =      0x1,
-};
-
-struct nvmm_type {
-       const char *name;
-       unsigned int version[3];
-
-       nvmm_register_fn *register_mgr;
-       nvmm_unregister_fn *unregister_mgr;
-
-       nvmm_create_tgt_fn *create_tgt;
-       nvmm_remove_tgt_fn *remove_tgt;
-
-       nvmm_submit_io_fn *submit_io;
-       nvmm_erase_blk_fn *erase_blk;
-
-       nvmm_get_area_fn *get_area;
-       nvmm_put_area_fn *put_area;
-
-       nvmm_trans_ppa_fn *trans_ppa;
-       nvmm_part_to_tgt_fn *part_to_tgt;
-
-       struct list_head list;
-};
-
-extern int nvm_register_mgr(struct nvmm_type *);
-extern void nvm_unregister_mgr(struct nvmm_type *);
-
 extern struct nvm_dev *nvm_alloc_dev(int);
 extern int nvm_register(struct nvm_dev *);
 extern void nvm_unregister(struct nvm_dev *);
 
-extern int nvm_set_bb_tbl(struct nvm_dev *, struct ppa_addr *, int, int);
 extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *,
                              int, int);
 extern int nvm_max_phys_sects(struct nvm_tgt_dev *);
 extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *);
-extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *);
-extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *);
 extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *,
                                        const struct ppa_addr *, int, int);
 extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *);
-extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int, int);
 extern int nvm_erase_blk(struct nvm_tgt_dev *, struct ppa_addr *, int);
 extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *,
                           void *);
 extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t);
 extern void nvm_put_area(struct nvm_tgt_dev *, sector_t);
-extern void nvm_end_io(struct nvm_rq *, int);
-extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int,
-                                                               void *, int);
-extern int nvm_submit_ppa_list(struct nvm_dev *, struct ppa_addr *, int, int,
-                                                       int, void *, int);
+extern void nvm_end_io(struct nvm_rq *);
 extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int);
-extern int nvm_get_bb_tbl(struct nvm_dev *, struct ppa_addr, u8 *);
 extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *);
 
-/* sysblk.c */
-#define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */
-
-/* system block on disk representation */
-struct nvm_system_block {
-       __be32                  magic;          /* magic signature */
-       __be32                  seqnr;          /* sequence number */
-       __be32                  erase_cnt;      /* erase count */
-       __be16                  version;        /* version number */
-       u8                      mmtype[NVM_MMTYPE_LEN]; /* media manager name */
-       __be64                  fs_ppa;         /* PPA for media manager
-                                                * superblock */
-};
-
-extern int nvm_get_sysblock(struct nvm_dev *, struct nvm_sb_info *);
-extern int nvm_update_sysblock(struct nvm_dev *, struct nvm_sb_info *);
-extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *);
-
 extern int nvm_dev_factory(struct nvm_dev *, int flags);
 
-#define nvm_for_each_lun_ppa(geo, ppa, chid, lunid)                    \
-       for ((chid) = 0, (ppa).ppa = 0; (chid) < (geo)->nr_chnls;       \
-                                       (chid)++, (ppa).g.ch = (chid))  \
-               for ((lunid) = 0; (lunid) < (geo)->luns_per_chnl;       \
-                                       (lunid)++, (ppa).g.lun = (lunid))
+extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int);
 
 #else /* CONFIG_NVM */
 struct nvm_dev_ops;
index 3d1c6f1..00eac86 100644 (file)
@@ -244,6 +244,7 @@ enum {
        NVME_CTRL_ONCS_DSM                      = 1 << 2,
        NVME_CTRL_ONCS_WRITE_ZEROES             = 1 << 3,
        NVME_CTRL_VWC_PRESENT                   = 1 << 0,
+       NVME_CTRL_OACS_SEC_SUPP                 = 1 << 0,
 };
 
 struct nvme_lbaf {
index f017fd6..d4e0a20 100644 (file)
@@ -259,6 +259,26 @@ static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr)
 unsigned int sbitmap_weight(const struct sbitmap *sb);
 
 /**
+ * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file.
+ * @sb: Bitmap to show.
+ * @m: struct seq_file to write to.
+ *
+ * This is intended for debugging. The format may change at any time.
+ */
+void sbitmap_show(struct sbitmap *sb, struct seq_file *m);
+
+/**
+ * sbitmap_bitmap_show() - Write a hex dump of a &struct sbitmap to a &struct
+ * seq_file.
+ * @sb: Bitmap to show.
+ * @m: struct seq_file to write to.
+ *
+ * This is intended for debugging. The output isn't guaranteed to be internally
+ * consistent.
+ */
+void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m);
+
+/**
  * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific
  * memory node.
  * @sbq: Bitmap queue to initialize.
@@ -370,4 +390,14 @@ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq,
  */
 void sbitmap_queue_wake_all(struct sbitmap_queue *sbq);
 
+/**
+ * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct
+ * seq_file.
+ * @sbq: Bitmap queue to show.
+ * @m: struct seq_file to write to.
+ *
+ * This is intended for debugging. The format may change at any time.
+ */
+void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m);
+
 #endif /* __LINUX_SCALE_BITMAP_H */
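
These show helpers are meant to be called from a seq_file ->show() callback,
which is how the new blk-mq debugfs code consumes them. A minimal sketch of
such a callback; the my_tags_show name and the use of m->private to carry the
bitmap queue are illustrative assumptions:

    #include <linux/sbitmap.h>
    #include <linux/seq_file.h>

    static int my_tags_show(struct seq_file *m, void *data)
    {
            struct sbitmap_queue *sbq = m->private;

            /* Debugging output only; the format may change. */
            sbitmap_queue_show(sbq, m);
            return 0;
    }
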
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
new file mode 100644 (file)
index 0000000..deee23d
--- /dev/null
@@ -0,0 +1,70 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Authors:
+ *    Rafael Antognolli <rafael.antognolli@intel.com>
+ *    Scott  Bauer      <scott.bauer@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef LINUX_OPAL_H
+#define LINUX_OPAL_H
+
+#include <uapi/linux/sed-opal.h>
+#include <linux/kernel.h>
+
+struct opal_dev;
+
+typedef int (sec_send_recv)(void *data, u16 spsp, u8 secp, void *buffer,
+               size_t len, bool send);
+
+#ifdef CONFIG_BLK_SED_OPAL
+bool opal_unlock_from_suspend(struct opal_dev *dev);
+struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv);
+int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr);
+
+static inline bool is_sed_ioctl(unsigned int cmd)
+{
+       switch (cmd) {
+       case IOC_OPAL_SAVE:
+       case IOC_OPAL_LOCK_UNLOCK:
+       case IOC_OPAL_TAKE_OWNERSHIP:
+       case IOC_OPAL_ACTIVATE_LSP:
+       case IOC_OPAL_SET_PW:
+       case IOC_OPAL_ACTIVATE_USR:
+       case IOC_OPAL_REVERT_TPR:
+       case IOC_OPAL_LR_SETUP:
+       case IOC_OPAL_ADD_USR_TO_LR:
+       case IOC_OPAL_ENABLE_DISABLE_MBR:
+       case IOC_OPAL_ERASE_LR:
+       case IOC_OPAL_SECURE_ERASE_LR:
+               return true;
+       }
+       return false;
+}
+#else
+static inline bool is_sed_ioctl(unsigned int cmd)
+{
+       return false;
+}
+
+static inline int sed_ioctl(struct opal_dev *dev, unsigned int cmd,
+                           void __user *ioctl_ptr)
+{
+       return 0;
+}
+static inline bool opal_unlock_from_suspend(struct opal_dev *dev)
+{
+       return false;
+}
+#define init_opal_dev(data, send_recv)         NULL
+#endif /* CONFIG_BLK_SED_OPAL */
+#endif /* LINUX_OPAL_H */
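
A low-level driver exposes this interface by handing recognized OPAL commands
from its block ioctl handler to the SED core, roughly the shape the NVMe
changes in this series take. A sketch, where foo_ctrl and its opal_dev member
stand in for whatever per-device state the driver keeps:

    #include <linux/blkdev.h>
    #include <linux/sed-opal.h>

    struct foo_ctrl {
            struct opal_dev *opal_dev;      /* from init_opal_dev() */
            /* ... other driver state ... */
    };

    static int foo_ioctl(struct block_device *bdev, fmode_t mode,
                         unsigned int cmd, unsigned long arg)
    {
            struct foo_ctrl *ctrl = bdev->bd_disk->private_data;

            /* Route all SED/OPAL ioctls to the common code. */
            if (is_sed_ioctl(cmd))
                    return sed_ioctl(ctrl->opal_dev, cmd,
                                     (void __user *)arg);

            return -ENOTTY;
    }
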
index 774a431..fd19f36 100644 (file)
@@ -122,6 +122,44 @@ struct nvm_ioctl_dev_factory {
        __u32 flags;
 };
 
+struct nvm_user_vio {
+       __u8 opcode;
+       __u8 flags;
+       __u16 control;
+       __u16 nppas;
+       __u16 rsvd;
+       __u64 metadata;
+       __u64 addr;
+       __u64 ppa_list;
+       __u32 metadata_len;
+       __u32 data_len;
+       __u64 status;
+       __u32 result;
+       __u32 rsvd3[3];
+};
+
+struct nvm_passthru_vio {
+       __u8 opcode;
+       __u8 flags;
+       __u8 rsvd[2];
+       __u32 nsid;
+       __u32 cdw2;
+       __u32 cdw3;
+       __u64 metadata;
+       __u64 addr;
+       __u32 metadata_len;
+       __u32 data_len;
+       __u64 ppa_list;
+       __u16 nppas;
+       __u16 control;
+       __u32 cdw13;
+       __u32 cdw14;
+       __u32 cdw15;
+       __u64 status;
+       __u32 result;
+       __u32 timeout_ms;
+};
+
 /* The ioctl type, 'L', 0x20 - 0x2F documented in ioctl-number.txt */
 enum {
        /* top level cmds */
@@ -137,6 +175,11 @@ enum {
 
        /* Factory reset device */
        NVM_DEV_FACTORY_CMD,
+
+       /* Vector user I/O */
+       NVM_DEV_VIO_ADMIN_CMD = 0x41,
+       NVM_DEV_VIO_CMD = 0x42,
+       NVM_DEV_VIO_USER_CMD = 0x43,
 };
 
 #define NVM_IOCTL 'L' /* 0x4c */
@@ -154,6 +197,13 @@ enum {
 #define NVM_DEV_FACTORY                _IOW(NVM_IOCTL, NVM_DEV_FACTORY_CMD, \
                                                struct nvm_ioctl_dev_factory)
 
+#define NVME_NVM_IOCTL_IO_VIO          _IOWR(NVM_IOCTL, NVM_DEV_VIO_USER_CMD, \
+                                               struct nvm_passthru_vio)
+#define NVME_NVM_IOCTL_ADMIN_VIO       _IOWR(NVM_IOCTL, NVM_DEV_VIO_ADMIN_CMD,\
+                                               struct nvm_passthru_vio)
+#define NVME_NVM_IOCTL_SUBMIT_VIO      _IOWR(NVM_IOCTL, NVM_DEV_VIO_CMD,\
+                                               struct nvm_user_vio)
+
 #define NVM_VERSION_MAJOR      1
 #define NVM_VERSION_MINOR      0
 #define NVM_VERSION_PATCHLEVEL 0
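
The two structures above are what the new vector I/O ioctls consume. A heavily
hedged userspace sketch of the call shape only; the opcode value, the exact
meaning of nppas, and the layout of ppa_list are driver/device specific and
are treated here as assumptions rather than guarantees of this header:

    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/lightnvm.h>

    static int submit_vio(int fd, uint8_t opcode, uint64_t ppa_list,
                          uint16_t nppas, void *buf, uint32_t len)
    {
            struct nvm_user_vio vio;

            memset(&vio, 0, sizeof(vio));
            vio.opcode = opcode;                    /* device-specific opcode */
            vio.nppas = nppas;                      /* assumed zero-based count */
            vio.ppa_list = ppa_list;                /* PPA(s); encoding is device defined */
            vio.addr = (uint64_t)(uintptr_t)buf;
            vio.data_len = len;

            return ioctl(fd, NVME_NVM_IOCTL_SUBMIT_VIO, &vio);
    }
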
diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h
new file mode 100644 (file)
index 0000000..c72e073
--- /dev/null
@@ -0,0 +1,119 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Authors:
+ *    Rafael Antognolli <rafael.antognolli@intel.com>
+ *    Scott  Bauer      <scott.bauer@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _UAPI_SED_OPAL_H
+#define _UAPI_SED_OPAL_H
+
+#include <linux/types.h>
+
+#define OPAL_KEY_MAX 256
+#define OPAL_MAX_LRS 9
+
+enum opal_mbr {
+       OPAL_MBR_ENABLE = 0x0,
+       OPAL_MBR_DISABLE = 0x01,
+};
+
+enum opal_user {
+       OPAL_ADMIN1 = 0x0,
+       OPAL_USER1 = 0x01,
+       OPAL_USER2 = 0x02,
+       OPAL_USER3 = 0x03,
+       OPAL_USER4 = 0x04,
+       OPAL_USER5 = 0x05,
+       OPAL_USER6 = 0x06,
+       OPAL_USER7 = 0x07,
+       OPAL_USER8 = 0x08,
+       OPAL_USER9 = 0x09,
+};
+
+enum opal_lock_state {
+       OPAL_RO = 0x01, /* 0001 */
+       OPAL_RW = 0x02, /* 0010 */
+       OPAL_LK = 0x04, /* 0100 */
+};
+
+struct opal_key {
+       __u8 lr;
+       __u8 key_len;
+       __u8 __align[6];
+       __u8 key[OPAL_KEY_MAX];
+};
+
+struct opal_lr_act {
+       struct opal_key key;
+       __u32 sum;
+       __u8    num_lrs;
+       __u8 lr[OPAL_MAX_LRS];
+       __u8 align[2]; /* Align to 8 byte boundary */
+};
+
+struct opal_session_info {
+       __u32 sum;
+       __u32 who;
+       struct opal_key opal_key;
+};
+
+struct opal_user_lr_setup {
+       __u64 range_start;
+       __u64 range_length;
+       __u32 RLE; /* Read Lock enabled */
+       __u32 WLE; /* Write Lock Enabled */
+       struct opal_session_info session;
+};
+
+struct opal_lock_unlock {
+       struct opal_session_info session;
+       __u32 l_state;
+       __u8 __align[4];
+};
+
+struct opal_new_pw {
+       struct opal_session_info session;
+
+       /* When we're not operating in single user mode (sum), the
+        * first time passwords are set they must be set via the ADMIN
+        * authority. Once the passwords have been changed, they can be
+        * set via the User authorities.
+        * Because of this restriction we need to know about two
+        * different users: the one in 'session', which we use to start
+        * the session, and new_user_pw as the user whose password we
+        * are changing.
+        */
+       struct opal_session_info new_user_pw;
+};
+
+struct opal_mbr_data {
+       struct opal_key key;
+       __u8 enable_disable;
+       __u8 __align[7];
+};
+
+#define IOC_OPAL_SAVE              _IOW('p', 220, struct opal_lock_unlock)
+#define IOC_OPAL_LOCK_UNLOCK       _IOW('p', 221, struct opal_lock_unlock)
+#define IOC_OPAL_TAKE_OWNERSHIP            _IOW('p', 222, struct opal_key)
+#define IOC_OPAL_ACTIVATE_LSP       _IOW('p', 223, struct opal_lr_act)
+#define IOC_OPAL_SET_PW             _IOW('p', 224, struct opal_new_pw)
+#define IOC_OPAL_ACTIVATE_USR       _IOW('p', 225, struct opal_session_info)
+#define IOC_OPAL_REVERT_TPR         _IOW('p', 226, struct opal_key)
+#define IOC_OPAL_LR_SETUP           _IOW('p', 227, struct opal_user_lr_setup)
+#define IOC_OPAL_ADD_USR_TO_LR      _IOW('p', 228, struct opal_lock_unlock)
+#define IOC_OPAL_ENABLE_DISABLE_MBR _IOW('p', 229, struct opal_mbr_data)
+#define IOC_OPAL_ERASE_LR           _IOW('p', 230, struct opal_session_info)
+#define IOC_OPAL_SECURE_ERASE_LR    _IOW('p', 231, struct opal_session_info)
+
+#endif /* _UAPI_SED_OPAL_H */
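
From userspace these ioctls are issued against the block device node using the
structures above. A hedged sketch that asks for locking range 0 to be unlocked
read/write with the Admin1 credential; the device path handling, password
handling, and error handling are placeholders:

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/sed-opal.h>

    static int unlock_lr0(const char *dev, const char *pw)
    {
            struct opal_lock_unlock lk;
            int fd, ret;

            fd = open(dev, O_RDWR);
            if (fd < 0)
                    return -1;

            memset(&lk, 0, sizeof(lk));
            lk.session.who = OPAL_ADMIN1;
            lk.session.opal_key.lr = 0;
            lk.session.opal_key.key_len = strlen(pw);  /* must fit in OPAL_KEY_MAX */
            memcpy(lk.session.opal_key.key, pw, lk.session.opal_key.key_len);
            lk.l_state = OPAL_RW;

            ret = ioctl(fd, IOC_OPAL_LOCK_UNLOCK, &lk);
            close(fd);
            return ret;
    }
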
index 2cecf05..55e11c4 100644 (file)
@@ -17,6 +17,7 @@
 
 #include <linux/random.h>
 #include <linux/sbitmap.h>
+#include <linux/seq_file.h>
 
 int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
                      gfp_t flags, int node)
@@ -180,6 +181,62 @@ unsigned int sbitmap_weight(const struct sbitmap *sb)
 }
 EXPORT_SYMBOL_GPL(sbitmap_weight);
 
+void sbitmap_show(struct sbitmap *sb, struct seq_file *m)
+{
+       seq_printf(m, "depth=%u\n", sb->depth);
+       seq_printf(m, "busy=%u\n", sbitmap_weight(sb));
+       seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift);
+       seq_printf(m, "map_nr=%u\n", sb->map_nr);
+}
+EXPORT_SYMBOL_GPL(sbitmap_show);
+
+static inline void emit_byte(struct seq_file *m, unsigned int offset, u8 byte)
+{
+       if ((offset & 0xf) == 0) {
+               if (offset != 0)
+                       seq_putc(m, '\n');
+               seq_printf(m, "%08x:", offset);
+       }
+       if ((offset & 0x1) == 0)
+               seq_putc(m, ' ');
+       seq_printf(m, "%02x", byte);
+}
+
+void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m)
+{
+       u8 byte = 0;
+       unsigned int byte_bits = 0;
+       unsigned int offset = 0;
+       int i;
+
+       for (i = 0; i < sb->map_nr; i++) {
+               unsigned long word = READ_ONCE(sb->map[i].word);
+               unsigned int word_bits = READ_ONCE(sb->map[i].depth);
+
+               while (word_bits > 0) {
+                       unsigned int bits = min(8 - byte_bits, word_bits);
+
+                       byte |= (word & (BIT(bits) - 1)) << byte_bits;
+                       byte_bits += bits;
+                       if (byte_bits == 8) {
+                               emit_byte(m, offset, byte);
+                               byte = 0;
+                               byte_bits = 0;
+                               offset++;
+                       }
+                       word >>= bits;
+                       word_bits -= bits;
+               }
+       }
+       if (byte_bits) {
+               emit_byte(m, offset, byte);
+               offset++;
+       }
+       if (offset)
+               seq_putc(m, '\n');
+}
+EXPORT_SYMBOL_GPL(sbitmap_bitmap_show);
+
 static unsigned int sbq_calc_wake_batch(unsigned int depth)
 {
        unsigned int wake_batch;
@@ -239,7 +296,19 @@ EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
 
 void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
 {
-       sbq->wake_batch = sbq_calc_wake_batch(depth);
+       unsigned int wake_batch = sbq_calc_wake_batch(depth);
+       int i;
+
+       if (sbq->wake_batch != wake_batch) {
+               WRITE_ONCE(sbq->wake_batch, wake_batch);
+               /*
+                * Pairs with the memory barrier in sbq_wake_up() to ensure that
+                * the batch size is updated before the wait counts.
+                */
+               smp_mb__before_atomic();
+               for (i = 0; i < SBQ_WAIT_QUEUES; i++)
+                       atomic_set(&sbq->ws[i].wait_cnt, 1);
+       }
        sbitmap_resize(&sbq->sb, depth);
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_resize);
@@ -297,20 +366,39 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
 static void sbq_wake_up(struct sbitmap_queue *sbq)
 {
        struct sbq_wait_state *ws;
+       unsigned int wake_batch;
        int wait_cnt;
 
-       /* Ensure that the wait list checks occur after clear_bit(). */
-       smp_mb();
+       /*
+        * Pairs with the memory barrier in set_current_state() to ensure the
+        * proper ordering of clear_bit()/waitqueue_active() in the waker and
+        * test_and_set_bit()/prepare_to_wait()/finish_wait() in the waiter. See
+        * the comment on waitqueue_active(). This is __after_atomic because we
+        * just did clear_bit() in the caller.
+        */
+       smp_mb__after_atomic();
 
        ws = sbq_wake_ptr(sbq);
        if (!ws)
                return;
 
        wait_cnt = atomic_dec_return(&ws->wait_cnt);
-       if (unlikely(wait_cnt < 0))
-               wait_cnt = atomic_inc_return(&ws->wait_cnt);
-       if (wait_cnt == 0) {
-               atomic_add(sbq->wake_batch, &ws->wait_cnt);
+       if (wait_cnt <= 0) {
+               wake_batch = READ_ONCE(sbq->wake_batch);
+               /*
+                * Pairs with the memory barrier in sbitmap_queue_resize() to
+                * ensure that we see the batch size update before the wait
+                * count is reset.
+                */
+               smp_mb__before_atomic();
+               /*
+                * If there are concurrent callers to sbq_wake_up(), the last
+                * one to decrement the wait count below zero will bump it back
+                * up. If there is a concurrent resize, the count reset will
+                * either cause the cmpxchg to fail or overwrite the new value
+                * right after the cmpxchg.
+                */
+               atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wait_cnt + wake_batch);
                sbq_index_atomic_inc(&sbq->wake_index);
                wake_up(&ws->wait);
        }
@@ -331,7 +419,8 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq)
        int i, wake_index;
 
        /*
-        * Make sure all changes prior to this are visible from other CPUs.
+        * Pairs with the memory barrier in set_current_state() like in
+        * sbq_wake_up().
         */
        smp_mb();
        wake_index = atomic_read(&sbq->wake_index);
@@ -345,3 +434,37 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq)
        }
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all);
+
+void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
+{
+       bool first;
+       int i;
+
+       sbitmap_show(&sbq->sb, m);
+
+       seq_puts(m, "alloc_hint={");
+       first = true;
+       for_each_possible_cpu(i) {
+               if (!first)
+                       seq_puts(m, ", ");
+               first = false;
+               seq_printf(m, "%u", *per_cpu_ptr(sbq->alloc_hint, i));
+       }
+       seq_puts(m, "}\n");
+
+       seq_printf(m, "wake_batch=%u\n", sbq->wake_batch);
+       seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index));
+
+       seq_puts(m, "ws={\n");
+       for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
+               struct sbq_wait_state *ws = &sbq->ws[i];
+
+               seq_printf(m, "\t{.wait_cnt=%d, .wait=%s},\n",
+                          atomic_read(&ws->wait_cnt),
+                          waitqueue_active(&ws->wait) ? "active" : "inactive");
+       }
+       seq_puts(m, "}\n");
+
+       seq_printf(m, "round_robin=%d\n", sbq->round_robin);
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_show);