NVMe: Metadata format support

author Keith Busch <keith.busch@intel.com>

Thu, 19 Feb 2015 20:39:03 +0000 (13:39 -0700)

committer Keith Busch <keith.busch@intel.com>

Thu, 19 Feb 2015 23:15:35 +0000 (16:15 -0700)
author Keith Busch <keith.busch@intel.com>
Thu, 19 Feb 2015 20:39:03 +0000 (13:39 -0700)
committer Keith Busch <keith.busch@intel.com>
Thu, 19 Feb 2015 23:15:35 +0000 (16:15 -0700)
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c

index cbdfbbf983927e85a4a83d94d20047f2fadf6357..3ffa57a932ea47d96c49663eea80d153f46414b1 100644 (file)
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -37,6 +37,7 @@
  #include <linux/ptrace.h>
  #include <linux/sched.h>
  #include <linux/slab.h>
+#include <linux/t10-pi.h>
  #include <linux/types.h>
  #include <scsi/sg.h>
  #include <asm-generic/io-64-nonatomic-lo-hi.h>
@@ -482,6 +483,62 @@ static int nvme_error_status(u16 status)
         }
  }
  
+static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
+{
+       if (pi->ref_tag == cpu_to_be32(v))      /* still the virtual seed? */
+               pi->ref_tag = cpu_to_be32(p);   /* then retag with physical */
+}
+
+static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
+{
+       if (pi->ref_tag == cpu_to_be32(p))      /* still the physical tag? */
+               pi->ref_tag = cpu_to_be32(v);   /* then restore virtual seed */
+}
+
+/**
+ * nvme_dif_remap - remaps ref tags to bip seed and physical lba
+ * @req: request whose first bio carries the integrity payload to rewrite
+ * @dif_swap: nvme_dif_prep (write submission) or nvme_dif_complete (read done)
+ *
+ * The virtual start sector (the bip seed) is what the block layer submitted;
+ * partitioning, MD/DM cloning, etc. may move the request to another physical
+ * LBA. Ref tags are remapped to the physical LBA on writes and back on reads.
+ * Type 0 and 3 do not have a ref tag, so no remapping required.
+ */
+static void nvme_dif_remap(struct request *req,
+                       void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
+{
+       struct nvme_ns *ns = req->rq_disk->private_data;
+       struct bio_integrity_payload *bip;
+       struct t10_pi_tuple *pi;
+       void *p, *pmap;
+       u32 i, nlb, ts, phys, virt;
+
+       if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
+               return;
+
+       bip = bio_integrity(req->bio);
+       if (!bip)
+               return;
+
+       pmap = kmap_atomic(bip->bip_vec->bv_page); /* keep the mapping base */
+       if (!pmap)
+               return;
+
+       p = pmap + bip->bip_vec->bv_offset;     /* cursor over the tuples */
+       virt = bip_get_seed(bip);
+       phys = nvme_block_nr(ns, blk_rq_pos(req));
+       nlb = (blk_rq_bytes(req) >> ns->lba_shift);
+       ts = ns->disk->integrity->tuple_size;
+
+       for (i = 0; i < nlb; i++, virt++, phys++) {
+               pi = (struct t10_pi_tuple *)p;
+               dif_swap(phys, virt, pi);
+               p += ts;
+       }
+       kunmap_atomic(pmap);                    /* unmap what kmap returned */
+}
+
  static void req_completion(struct nvme_queue *nvmeq, void *ctx,
                                                 struct nvme_completion *cqe)
  {
@@ -512,9 +569,16 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
                         "completing aborted command with status:%04x\n",
                         status);
  
-       if (iod->nents)
+       if (iod->nents) {
                 dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents,
                         rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+               if (blk_integrity_rq(req)) {
+                       if (!rq_data_dir(req))
+                               nvme_dif_remap(req, nvme_dif_complete);
+                       dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->meta_sg, 1,
+                               rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+               }
+       }
         nvme_free_iod(nvmeq->dev, iod);
  
         blk_mq_complete_request(req);
@@ -670,6 +734,24 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
         cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
         cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
         cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+
+       if (blk_integrity_rq(req)) {
+               cmnd->rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg));
+               switch (ns->pi_type) {
+               case NVME_NS_DPS_PI_TYPE3:
+                       control |= NVME_RW_PRINFO_PRCHK_GUARD;
+                       break;
+               case NVME_NS_DPS_PI_TYPE1:
+               case NVME_NS_DPS_PI_TYPE2:
+                       control |= NVME_RW_PRINFO_PRCHK_GUARD |
+                                       NVME_RW_PRINFO_PRCHK_REF;
+                       cmnd->rw.reftag = cpu_to_le32(
+                                       nvme_block_nr(ns, blk_rq_pos(req)));
+                       break;
+               }
+       } else if (ns->ms)
+               control |= NVME_RW_PRINFO_PRACT;
+
         cmnd->rw.control = cpu_to_le16(control);
         cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
  
@@ -690,6 +772,19 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
         struct nvme_iod *iod;
         enum dma_data_direction dma_dir;
  
+       /*
+        * If formatted with metadata, require the block layer provide a buffer
+        * unless this namespace is formatted such that the metadata can be
+        * stripped/generated by the controller with PRACT=1.
+        */
+       if (ns->ms && !blk_integrity_rq(req)) {
+               if (!(ns->pi_type && ns->ms == 8)) {
+                       req->errors = -EFAULT;
+                       blk_mq_complete_request(req);
+                       return BLK_MQ_RQ_QUEUE_OK;
+               }
+       }
+
         iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
         if (!iod)
                 return BLK_MQ_RQ_QUEUE_BUSY;
@@ -725,6 +820,21 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
                                         iod->nents, dma_dir);
                         goto retry_cmd;
                 }
+               if (blk_integrity_rq(req)) {
+                       if (blk_rq_count_integrity_sg(req->q, req->bio) != 1)
+                               goto error_cmd;
+
+                       sg_init_table(iod->meta_sg, 1);
+                       if (blk_rq_map_integrity_sg(
+                                       req->q, req->bio, iod->meta_sg) != 1)
+                               goto error_cmd;
+
+                       if (rq_data_dir(req))
+                               nvme_dif_remap(req, nvme_dif_prep);
+
+                       if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir))
+                               goto error_cmd;
+               }
         }
  
         nvme_set_info(cmd, iod, req_completion);
@@ -1875,13 +1985,61 @@ static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo)
         return 0;
  }
  
+static void nvme_config_discard(struct nvme_ns *ns)
+{
+       struct queue_limits *lim = &ns->queue->limits;
+       lim->discard_zeroes_data = 0;
+       lim->discard_alignment = queue_logical_block_size(ns->queue);
+       lim->discard_granularity = lim->discard_alignment;
+       lim->max_discard_sectors = 0xffffffff;
+       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
+}
+
+static int nvme_noop_verify(struct blk_integrity_iter *iter)
+{
+       return 0;       /* no-op: iter is ignored, always returns 0 */
+}
+
+static int nvme_noop_generate(struct blk_integrity_iter *iter)
+{
+       return 0;       /* no-op: iter is ignored, always returns 0 */
+}
+
+static struct blk_integrity nvme_meta_noop = { /* template when no PI type is set */
+       .name                   = "NVME_META_NOOP",
+       .generate_fn            = nvme_noop_generate,
+       .verify_fn              = nvme_noop_verify,
+};
+
+static void nvme_init_integrity(struct nvme_ns *ns)
+{
+       struct blk_integrity integrity;        /* local copy of the chosen template */
+
+       switch (ns->pi_type) {
+       case NVME_NS_DPS_PI_TYPE3:
+               integrity = t10_pi_type3_crc;
+               break;
+       case NVME_NS_DPS_PI_TYPE1:
+       case NVME_NS_DPS_PI_TYPE2:
+               integrity = t10_pi_type1_crc;  /* types 1 and 2 share one template */
+               break;
+       default:
+               integrity = nvme_meta_noop;    /* any other pi_type value */
+               break;
+       }
+       integrity.tuple_size = ns->ms;         /* ms comes from the selected LBA format */
+       blk_integrity_register(ns->disk, &integrity);
+       blk_queue_max_integrity_segments(ns->queue, 1); /* one meta_sg per iod */
+}
+
  static int nvme_revalidate_disk(struct gendisk *disk)
  {
         struct nvme_ns *ns = disk->private_data;
         struct nvme_dev *dev = ns->dev;
         struct nvme_id_ns *id;
         dma_addr_t dma_addr;
-       int lbaf;
+       int lbaf, pi_type, old_ms;
+       unsigned short bs;
  
         id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
                                                                 GFP_KERNEL);
@@ -1890,16 +2048,50 @@ static int nvme_revalidate_disk(struct gendisk *disk)
                                                                 __func__);
                 return 0;
         }
+       if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) {
+               dev_warn(&dev->pci_dev->dev,
+                       "identify failed ns:%d, setting capacity to 0\n",
+                       ns->ns_id);
+               memset(id, 0, sizeof(*id));
+       }
  
-       if (nvme_identify(dev, ns->ns_id, 0, dma_addr))
-               goto free;
-
-       lbaf = id->flbas & 0xf;
+       old_ms = ns->ms;
+       lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
         ns->lba_shift = id->lbaf[lbaf].ds;
+       ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+
+       /*
+        * If identify namespace failed, use default 512 byte block size so
+        * block layer can use before failing read/write for 0 capacity.
+        */
+       if (ns->lba_shift == 0)
+               ns->lba_shift = 9;
+       bs = 1 << ns->lba_shift;
+
+       /* XXX: PI implementation requires metadata equal t10 pi tuple size */
+       pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
+                                       id->dps & NVME_NS_DPS_PI_MASK : 0;
+
+       if (disk->integrity && (ns->pi_type != pi_type || ns->ms != old_ms ||
+                               bs != queue_logical_block_size(disk->queue) ||
+                               (ns->ms && id->flbas & NVME_NS_FLBAS_META_EXT)))
+               blk_integrity_unregister(disk);
+
+       ns->pi_type = pi_type;
+       blk_queue_logical_block_size(ns->queue, bs);
+
+       if (ns->ms && !disk->integrity && (disk->flags & GENHD_FL_UP) &&
+                               !(id->flbas & NVME_NS_FLBAS_META_EXT))
+               nvme_init_integrity(ns);
+
+       if (id->ncap == 0 || (ns->ms && !disk->integrity))
+               set_capacity(disk, 0);
+       else
+               set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
+
+       if (dev->oncs & NVME_CTRL_ONCS_DSM)
+               nvme_config_discard(ns);
  
-       blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
-       set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
- free:
         dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr);
         return 0;
  }
@@ -1956,30 +2148,16 @@ static int nvme_kthread(void *data)
         return 0;
  }
  
-static void nvme_config_discard(struct nvme_ns *ns)
-{
-       u32 logical_block_size = queue_logical_block_size(ns->queue);
-       ns->queue->limits.discard_zeroes_data = 0;
-       ns->queue->limits.discard_alignment = logical_block_size;
-       ns->queue->limits.discard_granularity = logical_block_size;
-       ns->queue->limits.max_discard_sectors = 0xffffffff;
-       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
-}
-
-static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
-                       struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
+static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
  {
         struct nvme_ns *ns;
         struct gendisk *disk;
         int node = dev_to_node(&dev->pci_dev->dev);
-       int lbaf;
-
-       if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
-               return NULL;
  
         ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
         if (!ns)
-               return NULL;
+               return;
+
         ns->queue = blk_mq_init_queue(&dev->tagset);
         if (IS_ERR(ns->queue))
                 goto out_free_ns;
@@ -1995,9 +2173,9 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
  
         ns->ns_id = nsid;
         ns->disk = disk;
-       lbaf = id->flbas & 0xf;
-       ns->lba_shift = id->lbaf[lbaf].ds;
-       ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+       ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
+       list_add_tail(&ns->list, &dev->namespaces);
+
         blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
         if (dev->max_hw_sectors)
                 blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
@@ -2014,18 +2192,23 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
         disk->driverfs_dev = &dev->pci_dev->dev;
         disk->flags = GENHD_FL_EXT_DEVT;
         sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
-       set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
-
-       if (dev->oncs & NVME_CTRL_ONCS_DSM)
-               nvme_config_discard(ns);
-
-       return ns;
  
+       /*
+        * Initialize capacity to 0 until we establish the namespace format and
+        * setup integrity extensions if necessary. The revalidate_disk after
+        * add_disk allows the driver to register with integrity if the format
+        * requires it.
+        */
+       set_capacity(disk, 0);
+       nvme_revalidate_disk(ns->disk);
+       add_disk(ns->disk);
+       if (ns->ms)
+               revalidate_disk(ns->disk);
+       return;
   out_free_queue:
         blk_cleanup_queue(ns->queue);
   out_free_ns:
         kfree(ns);
-       return NULL;
  }
  
  static void nvme_create_io_queues(struct nvme_dev *dev)
@@ -2150,22 +2333,20 @@ static int nvme_dev_add(struct nvme_dev *dev)
         struct pci_dev *pdev = dev->pci_dev;
         int res;
         unsigned nn, i;
-       struct nvme_ns *ns;
         struct nvme_id_ctrl *ctrl;
-       struct nvme_id_ns *id_ns;
         void *mem;
         dma_addr_t dma_addr;
         int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
  
-       mem = dma_alloc_coherent(&pdev->dev, 8192, &dma_addr, GFP_KERNEL);
+       mem = dma_alloc_coherent(&pdev->dev, 4096, &dma_addr, GFP_KERNEL);
         if (!mem)
                 return -ENOMEM;
  
         res = nvme_identify(dev, 0, 1, dma_addr);
         if (res) {
                 dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res);
-               res = -EIO;
-               goto out;
+               dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr);
+               return -EIO;
         }
  
         ctrl = mem;
@@ -2191,6 +2372,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
                 } else
                         dev->max_hw_sectors = max_hw_sectors;
         }
+       dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr);
  
         dev->tagset.ops = &nvme_mq_ops;
         dev->tagset.nr_hw_queues = dev->online_queues - 1;
@@ -2203,33 +2385,12 @@ static int nvme_dev_add(struct nvme_dev *dev)
         dev->tagset.driver_data = dev;
  
         if (blk_mq_alloc_tag_set(&dev->tagset))
-               goto out;
-
-       id_ns = mem;
-       for (i = 1; i <= nn; i++) {
-               res = nvme_identify(dev, i, 0, dma_addr);
-               if (res)
-                       continue;
-
-               if (id_ns->ncap == 0)
-                       continue;
-
-               res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i,
-                                                       dma_addr + 4096, NULL);
-               if (res)
-                       memset(mem + 4096, 0, 4096);
+               return 0;
  
-               ns = nvme_alloc_ns(dev, i, mem, mem + 4096);
-               if (ns)
-                       list_add_tail(&ns->list, &dev->namespaces);
-       }
-       list_for_each_entry(ns, &dev->namespaces, list)
-               add_disk(ns->disk);
-       res = 0;
+       for (i = 1; i <= nn; i++)
+               nvme_alloc_ns(dev, i);
  
- out:
-       dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr);
-       return res;
+       return 0;
  }
  
  static int nvme_dev_map(struct nvme_dev *dev)
@@ -2528,8 +2689,11 @@ static void nvme_dev_remove(struct nvme_dev *dev)
         struct nvme_ns *ns;
  
         list_for_each_entry(ns, &dev->namespaces, list) {
-               if (ns->disk->flags & GENHD_FL_UP)
+               if (ns->disk->flags & GENHD_FL_UP) {
+                       if (ns->disk->integrity)
+                               blk_integrity_unregister(ns->disk);
                         del_gendisk(ns->disk);
+               }
                 if (!blk_queue_dying(ns->queue)) {
                         blk_mq_abort_requeue_list(ns->queue);
                         blk_cleanup_queue(ns->queue);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h

index 19a5d4b23209302bc55cce74c12f69cbd91f260d..cca264db24785bc9648ac70c0dbd1dd43dfe36cf 100644 (file)
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -121,6 +121,7 @@ struct nvme_ns {
         unsigned ns_id;
         int lba_shift;
         int ms;
+       int pi_type;
         u64 mode_select_num_blocks;
         u32 mode_select_block_len;
  };
@@ -138,6 +139,7 @@ struct nvme_iod {
         int nents;              /* Used in scatterlist */
         int length;             /* Of data, in bytes */
         dma_addr_t first_dma;
+       struct scatterlist meta_sg[1]; /* metadata requires single contiguous buffer */
         struct scatterlist sg[0];
  };
  
diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h

index 26386cf3db444cbca7bc9e7138f8b0e01c0669b6..406bfc95652c49d6fdcd85a15b24a63ca3d9fed6 100644 (file)
--- a/include/uapi/linux/nvme.h
+++ b/include/uapi/linux/nvme.h
@@ -124,10 +124,22 @@ struct nvme_id_ns {
  
  enum {
         NVME_NS_FEAT_THIN       = 1 << 0,
+       NVME_NS_FLBAS_LBA_MASK  = 0xf,
+       NVME_NS_FLBAS_META_EXT  = 0x10,
         NVME_LBAF_RP_BEST       = 0,
         NVME_LBAF_RP_BETTER     = 1,
         NVME_LBAF_RP_GOOD       = 2,
         NVME_LBAF_RP_DEGRADED   = 3,
+       NVME_NS_DPC_PI_LAST     = 1 << 4,
+       NVME_NS_DPC_PI_FIRST    = 1 << 3,
+       NVME_NS_DPC_PI_TYPE3    = 1 << 2,
+       NVME_NS_DPC_PI_TYPE2    = 1 << 1,
+       NVME_NS_DPC_PI_TYPE1    = 1 << 0,
+       NVME_NS_DPS_PI_FIRST    = 1 << 3,
+       NVME_NS_DPS_PI_MASK     = 0x7,
+       NVME_NS_DPS_PI_TYPE1    = 1,
+       NVME_NS_DPS_PI_TYPE2    = 2,
+       NVME_NS_DPS_PI_TYPE3    = 3,
  };
  
  struct nvme_smart_log {
@@ -261,6 +273,10 @@ enum {
         NVME_RW_DSM_LATENCY_LOW         = 3 << 4,
         NVME_RW_DSM_SEQ_REQ             = 1 << 6,
         NVME_RW_DSM_COMPRESSED          = 1 << 7,
+       NVME_RW_PRINFO_PRCHK_REF        = 1 << 10,
+       NVME_RW_PRINFO_PRCHK_APP        = 1 << 11,
+       NVME_RW_PRINFO_PRCHK_GUARD      = 1 << 12,
+       NVME_RW_PRINFO_PRACT            = 1 << 13,
  };
  
  struct nvme_dsm_cmd {
author	Keith Busch <keith.busch@intel.com>
	Thu, 19 Feb 2015 20:39:03 +0000 (13:39 -0700)
committer	Keith Busch <keith.busch@intel.com>
	Thu, 19 Feb 2015 23:15:35 +0000 (16:15 -0700)
drivers/block/nvme-core.c		patch \| blob \| history
include/linux/nvme.h		patch \| blob \| history
include/uapi/linux/nvme.h		patch \| blob \| history