unsigned inline_vecs;
if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) {
- bip = kmalloc(sizeof(struct bio_integrity_payload) +
- sizeof(struct bio_vec) * nr_vecs, gfp_mask);
+ bip = kmalloc(struct_size(bip, bip_inline_vecs, nr_vecs), gfp_mask);
inline_vecs = nr_vecs;
} else {
bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask);
host = ata_host_alloc(dev, 1);
if (!host) {
- dev_err(dev, "ata_host_alloc failed\n");
ret = -ENOMEM;
goto err_pm_put;
}
pos++;
/*
+ * It takes 3.4 seconds to remove 80GiB ramdisk.
+ * So, we need cond_resched to avoid stalling the CPU.
+ */
+ cond_resched();
+
+ /*
* This assumes radix_tree_gang_lookup always returns as
* many pages as possible. If the radix-tree code changes,
* so will this have to.
int num_ch;
};
+static void nvm_free(struct kref *ref);
+
static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name)
{
struct nvm_target *tgt;
struct nvm_target *t;
struct nvm_tgt_dev *tgt_dev;
void *targetdata;
+ unsigned int mdts;
int ret;
switch (create->conf.type) {
tdisk->private_data = targetdata;
tqueue->queuedata = targetdata;
- blk_queue_max_hw_sectors(tqueue,
- (dev->geo.csecs >> 9) * NVM_MAX_VLBA);
+ mdts = (dev->geo.csecs >> 9) * NVM_MAX_VLBA;
+ if (dev->geo.mdts) {
+ mdts = min_t(u32, dev->geo.mdts,
+ (dev->geo.csecs >> 9) * NVM_MAX_VLBA);
+ }
+ blk_queue_max_hw_sectors(tqueue, mdts);
set_capacity(tdisk, tt->capacity(targetdata));
add_disk(tdisk);
/**
* nvm_remove_tgt - Removes a target from the media manager
- * @dev: device
* @remove: ioctl structure with target name to remove.
*
* Returns:
* 1: on not found
* <0: on error
*/
-static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove)
+static int nvm_remove_tgt(struct nvm_ioctl_remove *remove)
{
struct nvm_target *t;
+ struct nvm_dev *dev;
- mutex_lock(&dev->mlock);
- t = nvm_find_target(dev, remove->tgtname);
- if (!t) {
+ down_read(&nvm_lock);
+ list_for_each_entry(dev, &nvm_devices, devices) {
+ mutex_lock(&dev->mlock);
+ t = nvm_find_target(dev, remove->tgtname);
+ if (t) {
+ mutex_unlock(&dev->mlock);
+ break;
+ }
mutex_unlock(&dev->mlock);
- return 1;
}
+ up_read(&nvm_lock);
+
+ if (!t)
+ return 1;
+
__nvm_remove_target(t, true);
- mutex_unlock(&dev->mlock);
+ kref_put(&dev->ref, nvm_free);
return 0;
}
return ret;
}
-static void nvm_free(struct nvm_dev *dev)
+static void nvm_free(struct kref *ref)
{
- if (!dev)
- return;
+ struct nvm_dev *dev = container_of(ref, struct nvm_dev, ref);
if (dev->dma_pool)
dev->ops->destroy_dma_pool(dev->dma_pool);
- nvm_unregister_map(dev);
+ if (dev->rmap)
+ nvm_unregister_map(dev);
+
kfree(dev->lun_map);
kfree(dev);
}
struct nvm_dev *nvm_alloc_dev(int node)
{
- return kzalloc_node(sizeof(struct nvm_dev), GFP_KERNEL, node);
+ struct nvm_dev *dev;
+
+ dev = kzalloc_node(sizeof(struct nvm_dev), GFP_KERNEL, node);
+ if (dev)
+ kref_init(&dev->ref);
+
+ return dev;
}
EXPORT_SYMBOL(nvm_alloc_dev);
{
int ret, exp_pool_size;
- if (!dev->q || !dev->ops)
+ if (!dev->q || !dev->ops) {
+ kref_put(&dev->ref, nvm_free);
return -EINVAL;
+ }
ret = nvm_init(dev);
- if (ret)
+ if (ret) {
+ kref_put(&dev->ref, nvm_free);
return ret;
+ }
exp_pool_size = max_t(int, PAGE_SIZE,
(NVM_MAX_VLBA * (sizeof(u64) + dev->geo.sos)));
exp_pool_size);
if (!dev->dma_pool) {
pr_err("nvm: could not create dma pool\n");
- nvm_free(dev);
+ kref_put(&dev->ref, nvm_free);
return -ENOMEM;
}
if (t->dev->parent != dev)
continue;
__nvm_remove_target(t, false);
+ kref_put(&dev->ref, nvm_free);
}
mutex_unlock(&dev->mlock);
list_del(&dev->devices);
up_write(&nvm_lock);
- nvm_free(dev);
+ kref_put(&dev->ref, nvm_free);
}
EXPORT_SYMBOL(nvm_unregister);
static int __nvm_configure_create(struct nvm_ioctl_create *create)
{
struct nvm_dev *dev;
+ int ret;
down_write(&nvm_lock);
dev = nvm_find_nvm_dev(create->dev);
return -EINVAL;
}
- return nvm_create_tgt(dev, create);
+ kref_get(&dev->ref);
+ ret = nvm_create_tgt(dev, create);
+ if (ret)
+ kref_put(&dev->ref, nvm_free);
+
+ return ret;
}
static long nvm_ioctl_info(struct file *file, void __user *arg)
static long nvm_ioctl_dev_remove(struct file *file, void __user *arg)
{
struct nvm_ioctl_remove remove;
- struct nvm_dev *dev;
- int ret = 0;
if (copy_from_user(&remove, arg, sizeof(struct nvm_ioctl_remove)))
return -EFAULT;
return -EINVAL;
}
- list_for_each_entry(dev, &nvm_devices, devices) {
- ret = nvm_remove_tgt(dev, &remove);
- if (!ret)
- break;
- }
-
- return ret;
+ return nvm_remove_tgt(&remove);
}
/* kept for compatibility reasons */
#include "pblk.h"
-int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
+void pblk_write_to_cache(struct pblk *pblk, struct bio *bio,
+ unsigned long flags)
{
struct request_queue *q = pblk->dev->q;
struct pblk_w_ctx w_ctx;
goto retry;
case NVM_IO_ERR:
pblk_pipeline_stop(pblk);
+ bio_io_error(bio);
goto out;
}
out:
generic_end_io_acct(q, REQ_OP_WRITE, &pblk->disk->part0, start_time);
pblk_write_should_kick(pblk);
- return ret;
+
+ if (ret == NVM_IO_DONE)
+ bio_endio(bio);
}
/*
int pblk_submit_io_sync_sem(struct pblk *pblk, struct nvm_rq *rqd)
{
- struct ppa_addr *ppa_list;
+ struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
int ret;
- ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;
-
pblk_down_chunk(pblk, ppa_list[0]);
ret = pblk_submit_io_sync(pblk, rqd);
pblk_up_chunk(pblk, ppa_list[0]);
struct nvm_tgt_dev *dev = pblk->dev;
struct pblk_line_meta *lm = &pblk->lm;
struct bio *bio;
+ struct ppa_addr *ppa_list;
struct nvm_rq rqd;
u64 paddr = pblk_line_smeta_start(pblk, line);
int i, ret;
rqd.opcode = NVM_OP_PREAD;
rqd.nr_ppas = lm->smeta_sec;
rqd.is_seq = 1;
+ ppa_list = nvm_rq_to_ppa_list(&rqd);
for (i = 0; i < lm->smeta_sec; i++, paddr++)
- rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
+ ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
ret = pblk_submit_io_sync(pblk, &rqd);
if (ret) {
atomic_dec(&pblk->inflight_io);
- if (rqd.error)
+ if (rqd.error && rqd.error != NVM_RSP_WARN_HIGHECC) {
pblk_log_read_err(pblk, &rqd);
+ ret = -EIO;
+ }
clear_rqd:
pblk_free_rqd_meta(pblk, &rqd);
struct nvm_tgt_dev *dev = pblk->dev;
struct pblk_line_meta *lm = &pblk->lm;
struct bio *bio;
+ struct ppa_addr *ppa_list;
struct nvm_rq rqd;
__le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf);
__le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
rqd.opcode = NVM_OP_PWRITE;
rqd.nr_ppas = lm->smeta_sec;
rqd.is_seq = 1;
+ ppa_list = nvm_rq_to_ppa_list(&rqd);
for (i = 0; i < lm->smeta_sec; i++, paddr++) {
struct pblk_sec_meta *meta = pblk_get_meta(pblk,
rqd.meta_list, i);
- rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
+ ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
meta->lba = lba_list[paddr] = addr_empty;
}
struct nvm_geo *geo = &dev->geo;
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line_meta *lm = &pblk->lm;
- void *ppa_list, *meta_list;
+ void *ppa_list_buf, *meta_list;
struct bio *bio;
+ struct ppa_addr *ppa_list;
struct nvm_rq rqd;
u64 paddr = line->emeta_ssec;
dma_addr_t dma_ppa_list, dma_meta_list;
if (!meta_list)
return -ENOMEM;
- ppa_list = meta_list + pblk_dma_meta_size(pblk);
+ ppa_list_buf = meta_list + pblk_dma_meta_size(pblk);
dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk);
next_rq:
rqd.bio = bio;
rqd.meta_list = meta_list;
- rqd.ppa_list = ppa_list;
+ rqd.ppa_list = ppa_list_buf;
rqd.dma_meta_list = dma_meta_list;
rqd.dma_ppa_list = dma_ppa_list;
rqd.opcode = NVM_OP_PREAD;
rqd.nr_ppas = rq_ppas;
+ ppa_list = nvm_rq_to_ppa_list(&rqd);
for (i = 0; i < rqd.nr_ppas; ) {
struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, line_id);
}
for (j = 0; j < min; j++, i++, paddr++)
- rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line_id);
+ ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line_id);
}
ret = pblk_submit_io_sync(pblk, &rqd);
atomic_dec(&pblk->inflight_io);
- if (rqd.error)
+ if (rqd.error && rqd.error != NVM_RSP_WARN_HIGHECC) {
pblk_log_read_err(pblk, &rqd);
+ ret = -EIO;
+ goto free_rqd_dma;
+ }
emeta_buf += rq_len;
left_ppas -= rq_ppas;
off = bit * geo->ws_opt;
bitmap_set(line->map_bitmap, off, lm->smeta_sec);
line->sec_in_line -= lm->smeta_sec;
- line->smeta_ssec = off;
line->cur_sec = off + lm->smeta_sec;
if (init && pblk_line_smeta_write(pblk, line, off)) {
void pblk_rq_to_line_put(struct pblk *pblk, struct nvm_rq *rqd)
{
- struct ppa_addr *ppa_list;
+ struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
int i;
- ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;
-
for (i = 0; i < rqd->nr_ppas; i++)
pblk_ppa_to_line_put(pblk, ppa_list[i]);
}
spin_lock(&line->lock);
WARN_ON(line->state != PBLK_LINESTATE_GC);
+ if (line->w_err_gc->has_gc_err) {
+ spin_unlock(&line->lock);
+ pblk_err(pblk, "line %d had errors during GC\n", line->id);
+ pblk_put_line_back(pblk, line);
+ line->w_err_gc->has_gc_err = 0;
+ return;
+ }
+
line->state = PBLK_LINESTATE_FREE;
trace_pblk_line_state(pblk_disk_name(pblk), line->id,
line->state);
struct ppa_addr ppa_l2p;
/* logic error: lba out-of-bounds. Ignore update */
- if (!(lba < pblk->rl.nr_secs)) {
+ if (!(lba < pblk->capacity)) {
WARN(1, "pblk: corrupted L2P map request\n");
return;
}
#endif
/* logic error: lba out-of-bounds. Ignore update */
- if (!(lba < pblk->rl.nr_secs)) {
+ if (!(lba < pblk->capacity)) {
WARN(1, "pblk: corrupted L2P map request\n");
return 0;
}
}
/* logic error: lba out-of-bounds. Ignore update */
- if (!(lba < pblk->rl.nr_secs)) {
+ if (!(lba < pblk->capacity)) {
WARN(1, "pblk: corrupted L2P map request\n");
return;
}
spin_unlock(&pblk->trans_lock);
}
-void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
- sector_t blba, int nr_secs)
+int pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
+ sector_t blba, int nr_secs, bool *from_cache)
{
int i;
if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) {
struct pblk_line *line = pblk_ppa_to_line(pblk, ppa);
+ if (i > 0 && *from_cache)
+ break;
+ *from_cache = false;
+
kref_get(&line->ref);
+ } else {
+ if (i > 0 && !*from_cache)
+ break;
+ *from_cache = true;
}
}
spin_unlock(&pblk->trans_lock);
+ return i;
}
void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
lba = lba_list[i];
if (lba != ADDR_EMPTY) {
/* logic error: lba out-of-bounds. Ignore update */
- if (!(lba < pblk->rl.nr_secs)) {
+ if (!(lba < pblk->capacity)) {
WARN(1, "pblk: corrupted L2P map request\n");
continue;
}
wake_up_process(gc->gc_writer_ts);
}
-static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line)
+void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line)
{
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct list_head *move_list;
+ spin_lock(&l_mg->gc_lock);
spin_lock(&line->lock);
WARN_ON(line->state != PBLK_LINESTATE_GC);
line->state = PBLK_LINESTATE_CLOSED;
trace_pblk_line_state(pblk_disk_name(pblk), line->id,
line->state);
+
+ /* We need to reset gc_group in order to ensure that
+ * pblk_line_gc_list will return proper move_list
+ * since right now current line is not on any of the
+ * gc lists.
+ */
+ line->gc_group = PBLK_LINEGC_NONE;
move_list = pblk_line_gc_list(pblk, line);
spin_unlock(&line->lock);
-
- if (move_list) {
- spin_lock(&l_mg->gc_lock);
- list_add_tail(&line->list, move_list);
- spin_unlock(&l_mg->gc_lock);
- }
+ list_add_tail(&line->list, move_list);
+ spin_unlock(&l_mg->gc_lock);
}
static void pblk_gc_line_ws(struct work_struct *work)
struct pblk_line_ws *gc_rq_ws = container_of(work,
struct pblk_line_ws, ws);
struct pblk *pblk = gc_rq_ws->pblk;
- struct nvm_tgt_dev *dev = pblk->dev;
- struct nvm_geo *geo = &dev->geo;
struct pblk_gc *gc = &pblk->gc;
struct pblk_line *line = gc_rq_ws->line;
struct pblk_gc_rq *gc_rq = gc_rq_ws->priv;
up(&gc->gc_sem);
- gc_rq->data = vmalloc(array_size(gc_rq->nr_secs, geo->csecs));
- if (!gc_rq->data) {
- pblk_err(pblk, "could not GC line:%d (%d/%d)\n",
- line->id, *line->vsc, gc_rq->nr_secs);
- goto out;
- }
-
/* Read from GC victim block */
ret = pblk_submit_read_gc(pblk, gc_rq);
if (ret) {
- pblk_err(pblk, "failed GC read in line:%d (err:%d)\n",
- line->id, ret);
+ line->w_err_gc->has_gc_err = 1;
goto out;
}
struct pblk_line *line = line_ws->line;
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line_meta *lm = &pblk->lm;
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
struct pblk_gc *gc = &pblk->gc;
struct pblk_line_ws *gc_rq_ws;
struct pblk_gc_rq *gc_rq;
gc_rq->nr_secs = nr_secs;
gc_rq->line = line;
+ gc_rq->data = vmalloc(array_size(gc_rq->nr_secs, geo->csecs));
+ if (!gc_rq->data)
+ goto fail_free_gc_rq;
+
gc_rq_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL);
if (!gc_rq_ws)
- goto fail_free_gc_rq;
+ goto fail_free_gc_data;
gc_rq_ws->pblk = pblk;
gc_rq_ws->line = line;
return;
+fail_free_gc_data:
+ vfree(gc_rq->data);
fail_free_gc_rq:
kfree(gc_rq);
fail_free_lba_list:
fail_free_ws:
kfree(line_ws);
+ /* The line goes back to the closed state, so we must not release the
+ * additional line reference here; that is done only on the transition
+ * from GC to free line state.
+ */
pblk_put_line_back(pblk, line);
- kref_put(&line->ref, pblk_line_put);
atomic_dec(&gc->read_inflight_gc);
pblk_err(pblk, "failed to GC line %d\n", line->id);
pblk_gc_kick(pblk);
- if (pblk_gc_line(pblk, line))
+ if (pblk_gc_line(pblk, line)) {
pblk_err(pblk, "failed to GC line %d\n", line->id);
+ /* rollback */
+ spin_lock(&gc->r_lock);
+ list_add_tail(&line->list, &gc->r_list);
+ spin_unlock(&gc->r_lock);
+ }
return 0;
}
struct bio_set pblk_bio_set;
-static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
- struct bio *bio)
-{
- int ret;
-
- /* Read requests must be <= 256kb due to NVMe's 64 bit completion bitmap
- * constraint. Writes can be of arbitrary size.
- */
- if (bio_data_dir(bio) == READ) {
- blk_queue_split(q, &bio);
- ret = pblk_submit_read(pblk, bio);
- if (ret == NVM_IO_DONE && bio_flagged(bio, BIO_CLONED))
- bio_put(bio);
-
- return ret;
- }
-
- /* Prevent deadlock in the case of a modest LUN configuration and large
- * user I/Os. Unless stalled, the rate limiter leaves at least 256KB
- * available for user I/O.
- */
- if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl))
- blk_queue_split(q, &bio);
-
- return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER);
-}
-
static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio)
{
struct pblk *pblk = q->queuedata;
}
}
- switch (pblk_rw_io(q, pblk, bio)) {
- case NVM_IO_ERR:
- bio_io_error(bio);
- break;
- case NVM_IO_DONE:
- bio_endio(bio);
- break;
+ /* Read requests must be <= 256kb due to NVMe's 64 bit completion bitmap
+ * constraint. Writes can be of arbitrary size.
+ */
+ if (bio_data_dir(bio) == READ) {
+ blk_queue_split(q, &bio);
+ pblk_submit_read(pblk, bio);
+ } else {
+ /* Prevent deadlock in the case of a modest LUN configuration
+ * and large user I/Os. Unless stalled, the rate limiter
+ * leaves at least 256KB available for user I/O.
+ */
+ if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl))
+ blk_queue_split(q, &bio);
+
+ pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER);
}
return BLK_QC_T_NONE;
if (pblk->addrf_len < 32)
entry_size = 4;
- return entry_size * pblk->rl.nr_secs;
+ return entry_size * pblk->capacity;
}
#ifdef CONFIG_NVM_PBLK_DEBUG
int ret = 0;
map_size = pblk_trans_map_size(pblk);
- pblk->trans_map = vmalloc(map_size);
- if (!pblk->trans_map)
+ pblk->trans_map = __vmalloc(map_size, GFP_KERNEL | __GFP_NOWARN
+ | __GFP_RETRY_MAYFAIL | __GFP_HIGHMEM,
+ PAGE_KERNEL);
+ if (!pblk->trans_map) {
+ pblk_err(pblk, "failed to allocate L2P (need %zu of memory)\n",
+ map_size);
return -ENOMEM;
+ }
pblk_ppa_set_empty(&ppa);
- for (i = 0; i < pblk->rl.nr_secs; i++)
+ for (i = 0; i < pblk->capacity; i++)
pblk_trans_map_set(pblk, i, ppa);
ret = pblk_l2p_recover(pblk, factory_init);
* on user capacity consider only provisioned blocks
*/
pblk->rl.total_blocks = nr_free_chks;
- pblk->rl.nr_secs = nr_free_chks * geo->clba;
/* Consider sectors used for metadata */
sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
pblk_info(pblk, "luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
geo->all_luns, pblk->l_mg.nr_lines,
- (unsigned long long)pblk->rl.nr_secs,
+ (unsigned long long)pblk->capacity,
pblk->rwb.nr_entries);
wake_up_process(pblk->writer_ts);
*erase_ppa = ppa_list[i];
erase_ppa->a.blk = e_line->id;
+ erase_ppa->a.reserved = 0;
spin_unlock(&e_line->lock);
* be directed to disk.
*/
int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
- struct ppa_addr ppa, int bio_iter, bool advanced_bio)
+ struct ppa_addr ppa)
{
struct pblk *pblk = container_of(rb, struct pblk, rwb);
struct pblk_rb_entry *entry;
ret = 0;
goto out;
}
-
- /* Only advance the bio if it hasn't been advanced already. If advanced,
- * this bio is at least a partial bio (i.e., it has partially been
- * filled with data from the cache). If part of the data resides on the
- * media, we will read later on
- */
- if (unlikely(!advanced_bio))
- bio_advance(bio, bio_iter * PBLK_EXPOSED_PAGE_SIZE);
-
data = bio_data(bio);
memcpy(data, entry->data, rb->seg_size);
}
out:
- spin_unlock(&rb->w_lock);
spin_unlock_irq(&rb->s_lock);
+ spin_unlock(&rb->w_lock);
return ret;
}
* issued.
*/
static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
- sector_t lba, struct ppa_addr ppa,
- int bio_iter, bool advanced_bio)
+ sector_t lba, struct ppa_addr ppa)
{
#ifdef CONFIG_NVM_PBLK_DEBUG
/* Callers must ensure that the ppa points to a cache address */
BUG_ON(!pblk_addr_in_cache(ppa));
#endif
- return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba, ppa,
- bio_iter, advanced_bio);
+ return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba, ppa);
}
-static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
+static int pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
struct bio *bio, sector_t blba,
- unsigned long *read_bitmap)
+ bool *from_cache)
{
void *meta_list = rqd->meta_list;
- struct ppa_addr ppas[NVM_MAX_VLBA];
- int nr_secs = rqd->nr_ppas;
- bool advanced_bio = false;
- int i, j = 0;
+ int nr_secs, i;
- pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs);
+retry:
+ nr_secs = pblk_lookup_l2p_seq(pblk, rqd->ppa_list, blba, rqd->nr_ppas,
+ from_cache);
+
+ if (!*from_cache)
+ goto end;
for (i = 0; i < nr_secs; i++) {
- struct ppa_addr p = ppas[i];
struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
sector_t lba = blba + i;
-retry:
- if (pblk_ppa_empty(p)) {
+ if (pblk_ppa_empty(rqd->ppa_list[i])) {
__le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
- WARN_ON(test_and_set_bit(i, read_bitmap));
meta->lba = addr_empty;
-
- if (unlikely(!advanced_bio)) {
- bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE);
- advanced_bio = true;
+ } else if (pblk_addr_in_cache(rqd->ppa_list[i])) {
+ /*
+ * Try to read from write buffer. The address is later
+ * checked on the write buffer to prevent retrieving
+ * overwritten data.
+ */
+ if (!pblk_read_from_cache(pblk, bio, lba,
+ rqd->ppa_list[i])) {
+ if (i == 0) {
+ /*
+ * We have not called bio_advance()
+ * yet, so we can just retry.
+ */
+ goto retry;
+ } else {
+ /*
+ * We already called bio_advance()
+ * so we cannot retry and we need
+ * to quit this function in order
+ * to allow the caller to handle
+ * the bio splitting at the
+ * current sector position.
+ */
+ nr_secs = i;
+ goto end;
+ }
}
-
- goto next;
- }
-
- /* Try to read from write buffer. The address is later checked
- * on the write buffer to prevent retrieving overwritten data.
- */
- if (pblk_addr_in_cache(p)) {
- if (!pblk_read_from_cache(pblk, bio, lba, p, i,
- advanced_bio)) {
- pblk_lookup_l2p_seq(pblk, &p, lba, 1);
- goto retry;
- }
- WARN_ON(test_and_set_bit(i, read_bitmap));
meta->lba = cpu_to_le64(lba);
- advanced_bio = true;
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_inc(&pblk->cache_reads);
#endif
- } else {
- /* Read from media non-cached sectors */
- rqd->ppa_list[j++] = p;
}
-
-next:
- if (advanced_bio)
- bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
+ bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
}
+end:
if (pblk_io_aligned(pblk, nr_secs))
rqd->is_seq = 1;
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(nr_secs, &pblk->inflight_reads);
#endif
+
+ return nr_secs;
}
WARN_ONCE(j != rqd->nr_ppas, "pblk: corrupted random request\n");
}
-static void pblk_end_user_read(struct bio *bio)
+static void pblk_end_user_read(struct bio *bio, int error)
{
-#ifdef CONFIG_NVM_PBLK_DEBUG
- WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n");
-#endif
- bio_endio(bio);
+ if (error && error != NVM_RSP_WARN_HIGHECC)
+ bio_io_error(bio);
+ else
+ bio_endio(bio);
}
static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
pblk_log_read_err(pblk, rqd);
pblk_read_check_seq(pblk, rqd, r_ctx->lba);
-
- if (int_bio)
- bio_put(int_bio);
+ bio_put(int_bio);
if (put_line)
pblk_rq_to_line_put(pblk, rqd);
struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
struct bio *bio = (struct bio *)r_ctx->private;
- pblk_end_user_read(bio);
+ pblk_end_user_read(bio, rqd->error);
__pblk_end_io_read(pblk, rqd, true);
}
-static void pblk_end_partial_read(struct nvm_rq *rqd)
-{
- struct pblk *pblk = rqd->private;
- struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
- struct pblk_pr_ctx *pr_ctx = r_ctx->private;
- struct pblk_sec_meta *meta;
- struct bio *new_bio = rqd->bio;
- struct bio *bio = pr_ctx->orig_bio;
- void *meta_list = rqd->meta_list;
- unsigned long *read_bitmap = pr_ctx->bitmap;
- struct bvec_iter orig_iter = BVEC_ITER_ALL_INIT;
- struct bvec_iter new_iter = BVEC_ITER_ALL_INIT;
- int nr_secs = pr_ctx->orig_nr_secs;
- int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
- void *src_p, *dst_p;
- int bit, i;
-
- if (unlikely(nr_holes == 1)) {
- struct ppa_addr ppa;
-
- ppa = rqd->ppa_addr;
- rqd->ppa_list = pr_ctx->ppa_ptr;
- rqd->dma_ppa_list = pr_ctx->dma_ppa_list;
- rqd->ppa_list[0] = ppa;
- }
-
- for (i = 0; i < nr_secs; i++) {
- meta = pblk_get_meta(pblk, meta_list, i);
- pr_ctx->lba_list_media[i] = le64_to_cpu(meta->lba);
- meta->lba = cpu_to_le64(pr_ctx->lba_list_mem[i]);
- }
-
- /* Fill the holes in the original bio */
- i = 0;
- for (bit = 0; bit < nr_secs; bit++) {
- if (!test_bit(bit, read_bitmap)) {
- struct bio_vec dst_bv, src_bv;
- struct pblk_line *line;
-
- line = pblk_ppa_to_line(pblk, rqd->ppa_list[i]);
- kref_put(&line->ref, pblk_line_put);
-
- meta = pblk_get_meta(pblk, meta_list, bit);
- meta->lba = cpu_to_le64(pr_ctx->lba_list_media[i]);
-
- dst_bv = bio_iter_iovec(bio, orig_iter);
- src_bv = bio_iter_iovec(new_bio, new_iter);
-
- src_p = kmap_atomic(src_bv.bv_page);
- dst_p = kmap_atomic(dst_bv.bv_page);
-
- memcpy(dst_p + dst_bv.bv_offset,
- src_p + src_bv.bv_offset,
- PBLK_EXPOSED_PAGE_SIZE);
-
- kunmap_atomic(src_p);
- kunmap_atomic(dst_p);
-
- flush_dcache_page(dst_bv.bv_page);
- mempool_free(src_bv.bv_page, &pblk->page_bio_pool);
-
- bio_advance_iter(new_bio, &new_iter,
- PBLK_EXPOSED_PAGE_SIZE);
- i++;
- }
- bio_advance_iter(bio, &orig_iter, PBLK_EXPOSED_PAGE_SIZE);
- }
-
- bio_put(new_bio);
- kfree(pr_ctx);
-
- /* restore original request */
- rqd->bio = NULL;
- rqd->nr_ppas = nr_secs;
-
- bio_endio(bio);
- __pblk_end_io_read(pblk, rqd, false);
-}
-
-static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
- unsigned int bio_init_idx,
- unsigned long *read_bitmap,
- int nr_holes)
-{
- void *meta_list = rqd->meta_list;
- struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
- struct pblk_pr_ctx *pr_ctx;
- struct bio *new_bio, *bio = r_ctx->private;
- int nr_secs = rqd->nr_ppas;
- int i;
-
- new_bio = bio_alloc(GFP_KERNEL, nr_holes);
-
- if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
- goto fail_bio_put;
-
- if (nr_holes != new_bio->bi_vcnt) {
- WARN_ONCE(1, "pblk: malformed bio\n");
- goto fail_free_pages;
- }
-
- pr_ctx = kzalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL);
- if (!pr_ctx)
- goto fail_free_pages;
-
- for (i = 0; i < nr_secs; i++) {
- struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
-
- pr_ctx->lba_list_mem[i] = le64_to_cpu(meta->lba);
- }
-
- new_bio->bi_iter.bi_sector = 0; /* internal bio */
- bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
-
- rqd->bio = new_bio;
- rqd->nr_ppas = nr_holes;
-
- pr_ctx->orig_bio = bio;
- bitmap_copy(pr_ctx->bitmap, read_bitmap, NVM_MAX_VLBA);
- pr_ctx->bio_init_idx = bio_init_idx;
- pr_ctx->orig_nr_secs = nr_secs;
- r_ctx->private = pr_ctx;
-
- if (unlikely(nr_holes == 1)) {
- pr_ctx->ppa_ptr = rqd->ppa_list;
- pr_ctx->dma_ppa_list = rqd->dma_ppa_list;
- rqd->ppa_addr = rqd->ppa_list[0];
- }
- return 0;
-
-fail_free_pages:
- pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt);
-fail_bio_put:
- bio_put(new_bio);
-
- return -ENOMEM;
-}
-
-static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
- unsigned int bio_init_idx,
- unsigned long *read_bitmap, int nr_secs)
-{
- int nr_holes;
- int ret;
-
- nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
-
- if (pblk_setup_partial_read(pblk, rqd, bio_init_idx, read_bitmap,
- nr_holes))
- return NVM_IO_ERR;
-
- rqd->end_io = pblk_end_partial_read;
-
- ret = pblk_submit_io(pblk, rqd);
- if (ret) {
- bio_put(rqd->bio);
- pblk_err(pblk, "partial read IO submission failed\n");
- goto err;
- }
-
- return NVM_IO_OK;
-
-err:
- pblk_err(pblk, "failed to perform partial read\n");
-
- /* Free allocated pages in new bio */
- pblk_bio_free_pages(pblk, rqd->bio, 0, rqd->bio->bi_vcnt);
- __pblk_end_io_read(pblk, rqd, false);
- return NVM_IO_ERR;
-}
-
static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
- sector_t lba, unsigned long *read_bitmap)
+ sector_t lba, bool *from_cache)
{
struct pblk_sec_meta *meta = pblk_get_meta(pblk, rqd->meta_list, 0);
struct ppa_addr ppa;
- pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
+ pblk_lookup_l2p_seq(pblk, &ppa, lba, 1, from_cache);
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_inc(&pblk->inflight_reads);
if (pblk_ppa_empty(ppa)) {
__le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
- WARN_ON(test_and_set_bit(0, read_bitmap));
meta->lba = addr_empty;
return;
}
* write buffer to prevent retrieving overwritten data.
*/
if (pblk_addr_in_cache(ppa)) {
- if (!pblk_read_from_cache(pblk, bio, lba, ppa, 0, 1)) {
- pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
+ if (!pblk_read_from_cache(pblk, bio, lba, ppa)) {
+ pblk_lookup_l2p_seq(pblk, &ppa, lba, 1, from_cache);
goto retry;
}
- WARN_ON(test_and_set_bit(0, read_bitmap));
meta->lba = cpu_to_le64(lba);
#ifdef CONFIG_NVM_PBLK_DEBUG
}
}
-int pblk_submit_read(struct pblk *pblk, struct bio *bio)
+void pblk_submit_read(struct pblk *pblk, struct bio *bio)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct request_queue *q = dev->q;
sector_t blba = pblk_get_lba(bio);
unsigned int nr_secs = pblk_get_secs(bio);
+ bool from_cache;
struct pblk_g_ctx *r_ctx;
struct nvm_rq *rqd;
- unsigned int bio_init_idx;
- DECLARE_BITMAP(read_bitmap, NVM_MAX_VLBA);
- int ret = NVM_IO_ERR;
+ struct bio *int_bio, *split_bio;
generic_start_io_acct(q, REQ_OP_READ, bio_sectors(bio),
&pblk->disk->part0);
- bitmap_zero(read_bitmap, nr_secs);
-
rqd = pblk_alloc_rqd(pblk, PBLK_READ);
rqd->opcode = NVM_OP_PREAD;
rqd->nr_ppas = nr_secs;
- rqd->bio = NULL; /* cloned bio if needed */
rqd->private = pblk;
rqd->end_io = pblk_end_io_read;
r_ctx = nvm_rq_to_pdu(rqd);
r_ctx->start_time = jiffies;
r_ctx->lba = blba;
- r_ctx->private = bio; /* original bio */
- /* Save the index for this bio's start. This is needed in case
- * we need to fill a partial read.
- */
- bio_init_idx = pblk_get_bi_idx(bio);
+ if (pblk_alloc_rqd_meta(pblk, rqd)) {
+ bio_io_error(bio);
+ pblk_free_rqd(pblk, rqd, PBLK_READ);
+ return;
+ }
- if (pblk_alloc_rqd_meta(pblk, rqd))
- goto fail_rqd_free;
+ /* Clone read bio to deal internally with:
+ * -read errors when reading from drive
+ * -bio_advance() calls during cache reads
+ */
+ int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set);
if (nr_secs > 1)
- pblk_read_ppalist_rq(pblk, rqd, bio, blba, read_bitmap);
+ nr_secs = pblk_read_ppalist_rq(pblk, rqd, int_bio, blba,
+ &from_cache);
else
- pblk_read_rq(pblk, rqd, bio, blba, read_bitmap);
+ pblk_read_rq(pblk, rqd, int_bio, blba, &from_cache);
- if (bitmap_full(read_bitmap, nr_secs)) {
+split_retry:
+ r_ctx->private = bio; /* original bio */
+ rqd->bio = int_bio; /* internal bio */
+
+ if (from_cache && nr_secs == rqd->nr_ppas) {
+ /* All data was read from cache, we can complete the IO. */
+ pblk_end_user_read(bio, 0);
atomic_inc(&pblk->inflight_io);
__pblk_end_io_read(pblk, rqd, false);
- return NVM_IO_DONE;
- }
-
- /* All sectors are to be read from the device */
- if (bitmap_empty(read_bitmap, rqd->nr_ppas)) {
- struct bio *int_bio = NULL;
+ } else if (nr_secs != rqd->nr_ppas) {
+ /* The read bio request could be partially filled by the write
+ * buffer, but there are some holes that need to be read from
+ * the drive. In order to handle this, we will use block layer
+ * mechanism to split this request into smaller ones and chain
+ * them together.
+ */
+ split_bio = bio_split(bio, nr_secs * NR_PHY_IN_LOG, GFP_KERNEL,
+ &pblk_bio_set);
+ bio_chain(split_bio, bio);
+ generic_make_request(bio);
+
+ /* New bio contains first N sectors of the previous one, so
+ * we can continue to use existing rqd, but we need to shrink
+ * the number of PPAs in it. The new bio is also guaranteed to
+ * contain only data from the cache or only data from the drive,
+ * never a mix of them.
+ */
+ bio = split_bio;
+ rqd->nr_ppas = nr_secs;
+ if (rqd->nr_ppas == 1)
+ rqd->ppa_addr = rqd->ppa_list[0];
- /* Clone read bio to deal with read errors internally */
+ /* Recreate int_bio - existing might have some needed internal
+ * fields modified already.
+ */
+ bio_put(int_bio);
int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set);
- if (!int_bio) {
- pblk_err(pblk, "could not clone read bio\n");
- goto fail_end_io;
- }
-
- rqd->bio = int_bio;
-
- if (pblk_submit_io(pblk, rqd)) {
- pblk_err(pblk, "read IO submission failed\n");
- ret = NVM_IO_ERR;
- goto fail_end_io;
- }
-
- return NVM_IO_OK;
+ goto split_retry;
+ } else if (pblk_submit_io(pblk, rqd)) {
+ /* Submitting IO to drive failed, let's report an error */
+ rqd->error = -ENODEV;
+ pblk_end_io_read(rqd);
}
-
- /* The read bio request could be partially filled by the write buffer,
- * but there are some holes that need to be read from the drive.
- */
- ret = pblk_partial_read_bio(pblk, rqd, bio_init_idx, read_bitmap,
- nr_secs);
- if (ret)
- goto fail_meta_free;
-
- return NVM_IO_OK;
-
-fail_meta_free:
- nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
-fail_rqd_free:
- pblk_free_rqd(pblk, rqd, PBLK_READ);
- return ret;
-fail_end_io:
- __pblk_end_io_read(pblk, rqd, false);
- return ret;
}
static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
goto out;
/* logic error: lba out-of-bounds */
- if (lba >= pblk->rl.nr_secs) {
+ if (lba >= pblk->capacity) {
WARN(1, "pblk: read lba out of bounds\n");
goto out;
}
if (pblk_submit_io_sync(pblk, &rqd)) {
ret = -EIO;
- pblk_err(pblk, "GC read request failed\n");
goto err_free_bio;
}
static void pblk_update_line_wp(struct pblk *pblk, struct pblk_line *line,
u64 written_secs)
{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
int i;
for (i = 0; i < written_secs; i += pblk->min_write_pgs)
- pblk_alloc_page(pblk, line, pblk->min_write_pgs);
+ __pblk_alloc_page(pblk, line, pblk->min_write_pgs);
+
+ spin_lock(&l_mg->free_lock);
+ if (written_secs > line->left_msecs) {
+ /*
+ * We have all data sectors written
+ * and some emeta sectors written too.
+ */
+ line->left_msecs = 0;
+ } else {
+ /* We have only some data sectors written. */
+ line->left_msecs -= written_secs;
+ }
+ spin_unlock(&l_mg->free_lock);
}
static u64 pblk_sec_in_open_line(struct pblk *pblk, struct pblk_line *line)
struct pblk_pad_rq *pad_rq;
struct nvm_rq *rqd;
struct bio *bio;
+ struct ppa_addr *ppa_list;
void *data;
__le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf);
u64 w_ptr = line->cur_sec;
rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
if (rq_ppas < pblk->min_write_pgs) {
pblk_err(pblk, "corrupted pad line %d\n", line->id);
- goto fail_free_pad;
+ goto fail_complete;
}
rq_len = rq_ppas * geo->csecs;
PBLK_VMALLOC_META, GFP_KERNEL);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
- goto fail_free_pad;
+ goto fail_complete;
}
bio->bi_iter.bi_sector = 0; /* internal bio */
rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT);
ret = pblk_alloc_rqd_meta(pblk, rqd);
- if (ret)
- goto fail_free_rqd;
+ if (ret) {
+ pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
+ bio_put(bio);
+ goto fail_complete;
+ }
rqd->bio = bio;
rqd->opcode = NVM_OP_PWRITE;
rqd->end_io = pblk_end_io_recov;
rqd->private = pad_rq;
+ ppa_list = nvm_rq_to_ppa_list(rqd);
meta_list = rqd->meta_list;
for (i = 0; i < rqd->nr_ppas; ) {
lba_list[w_ptr] = addr_empty;
meta = pblk_get_meta(pblk, meta_list, i);
meta->lba = addr_empty;
- rqd->ppa_list[i] = dev_ppa;
+ ppa_list[i] = dev_ppa;
}
}
kref_get(&pad_rq->ref);
- pblk_down_chunk(pblk, rqd->ppa_list[0]);
+ pblk_down_chunk(pblk, ppa_list[0]);
ret = pblk_submit_io(pblk, rqd);
if (ret) {
pblk_err(pblk, "I/O submission failed: %d\n", ret);
- pblk_up_chunk(pblk, rqd->ppa_list[0]);
- goto fail_free_rqd;
+ pblk_up_chunk(pblk, ppa_list[0]);
+ kref_put(&pad_rq->ref, pblk_recov_complete);
+ pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
+ bio_put(bio);
+ goto fail_complete;
}
left_line_ppas -= rq_ppas;
if (left_ppas && left_line_ppas)
goto next_pad_rq;
+fail_complete:
kref_put(&pad_rq->ref, pblk_recov_complete);
-
- if (!wait_for_completion_io_timeout(&pad_rq->wait,
- msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
- pblk_err(pblk, "pad write timed out\n");
- ret = -ETIME;
- }
+ wait_for_completion(&pad_rq->wait);
if (!pblk_line_is_full(line))
pblk_err(pblk, "corrupted padded line: %d\n", line->id);
free_rq:
kfree(pad_rq);
return ret;
-
-fail_free_rqd:
- pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
- bio_put(bio);
-fail_free_pad:
- kfree(pad_rq);
- vfree(data);
- return ret;
}
static int pblk_pad_distance(struct pblk *pblk, struct pblk_line *line)
rqd->ppa_list = ppa_list;
rqd->dma_ppa_list = dma_ppa_list;
rqd->dma_meta_list = dma_meta_list;
+ ppa_list = nvm_rq_to_ppa_list(rqd);
if (pblk_io_aligned(pblk, rq_ppas))
rqd->is_seq = 1;
}
for (j = 0; j < pblk->min_write_pgs; j++, i++)
- rqd->ppa_list[i] =
+ ppa_list[i] =
addr_to_gen_ppa(pblk, paddr + j, line->id);
}
atomic_dec(&pblk->inflight_io);
/* If a read fails, do a best effort by padding the line and retrying */
- if (rqd->error) {
+ if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) {
int pad_distance, ret;
if (padded) {
lba_list[paddr++] = cpu_to_le64(lba);
- if (lba == ADDR_EMPTY || lba > pblk->rl.nr_secs)
+ if (lba == ADDR_EMPTY || lba >= pblk->capacity)
continue;
line->nr_valid_lbas++;
- pblk_update_map(pblk, lba, rqd->ppa_list[i]);
+ pblk_update_map(pblk, lba, ppa_list[i]);
}
left_ppas -= rq_ppas;
bppa = pblk->luns[smeta_blk].bppa;
chunk = &line->chks[pblk_ppa_to_pos(geo, bppa)];
- if (chunk->state & NVM_CHK_ST_FREE)
- return 0;
+ if (chunk->state & NVM_CHK_ST_CLOSED ||
+ (chunk->state & NVM_CHK_ST_OPEN
+ && chunk->wp >= lm->smeta_sec))
+ return 1;
- return 1;
+ return 0;
}
static bool pblk_line_is_open(struct pblk *pblk, struct pblk_line *line)
spin_unlock(&l_mg->free_lock);
} else {
spin_lock(&l_mg->free_lock);
+ l_mg->data_line = data_line;
/* Allocate next line for preparation */
l_mg->data_next = pblk_line_get(pblk);
if (l_mg->data_next) {
mempool_free(recovery, &pblk->rec_pool);
atomic_dec(&pblk->inflight_io);
+ pblk_write_kick(pblk);
}
#define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16)
-#define PBLK_COMMAND_TIMEOUT_MS 30000
-
/* Max 512 LUNs per device */
#define PBLK_MAX_LUNS_BITMAP (4)
u64 lba;
};
-/* partial read context */
-struct pblk_pr_ctx {
- struct bio *orig_bio;
- DECLARE_BITMAP(bitmap, NVM_MAX_VLBA);
- unsigned int orig_nr_secs;
- unsigned int bio_init_idx;
- void *ppa_ptr;
- dma_addr_t dma_ppa_list;
- u64 lba_list_mem[NVM_MAX_VLBA];
- u64 lba_list_media[NVM_MAX_VLBA];
-};
-
/* Pad context */
struct pblk_pad_rq {
struct pblk *pblk;
struct timer_list u_timer;
- unsigned long long nr_secs;
unsigned long total_blocks;
atomic_t free_blocks; /* Total number of free blocks (+ OP) */
struct pblk_w_err_gc {
int has_write_err;
+ int has_gc_err;
__le64 *lba_list;
};
int meta_line; /* Metadata line id */
int meta_distance; /* Distance between data and metadata */
- u64 smeta_ssec; /* Sector where smeta starts */
u64 emeta_ssec; /* Sector where emeta starts */
unsigned int sec_in_line; /* Number of usable secs in line */
unsigned int pos, unsigned int nr_entries,
unsigned int count);
int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
- struct ppa_addr ppa, int bio_iter, bool advanced_bio);
+ struct ppa_addr ppa);
unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries);
unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags);
struct pblk_line *gc_line, u64 paddr);
void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
u64 *lba_list, int nr_secs);
-void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
- sector_t blba, int nr_secs);
+int pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
+ sector_t blba, int nr_secs, bool *from_cache);
void *pblk_get_meta_for_writes(struct pblk *pblk, struct nvm_rq *rqd);
void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd);
/*
* pblk user I/O write path
*/
-int pblk_write_to_cache(struct pblk *pblk, struct bio *bio,
+void pblk_write_to_cache(struct pblk *pblk, struct bio *bio,
unsigned long flags);
int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
* pblk read path
*/
extern struct bio_set pblk_bio_set;
-int pblk_submit_read(struct pblk *pblk, struct bio *bio);
+void pblk_submit_read(struct pblk *pblk, struct bio *bio);
int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
/*
* pblk recovery
void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
int *gc_active);
int pblk_gc_sysfs_force(struct pblk *pblk, int force);
+void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line);
/*
* pblk rate limiter
return 0;
}
if (ctrl->effects)
effects = le32_to_cpu(ctrl->effects->acs[opcode]);
- else
- effects = nvme_known_admin_effects(opcode);
+ /*
+ * Merge in the driver-known admin effects only after the
+ * controller-reported ones have been loaded; OR-ing before the
+ * assignment above would discard them whenever ctrl->effects is set.
+ */
+ effects |= nvme_known_admin_effects(opcode);
/*
* For simplicity, IO to all namespaces is quiesced even if the command
NULL,
};
-static int nvme_active_ctrls(struct nvme_subsystem *subsys)
+/*
+ * Decide whether @ctrl may be added to @subsys.
+ *
+ * Walks the controllers already linked on subsys->ctrls, ignoring those
+ * in the DELETING or DEAD state. @ctrl is rejected when a remaining
+ * controller has the same cntlid, or when one exists and neither bit 1
+ * of id->cmic nor the discovery_nqn option is set.
+ *
+ * Returns true if @ctrl is accepted, false (after logging) otherwise.
+ * The caller must hold nvme_subsystems_lock.
+ */
+static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
+ struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
- int count = 0;
- struct nvme_ctrl *ctrl;
+ struct nvme_ctrl *tmp;
+
+ lockdep_assert_held(&nvme_subsystems_lock);
+
+ list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
+ /* Test the list entry, not the controller being validated. */
+ if (tmp->state == NVME_CTRL_DELETING ||
+ tmp->state == NVME_CTRL_DEAD)
+ continue;
+
+ if (tmp->cntlid == ctrl->cntlid) {
+ dev_err(ctrl->device,
+ "Duplicate cntlid %u with %s, rejecting\n",
+ ctrl->cntlid, dev_name(tmp->device));
+ return false;
+ }
- mutex_lock(&subsys->lock);
- list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
- if (ctrl->state != NVME_CTRL_DELETING &&
- ctrl->state != NVME_CTRL_DEAD)
- count++;
+ if ((id->cmic & (1 << 1)) ||
+ (ctrl->opts && ctrl->opts->discovery_nqn))
+ continue;
+
+ dev_err(ctrl->device,
+ "Subsystem does not support multiple controllers\n");
+ return false;
}
- mutex_unlock(&subsys->lock);
- return count;
+ return true;
}
static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
mutex_lock(&nvme_subsystems_lock);
found = __nvme_find_get_subsystem(subsys->subnqn);
if (found) {
- /*
- * Verify that the subsystem actually supports multiple
- * controllers, else bail out.
- */
- if (!(ctrl->opts && ctrl->opts->discovery_nqn) &&
- nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
- dev_err(ctrl->device,
- "ignoring ctrl due to duplicate subnqn (%s).\n",
- found->subnqn);
- nvme_put_subsystem(found);
- ret = -EINVAL;
- goto out_unlock;
- }
-
__nvme_release_subsystem(subsys);
subsys = found;
+
+ if (!nvme_validate_cntlid(subsys, ctrl, id)) {
+ ret = -EINVAL;
+ goto out_put_subsystem;
+ }
} else {
ret = device_add(&subsys->dev);
if (ret) {
list_add_tail(&subsys->entry, &nvme_subsystems);
}
- ctrl->subsys = subsys;
- mutex_unlock(&nvme_subsystems_lock);
-
if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
dev_name(ctrl->device))) {
dev_err(ctrl->device,
"failed to create sysfs link from subsystem.\n");
- /* the transport driver will eventually put the subsystem */
- return -EINVAL;
+ /*
+ * ret is not set by the failing call above, so assign the error
+ * explicitly before unwinding; otherwise the caller could see
+ * success although the subsystem reference has been dropped.
+ */
+ ret = -EINVAL;
+ goto out_put_subsystem;
}
- mutex_lock(&subsys->lock);
+ ctrl->subsys = subsys;
list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
- mutex_unlock(&subsys->lock);
-
+ mutex_unlock(&nvme_subsystems_lock);
return 0;
+out_put_subsystem:
+ nvme_put_subsystem(subsys);
out_unlock:
mutex_unlock(&nvme_subsystems_lock);
put_device(&subsys->dev);
{
u32 aer_notice_type = (result & 0xff00) >> 8;
+ trace_nvme_async_event(ctrl, aer_notice_type);
+
switch (aer_notice_type) {
case NVME_AER_NOTICE_NS_CHANGED:
- trace_nvme_async_event(ctrl, aer_notice_type);
set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
nvme_queue_scan(ctrl);
break;
case NVME_AER_NOTICE_FW_ACT_STARTING:
- trace_nvme_async_event(ctrl, aer_notice_type);
queue_work(nvme_wq, &ctrl->fw_act_work);
break;
#ifdef CONFIG_NVME_MULTIPATH
case NVME_AER_NOTICE_ANA:
- trace_nvme_async_event(ctrl, aer_notice_type);
if (!ctrl->ana_log_buf)
break;
queue_work(nvme_wq, &ctrl->ana_work);
__free_page(ctrl->discard_page);
if (subsys) {
- mutex_lock(&subsys->lock);
+ mutex_lock(&nvme_subsystems_lock);
list_del(&ctrl->subsys_entry);
- mutex_unlock(&subsys->lock);
sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
+ mutex_unlock(&nvme_subsystems_lock);
}
ctrl->ops->free_ctrl(ctrl);
NVMF_OPT_DISABLE_SQFLOW)
static struct nvme_ctrl *
-nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
+nvmf_create_ctrl(struct device *dev, const char *buf)
{
struct nvmf_ctrl_options *opts;
struct nvmf_transport_ops *ops;
goto out_unlock;
}
- ctrl = nvmf_create_ctrl(nvmf_device, buf, count);
+ ctrl = nvmf_create_ctrl(nvmf_device, buf);
if (IS_ERR(ctrl)) {
ret = PTR_ERR(ctrl);
goto out_unlock;
static DEFINE_IDA(nvme_fc_local_port_cnt);
static DEFINE_IDA(nvme_fc_ctrl_cnt);
-
+static struct workqueue_struct *nvme_fc_wq;
/*
* These items are short-term. They will eventually be moved into
*/
if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) {
active = atomic_xchg(&ctrl->err_work_active, 1);
- if (!active && !schedule_work(&ctrl->err_work)) {
+ if (!active && !queue_work(nvme_fc_wq, &ctrl->err_work)) {
atomic_set(&ctrl->err_work_active, 0);
WARN_ON(1);
}
{
int ret;
+ nvme_fc_wq = alloc_workqueue("nvme_fc_wq", WQ_MEM_RECLAIM, 0);
+ if (!nvme_fc_wq)
+ return -ENOMEM;
+
/*
* NOTE:
* It is expected that in the future the kernel will combine
ret = class_register(&fc_class);
if (ret) {
pr_err("couldn't register class fc\n");
- return ret;
+ goto out_destroy_wq;
}
/*
device_destroy(&fc_class, MKDEV(0, 0));
out_destroy_class:
class_unregister(&fc_class);
+out_destroy_wq:
+ destroy_workqueue(nvme_fc_wq);
+
return ret;
}
device_destroy(&fc_class, MKDEV(0, 0));
class_unregister(&fc_class);
+ destroy_workqueue(nvme_fc_wq);
}
module_init(nvme_fc_init_module);
geo->csecs = 1 << ns->lba_shift;
geo->sos = ns->ms;
geo->ext = ns->ext;
+ geo->mdts = ns->ctrl->max_hw_sectors;
dev->q = q;
memcpy(dev->name, disk_name, DISK_NAME_LEN);
sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
} else if (ns->head->disk) {
sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
- ctrl->cntlid, ns->head->instance);
+ ctrl->instance, ns->head->instance);
*flags = GENHD_FL_HIDDEN;
} else {
sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
switch (dev->ctrl.state) {
case NVME_CTRL_DELETING:
shutdown = true;
+ /* fall through */
case NVME_CTRL_CONNECTING:
case NVME_CTRL_RESETTING:
dev_warn_ratelimited(dev->ctrl.device,
return ret;
}
dev->ctrl.tagset = &dev->tagset;
-
- nvme_dbbuf_set(dev);
} else {
blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
nvme_free_queues(dev, dev->online_queues);
}
+ nvme_dbbuf_set(dev);
return 0;
}
return ret;
}
-static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl,
- struct blk_mq_tag_set *set)
-{
- struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
-
- blk_mq_free_tag_set(set);
- nvme_rdma_dev_put(ctrl->device);
-}
-
static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
bool admin)
{
ret = blk_mq_alloc_tag_set(set);
if (ret)
- goto out;
-
- /*
- * We need a reference on the device as long as the tag_set is alive,
- * as the MRs in the request structures need a valid ib_device.
- */
- ret = nvme_rdma_dev_get(ctrl->device);
- if (!ret) {
- ret = -EINVAL;
- goto out_free_tagset;
- }
+ return ERR_PTR(ret);
return set;
-
-out_free_tagset:
- blk_mq_free_tag_set(set);
-out:
- return ERR_PTR(ret);
}
static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
{
if (remove) {
blk_cleanup_queue(ctrl->ctrl.admin_q);
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
+ blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
}
if (ctrl->async_event_sqe.data) {
nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
blk_cleanup_queue(ctrl->ctrl.admin_q);
out_free_tagset:
if (new)
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
+ blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
out_free_async_qe:
nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
sizeof(struct nvme_command), DMA_TO_DEVICE);
{
if (remove) {
blk_cleanup_queue(ctrl->ctrl.connect_q);
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
+ blk_mq_free_tag_set(ctrl->ctrl.tagset);
}
nvme_rdma_free_io_queues(ctrl);
}
blk_cleanup_queue(ctrl->ctrl.connect_q);
out_free_tag_set:
if (new)
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
+ blk_mq_free_tag_set(ctrl->ctrl.tagset);
out_free_io_queues:
nvme_rdma_free_io_queues(ctrl);
return ret;
aer_name(NVME_AER_NOTICE_NS_CHANGED),
aer_name(NVME_AER_NOTICE_ANA),
aer_name(NVME_AER_NOTICE_FW_ACT_STARTING),
+ aer_name(NVME_AER_NOTICE_DISC_CHANGED),
aer_name(NVME_AER_ERROR),
aer_name(NVME_AER_SMART),
aer_name(NVME_AER_CSS),
if ((start_padding_sectors || end_padding_sectors) &&
(rq_data_dir(req) == WRITE)) {
DBF_DEV_EVENT(DBF_ERR, basedev,
- "raw write not track aligned (%lu,%lu) req %p",
+ "raw write not track aligned (%llu,%llu) req %p",
start_padding_sectors, end_padding_sectors, req);
return ERR_PTR(-EINVAL);
}
u16 csecs; /* sector size */
u16 sos; /* out-of-band area size */
bool ext; /* metadata in extended data buffer */
+ u32 mdts; /* Max data transfer size*/
/* device write constrains */
u32 ws_min; /* minimum write size */
char name[DISK_NAME_LEN];
void *private_data;
+ struct kref ref;
void *rmap;
struct mutex mlock;
NVME_SC_FW_NEEDS_SUBSYS_RESET = 0x110,
NVME_SC_FW_NEEDS_RESET = 0x111,
NVME_SC_FW_NEEDS_MAX_TIME = 0x112,
- NVME_SC_FW_ACIVATE_PROHIBITED = 0x113,
+ NVME_SC_FW_ACTIVATE_PROHIBITED = 0x113,
NVME_SC_OVERLAPPING_RANGE = 0x114,
- NVME_SC_NS_INSUFFICENT_CAP = 0x115,
+ NVME_SC_NS_INSUFFICIENT_CAP = 0x115,
NVME_SC_NS_ID_UNAVAILABLE = 0x116,
NVME_SC_NS_ALREADY_ATTACHED = 0x118,
NVME_SC_NS_IS_PRIVATE = 0x119,