* @bdev: device
*
* Locates the passed device's request queue and returns the address of its
- * backing_dev_info
- *
- * Will return NULL if the request queue cannot be located.
+ * backing_dev_info. This function can only be called if @bdev is opened
+ * and the return value is never NULL.
*/
struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
{
- struct backing_dev_info *ret = NULL;
struct request_queue *q = bdev_get_queue(bdev);
- if (q)
- ret = &q->backing_dev_info;
- return ret;
+ return &q->backing_dev_info;
}
EXPORT_SYMBOL(blk_get_backing_dev_info);
* this function.
*
* This function does not cancel any asynchronous activity arising
- * out of elevator or throttling code. That would require elevaotor_exit()
+ * out of elevator or throttling code. That would require elevator_exit()
* and blkcg_exit_queue() to be called with queue lock initialized.
*
*/
* be drained. Check all the queues and counters.
*/
if (drain_all) {
+ struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
drain |= !list_empty(&q->queue_head);
for (i = 0; i < 2; i++) {
drain |= q->nr_rqs[i];
drain |= q->in_flight[i];
- drain |= !list_empty(&q->flush_queue[i]);
+ if (fq)
+ drain |= !list_empty(&fq->flush_queue[i]);
}
}
#ifdef CONFIG_BLK_CGROUP
INIT_LIST_HEAD(&q->blkg_list);
#endif
- INIT_LIST_HEAD(&q->flush_queue[0]);
- INIT_LIST_HEAD(&q->flush_queue[1]);
- INIT_LIST_HEAD(&q->flush_data_in_flight);
INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
kobject_init(&q->kobj, &blk_queue_ktype);
if (!q)
return NULL;
- q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL);
- if (!q->flush_rq)
+ q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, 0);
+ if (!q->fq)
return NULL;
if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
return q;
fail:
- kfree(q->flush_rq);
+ blk_free_flush_queue(q->fq);
return NULL;
}
EXPORT_SYMBOL(blk_init_allocated_queue);
* pressure or if @q is dead.
*
* Must be called with @q->queue_lock held and,
- * Returns %NULL on failure, with @q->queue_lock held.
- * Returns !%NULL on success, with @q->queue_lock *not held*.
+ * Returns ERR_PTR on failure, with @q->queue_lock held.
+ * Returns request pointer on success, with @q->queue_lock *not held*.
*/
static struct request *__get_request(struct request_list *rl, int rw_flags,
struct bio *bio, gfp_t gfp_mask)
int may_queue;
if (unlikely(blk_queue_dying(q)))
- return NULL;
+ return ERR_PTR(-ENODEV);
may_queue = elv_may_queue(q, rw_flags);
if (may_queue == ELV_MQUEUE_NO)
* process is not a "batcher", and not
* exempted by the IO scheduler
*/
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
}
}
* allocated with any setting of ->nr_requests
*/
if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
- return NULL;
+ return ERR_PTR(-ENOMEM);
q->nr_rqs[is_sync]++;
rl->count[is_sync]++;
* shouldn't stall IO. Treat this request as !elvpriv. This will
* disturb iosched and blkcg but weird is bettern than dead.
*/
- printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n",
- dev_name(q->backing_dev_info.dev));
+ printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n",
+ __func__, dev_name(q->backing_dev_info.dev));
rq->cmd_flags &= ~REQ_ELVPRIV;
rq->elv.icq = NULL;
rq_starved:
if (unlikely(rl->count[is_sync] == 0))
rl->starved[is_sync] = 1;
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
/**
* function keeps retrying under memory pressure and fails iff @q is dead.
*
* Must be called with @q->queue_lock held and,
- * Returns %NULL on failure, with @q->queue_lock held.
- * Returns !%NULL on success, with @q->queue_lock *not held*.
+ * Returns ERR_PTR on failure, with @q->queue_lock held.
+ * Returns request pointer on success, with @q->queue_lock *not held*.
*/
static struct request *get_request(struct request_queue *q, int rw_flags,
struct bio *bio, gfp_t gfp_mask)
rl = blk_get_rl(q, bio); /* transferred to @rq on success */
retry:
rq = __get_request(rl, rw_flags, bio, gfp_mask);
- if (rq)
+ if (!IS_ERR(rq))
return rq;
if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) {
blk_put_rl(rl);
- return NULL;
+ return rq;
}
/* wait on @rl and retry */
spin_lock_irq(q->queue_lock);
rq = get_request(q, rw, NULL, gfp_mask);
- if (!rq)
+ if (IS_ERR(rq))
spin_unlock_irq(q->queue_lock);
/* q->queue_lock is unlocked at this point */
{
struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask);
- if (unlikely(!rq))
- return ERR_PTR(-ENOMEM);
+ if (IS_ERR(rq))
+ return rq;
blk_rq_set_block_pc(rq);
EXPORT_SYMBOL(blk_make_request);
/**
- * blk_rq_set_block_pc - initialize a requeest to type BLOCK_PC
+ * blk_rq_set_block_pc - initialize a request to type BLOCK_PC
* @rq: request to be initialized
*
*/
* Returns with the queue unlocked.
*/
req = get_request(q, rw_flags, bio, GFP_NOIO);
- if (unlikely(!req)) {
- bio_endio(bio, -ENODEV); /* @q is dead */
+ if (IS_ERR(req)) {
+ bio_endio(bio, PTR_ERR(req)); /* @q is dead */
goto out_unlock;
}
{
int total_bytes;
+ trace_block_rq_complete(req->q, req, nr_bytes);
+
if (!req->bio)
return false;
- trace_block_rq_complete(req->q, req, nr_bytes);
-
/*
* For fs requests, rq is just carrier of independent bio's
* and each partial completion should be handled separately.
error_type = "I/O";
break;
}
- printk_ratelimited(KERN_ERR "end_request: %s error, dev %s, sector %llu\n",
- error_type, req->rq_disk ?
+ printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n",
+ __func__, error_type, req->rq_disk ?
req->rq_disk->disk_name : "?",
(unsigned long long)blk_rq_pos(req));
blk_rq_init(NULL, rq);
__rq_for_each_bio(bio_src, rq_src) {
- bio = bio_clone_bioset(bio_src, gfp_mask, bs);
+ bio = bio_clone_fast(bio_src, gfp_mask, bs);
if (!bio)
goto free_and_out;
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/delay.h>
+ #include <linux/crash_dump.h>
#include <trace/events/block.h>
struct blk_mq_hw_ctx *hctx;
struct request *rq;
struct blk_mq_alloc_data alloc_data;
+ int ret;
- if (blk_mq_queue_enter(q))
- return NULL;
+ ret = blk_mq_queue_enter(q);
+ if (ret)
+ return ERR_PTR(ret);
ctx = blk_mq_get_ctx(q);
hctx = q->mq_ops->map_queue(q, ctx->cpu);
ctx = alloc_data.ctx;
}
blk_mq_put_ctx(ctx);
+ if (!rq)
+ return ERR_PTR(-EWOULDBLOCK);
return rq;
}
EXPORT_SYMBOL(blk_mq_alloc_request);
__blk_mq_free_request(hctx, ctx, rq);
}
- /*
- * Clone all relevant state from a request that has been put on hold in
- * the flush state machine into the preallocated flush request that hangs
- * off the request queue.
- *
- * For a driver the flush request should be invisible, that's why we are
- * impersonating the original request here.
- */
- void blk_mq_clone_flush_request(struct request *flush_rq,
- struct request *orig_rq)
- {
- struct blk_mq_hw_ctx *hctx =
- orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu);
-
- flush_rq->mq_ctx = orig_rq->mq_ctx;
- flush_rq->tag = orig_rq->tag;
- memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq),
- hctx->cmd_size);
- }
-
- inline void __blk_mq_end_io(struct request *rq, int error)
+ inline void __blk_mq_end_request(struct request *rq, int error)
{
blk_account_io_done(rq);
blk_mq_free_request(rq);
}
}
- EXPORT_SYMBOL(__blk_mq_end_io);
+ EXPORT_SYMBOL(__blk_mq_end_request);
- void blk_mq_end_io(struct request *rq, int error)
+ void blk_mq_end_request(struct request *rq, int error)
{
if (blk_update_request(rq, error, blk_rq_bytes(rq)))
BUG();
- __blk_mq_end_io(rq, error);
+ __blk_mq_end_request(rq, error);
}
- EXPORT_SYMBOL(blk_mq_end_io);
+ EXPORT_SYMBOL(blk_mq_end_request);
static void __blk_mq_complete_request_remote(void *data)
{
struct request_queue *q = rq->q;
if (!q->softirq_done_fn)
- blk_mq_end_io(rq, rq->errors);
+ blk_mq_end_request(rq, rq->errors);
else
blk_mq_ipi_complete_request(rq);
}
}
EXPORT_SYMBOL(blk_mq_complete_request);
- static void blk_mq_start_request(struct request *rq, bool last)
+ void blk_mq_start_request(struct request *rq)
{
struct request_queue *q = rq->q;
*/
rq->nr_phys_segments++;
}
-
- /*
- * Flag the last request in the series so that drivers know when IO
- * should be kicked off, if they don't do it on a per-request basis.
- *
- * Note: the flag isn't the only condition drivers should do kick off.
- * If drive is busy, the last request might not have the bit set.
- */
- if (last)
- rq->cmd_flags |= REQ_END;
}
+ EXPORT_SYMBOL(blk_mq_start_request);
static void __blk_mq_requeue_request(struct request *rq)
{
struct request_queue *q = rq->q;
trace_block_rq_requeue(q, rq);
- clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-
- rq->cmd_flags &= ~REQ_END;
- if (q->dma_drain_size && blk_rq_bytes(rq))
- rq->nr_phys_segments--;
+ if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
+ if (q->dma_drain_size && blk_rq_bytes(rq))
+ rq->nr_phys_segments--;
+ }
}
void blk_mq_requeue_request(struct request *rq)
{
__blk_mq_requeue_request(rq);
- blk_clear_rq_complete(rq);
BUG_ON(blk_queued_rq(rq));
blk_mq_add_to_requeue_list(rq, true);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);
- static inline bool is_flush_request(struct request *rq, unsigned int tag)
+ static inline bool is_flush_request(struct request *rq,
+ struct blk_flush_queue *fq, unsigned int tag)
{
return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
- rq->q->flush_rq->tag == tag);
+ fq->flush_rq->tag == tag);
}
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
struct request *rq = tags->rqs[tag];
+ /* mq_ctx of flush rq is always cloned from the corresponding req */
+ struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx);
- if (!is_flush_request(rq, tag))
+ if (!is_flush_request(rq, fq, tag))
return rq;
- return rq->q->flush_rq;
+ return fq->flush_rq;
}
EXPORT_SYMBOL(blk_mq_tag_to_rq);
struct blk_mq_timeout_data {
- struct blk_mq_hw_ctx *hctx;
- unsigned long *next;
- unsigned int *next_set;
+ unsigned long next;
+ unsigned int next_set;
};
- static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
+ void blk_mq_rq_timed_out(struct request *req, bool reserved)
{
- struct blk_mq_timeout_data *data = __data;
- struct blk_mq_hw_ctx *hctx = data->hctx;
- unsigned int tag;
-
- /* It may not be in flight yet (this is where
- * the REQ_ATOMIC_STARTED flag comes in). The requests are
- * statically allocated, so we know it's always safe to access the
- * memory associated with a bit offset into ->rqs[].
- */
- tag = 0;
- do {
- struct request *rq;
-
- tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag);
- if (tag >= hctx->tags->nr_tags)
- break;
-
- rq = blk_mq_tag_to_rq(hctx->tags, tag++);
- if (rq->q != hctx->queue)
- continue;
- if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
- continue;
-
- blk_rq_check_expired(rq, data->next, data->next_set);
- } while (1);
- }
-
- static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
- unsigned long *next,
- unsigned int *next_set)
- {
- struct blk_mq_timeout_data data = {
- .hctx = hctx,
- .next = next,
- .next_set = next_set,
- };
-
- /*
- * Ask the tagging code to iterate busy requests, so we can
- * check them for timeout.
- */
- blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
- }
-
- static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq)
- {
- struct request_queue *q = rq->q;
+ struct blk_mq_ops *ops = req->q->mq_ops;
+ enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
/*
* We know that complete is set at this point. If STARTED isn't set
* we both flags will get cleared. So check here again, and ignore
* a timeout event with a request that isn't active.
*/
- if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
- return BLK_EH_NOT_HANDLED;
+ if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
+ return;
+
+ if (ops->timeout)
+ ret = ops->timeout(req, reserved);
+
+ switch (ret) {
+ case BLK_EH_HANDLED:
+ __blk_mq_complete_request(req);
+ break;
+ case BLK_EH_RESET_TIMER:
+ blk_add_timer(req);
+ blk_clear_rq_complete(req);
+ break;
+ case BLK_EH_NOT_HANDLED:
+ break;
+ default:
+ printk(KERN_ERR "block: bad eh return: %d\n", ret);
+ break;
+ }
+ }
+
+ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
+ struct request *rq, void *priv, bool reserved)
+ {
+ struct blk_mq_timeout_data *data = priv;
- if (!q->mq_ops->timeout)
- return BLK_EH_RESET_TIMER;
+ if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
+ return;
- return q->mq_ops->timeout(rq);
+ if (time_after_eq(jiffies, rq->deadline)) {
+ if (!blk_mark_rq_complete(rq))
+ blk_mq_rq_timed_out(rq, reserved);
+ } else if (!data->next_set || time_after(data->next, rq->deadline)) {
+ data->next = rq->deadline;
+ data->next_set = 1;
+ }
}
- static void blk_mq_rq_timer(unsigned long data)
+ static void blk_mq_rq_timer(unsigned long priv)
{
- struct request_queue *q = (struct request_queue *) data;
+ struct request_queue *q = (struct request_queue *)priv;
+ struct blk_mq_timeout_data data = {
+ .next = 0,
+ .next_set = 0,
+ };
struct blk_mq_hw_ctx *hctx;
- unsigned long next = 0;
- int i, next_set = 0;
+ int i;
queue_for_each_hw_ctx(q, hctx, i) {
/*
if (!hctx->nr_ctx || !hctx->tags)
continue;
- blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
+ blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
}
- if (next_set) {
- next = blk_rq_timeout(round_jiffies_up(next));
- mod_timer(&q->timeout, next);
+ if (data.next_set) {
+ data.next = blk_rq_timeout(round_jiffies_up(data.next));
+ mod_timer(&q->timeout, data.next);
} else {
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_tag_idle(hctx);
rq = list_first_entry(&rq_list, struct request, queuelist);
list_del_init(&rq->queuelist);
- blk_mq_start_request(rq, list_empty(&rq_list));
-
- ret = q->mq_ops->queue_rq(hctx, rq);
+ ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list));
switch (ret) {
case BLK_MQ_RQ_QUEUE_OK:
queued++;
pr_err("blk-mq: bad return on queue: %d\n", ret);
case BLK_MQ_RQ_QUEUE_ERROR:
rq->errors = -EIO;
- blk_mq_end_io(rq, rq->errors);
+ blk_mq_end_request(rq, rq->errors);
break;
}
int ret;
blk_mq_bio_to_request(rq, bio);
- blk_mq_start_request(rq, true);
/*
* For OK queue, we are done. For error, kill it. Any other
* error (busy), just add it to our list as we previously
* would have done
*/
- ret = q->mq_ops->queue_rq(data.hctx, rq);
+ ret = q->mq_ops->queue_rq(data.hctx, rq, true);
if (ret == BLK_MQ_RQ_QUEUE_OK)
goto done;
else {
if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
rq->errors = -EIO;
- blk_mq_end_io(rq, rq->errors);
+ blk_mq_end_request(rq, rq->errors);
goto done;
}
}
return NOTIFY_OK;
}
+ /*
+ * Undo blk_mq_init_hctx() for a single hardware context.
+ *
+ * Marks the hctx's tags idle, hands the flush request back to the driver's
+ * ->exit_request() hook (if any), calls the driver's ->exit_hctx() hook
+ * (if any), and then drops the CPU notifier, the flush queue, the ctx
+ * array and the ctx bitmap.
+ */
+ static void blk_mq_exit_hctx(struct request_queue *q,
+ struct blk_mq_tag_set *set,
+ struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+ {
+ /* the flush request is indexed past the regular tags of the set */
+ unsigned int flush_rq_idx = set->queue_depth + hctx_idx;
+
+ blk_mq_tag_idle(hctx);
+
+ if (set->ops->exit_request)
+ set->ops->exit_request(set->driver_data, hctx->fq->flush_rq,
+ hctx_idx, flush_rq_idx);
+
+ if (set->ops->exit_hctx)
+ set->ops->exit_hctx(hctx, hctx_idx);
+
+ blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
+ blk_free_flush_queue(hctx->fq);
+ kfree(hctx->ctxs);
+ blk_mq_free_bitmap(&hctx->ctx_map);
+ }
+
static void blk_mq_exit_hw_queues(struct request_queue *q,
struct blk_mq_tag_set *set, int nr_queue)
{
queue_for_each_hw_ctx(q, hctx, i) {
if (i == nr_queue)
break;
-
- blk_mq_tag_idle(hctx);
-
- if (set->ops->exit_hctx)
- set->ops->exit_hctx(hctx, i);
-
- blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
- kfree(hctx->ctxs);
- blk_mq_free_bitmap(&hctx->ctx_map);
+ blk_mq_exit_hctx(q, set, hctx, i);
}
-
}
static void blk_mq_free_hw_queues(struct request_queue *q,
}
}
- static int blk_mq_init_hw_queues(struct request_queue *q,
- struct blk_mq_tag_set *set)
+ /*
+ * Set up one hardware context of @q from the tag set @set.
+ *
+ * Initializes the hctx bookkeeping fields, registers its CPU notifier,
+ * allocates the ctx array and ctx bitmap, calls the driver's ->init_hctx()
+ * hook (if any), allocates the per-hctx flush queue and passes its flush
+ * request to the driver's ->init_request() hook (if any).
+ *
+ * Returns 0 on success. On failure every step completed so far is undone
+ * in reverse order and -1 is returned.
+ */
+ static int blk_mq_init_hctx(struct request_queue *q,
+ struct blk_mq_tag_set *set,
+ struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
{
- struct blk_mq_hw_ctx *hctx;
- unsigned int i;
+ int node;
+ unsigned flush_start_tag = set->queue_depth;
+
+ node = hctx->numa_node;
+ if (node == NUMA_NO_NODE)
+ node = hctx->numa_node = set->numa_node;
+
+ INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
+ INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
+ spin_lock_init(&hctx->lock);
+ INIT_LIST_HEAD(&hctx->dispatch);
+ hctx->queue = q;
+ hctx->queue_num = hctx_idx;
+ hctx->flags = set->flags;
+ hctx->cmd_size = set->cmd_size;
+
+ blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
+ blk_mq_hctx_notify, hctx);
+ blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
+
+ hctx->tags = set->tags[hctx_idx];
/*
- * Initialize hardware queues
+ * Allocate space for all possible cpus to avoid allocation at
+ * runtime
*/
- queue_for_each_hw_ctx(q, hctx, i) {
- int node;
+ hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
+ GFP_KERNEL, node);
+ if (!hctx->ctxs)
+ goto unregister_cpu_notifier;
- node = hctx->numa_node;
- if (node == NUMA_NO_NODE)
- node = hctx->numa_node = set->numa_node;
+ if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
+ goto free_ctxs;
- INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
- INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
- spin_lock_init(&hctx->lock);
- INIT_LIST_HEAD(&hctx->dispatch);
- hctx->queue = q;
- hctx->queue_num = i;
- hctx->flags = set->flags;
- hctx->cmd_size = set->cmd_size;
+ hctx->nr_ctx = 0;
- blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
- blk_mq_hctx_notify, hctx);
- blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
+ if (set->ops->init_hctx &&
+ set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
+ goto free_bitmap;
- hctx->tags = set->tags[i];
+ hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
+ if (!hctx->fq)
+ goto exit_hctx;
- /*
- * Allocate space for all possible cpus to avoid allocation at
- * runtime
- */
- hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
- GFP_KERNEL, node);
- if (!hctx->ctxs)
- break;
+ if (set->ops->init_request &&
+ set->ops->init_request(set->driver_data,
+ hctx->fq->flush_rq, hctx_idx,
+ flush_start_tag + hctx_idx, node))
+ goto free_fq;
- if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
- break;
+ return 0;
- hctx->nr_ctx = 0;
+ free_fq:
+ /*
+ * Release the flush queue the same way blk_mq_exit_hctx() does; a
+ * bare kfree() of the container would leave fq->flush_rq allocated.
+ */
+ blk_free_flush_queue(hctx->fq);
+ exit_hctx:
+ if (set->ops->exit_hctx)
+ set->ops->exit_hctx(hctx, hctx_idx);
+ free_bitmap:
+ blk_mq_free_bitmap(&hctx->ctx_map);
+ free_ctxs:
+ kfree(hctx->ctxs);
+ unregister_cpu_notifier:
+ blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
+
+ return -1;
+ }
+
+ static int blk_mq_init_hw_queues(struct request_queue *q,
+ struct blk_mq_tag_set *set)
+ {
+ struct blk_mq_hw_ctx *hctx;
+ unsigned int i;
- if (set->ops->init_hctx &&
- set->ops->init_hctx(hctx, set->driver_data, i))
+ /*
+ * Initialize hardware queues
+ */
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (blk_mq_init_hctx(q, set, hctx, i))
break;
}
if (!ctx)
return ERR_PTR(-ENOMEM);
+ /*
+ * If a crashdump is active, then we are potentially in a very
+ * memory constrained environment. Limit us to 1 queue and
+ * 64 tags to prevent using too much memory.
+ */
+ if (is_kdump_kernel()) {
+ set->nr_hw_queues = 1;
+ set->queue_depth = min(64U, set->queue_depth);
+ }
+
hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
set->numa_node);
if (!hctxs[i])
goto err_hctxs;
- if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
+ if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
+ node))
goto err_hctxs;
atomic_set(&hctxs[i]->nr_active, 0);
if (!q)
goto err_hctxs;
- if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release))
+ /*
+ * Init percpu_ref in atomic mode so that it's faster to shutdown.
+ * See blk_register_queue() for details.
+ */
+ if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
+ PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
goto err_map;
setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
else
blk_queue_make_request(q, blk_sq_make_request);
- blk_queue_rq_timed_out(q, blk_mq_rq_timed_out);
if (set->timeout)
blk_queue_rq_timeout(q, set->timeout);
if (set->ops->complete)
blk_queue_softirq_done(q, set->ops->complete);
- blk_mq_init_flush(q);
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
- q->flush_rq = kzalloc(round_up(sizeof(struct request) +
- set->cmd_size, cache_line_size()),
- GFP_KERNEL);
- if (!q->flush_rq)
- goto err_hw;
-
if (blk_mq_init_hw_queues(q, set))
- goto err_flush_rq;
+ goto err_hw;
mutex_lock(&all_q_mutex);
list_add_tail(&q->all_q_node, &all_q_list);
return q;
- err_flush_rq:
- kfree(q->flush_rq);
err_hw:
blk_cleanup_queue(q);
err_hctxs:
if (q->mq_ops)
blk_mq_free_queue(q);
-
- kfree(q->flush_rq);
+ else
+ blk_free_flush_queue(q->fq);
blk_trace_shutdown(q);
return -ENXIO;
/*
- * Initialization must be complete by now. Finish the initial
- * bypass from queue allocation.
+ * SCSI probing may synchronously create and destroy a lot of
+ * request_queues for non-existent devices. Shutting down a fully
+ * functional queue takes measurable wallclock time as RCU grace
+ * periods are involved. To avoid excessive latency in these
+ * cases, a request_queue starts out in a degraded mode which is
+ * faster to shut down and is made fully functional here as
+ * request_queues for non-existent devices never get registered.
*/
if (!blk_queue_init_done(q)) {
queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
blk_queue_bypass_end(q);
+ if (q->mq_ops)
+ blk_mq_finish_init(q);
}
ret = blk_trace_init_sysfs(dev);
/* Process context for config space updates */
struct work_struct config_work;
- /* Lock for config space updates */
- struct mutex config_lock;
-
- /* enable config space updates */
- bool config_enable;
-
/* What host tells us, plus 2 for header & tailer. */
unsigned int sg_elems;
req->errors = (error != 0);
}
- blk_mq_end_io(req, error);
+ blk_mq_end_request(req, error);
}
static void virtblk_done(struct virtqueue *vq)
spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
}
- static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
+ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
+ bool last)
{
struct virtio_blk *vblk = hctx->queue->queuedata;
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
unsigned long flags;
unsigned int num;
int qid = hctx->queue_num;
- const bool last = (req->cmd_flags & REQ_END) != 0;
int err;
bool notify = false;
}
}
+ blk_mq_start_request(req);
+
num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg);
if (num) {
if (rq_data_dir(vbr->req) == WRITE)
char *envp[] = { "RESIZE=1", NULL };
u64 capacity, size;
- mutex_lock(&vblk->config_lock);
- if (!vblk->config_enable)
- goto done;
-
/* Host must always specify the capacity. */
virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity);
set_capacity(vblk->disk, capacity);
revalidate_disk(vblk->disk);
kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp);
-done:
- mutex_unlock(&vblk->config_lock);
}
static void virtblk_config_changed(struct virtio_device *vdev)
vblk->vdev = vdev;
vblk->sg_elems = sg_elems;
- mutex_init(&vblk->config_lock);
INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
- vblk->config_enable = true;
err = init_vq(vblk);
if (err)
if (!err && opt_io_size)
blk_queue_io_opt(q, blk_size * opt_io_size);
+ virtio_device_ready(vdev);
+
add_disk(vblk->disk);
err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial);
if (err)
int index = vblk->index;
int refc;
- /* Prevent config work handler from accessing the device. */
- mutex_lock(&vblk->config_lock);
- vblk->config_enable = false;
- mutex_unlock(&vblk->config_lock);
+ /* Make sure no work handler is accessing the device. */
+ flush_work(&vblk->config_work);
del_gendisk(vblk->disk);
blk_cleanup_queue(vblk->disk->queue);
/* Stop all the virtqueues. */
vdev->config->reset(vdev);
- flush_work(&vblk->config_work);
-
refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
put_disk(vblk->disk);
vdev->config->del_vqs(vdev);
/* Ensure we don't receive any more interrupts */
vdev->config->reset(vdev);
- /* Prevent config work handler from accessing the device. */
- mutex_lock(&vblk->config_lock);
- vblk->config_enable = false;
- mutex_unlock(&vblk->config_lock);
-
+ /* Make sure no work handler is accessing the device. */
flush_work(&vblk->config_work);
blk_mq_stop_hw_queues(vblk->disk->queue);
struct virtio_blk *vblk = vdev->priv;
int ret;
- vblk->config_enable = true;
ret = init_vq(vdev->priv);
- if (!ret)
- blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
+ if (ret)
+ return ret;
- return ret;
+ virtio_device_ready(vdev);
+
+ blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
+ return 0;
}
#endif
config SCSI_NETLINK
bool
default n
- select NET
+ depends on NET
+
+config SCSI_MQ_DEFAULT
+ bool "SCSI: use blk-mq I/O path by default"
+ depends on SCSI
+ ---help---
+ This option enables the new blk-mq based I/O path for SCSI
+ devices by default. With the option the scsi_mod.use_blk_mq
+ module/boot option defaults to Y, without it to N, but it can
+ still be overridden either way.
+
+ If unsure say N.
config SCSI_PROC_FS
bool "legacy /proc/scsi/ support"
config BLK_DEV_SD
tristate "SCSI disk support"
depends on SCSI
- select CRC_T10DIF if BLK_DEV_INTEGRITY
---help---
If you want to use SCSI hard disks, Fibre Channel disks,
Serial ATA (SATA) or Parallel ATA (PATA) hard disks,
config SCSI_FC_ATTRS
tristate "FiberChannel Transport Attributes"
- depends on SCSI
+ depends on SCSI && NET
select SCSI_NETLINK
help
If you wish to export transport-specific information about
To compile this driver as a module, choose M here: the
module will be called vmw_pvscsi.
+config XEN_SCSI_FRONTEND
+ tristate "XEN SCSI frontend driver"
+ depends on SCSI && XEN
+ select XEN_XENBUS_FRONTEND
+ help
+ The XEN SCSI frontend driver allows the kernel to access SCSI Devices
+ within another guest OS (usually Dom0).
+ Only needed if the kernel is running in a XEN guest and generic
+ SCSI access to a device is needed.
+
config HYPERV_STORAGE
tristate "Microsoft Hyper-V virtual storage driver"
depends on SCSI && HYPERV
config LIBFC
tristate "LibFC module"
- select SCSI_FC_ATTRS
+ depends on SCSI_FC_ATTRS
select CRC32
---help---
Fibre Channel library module
config LIBFCOE
tristate "LibFCoE module"
- select LIBFC
+ depends on LIBFC
---help---
Library for Fibre Channel over Ethernet module
config FCOE
tristate "FCoE module"
depends on PCI
- select LIBFCOE
+ depends on LIBFCOE
---help---
Fibre Channel over Ethernet module
config FCOE_FNIC
tristate "Cisco FNIC Driver"
depends on PCI && X86
- select LIBFCOE
+ depends on LIBFCOE
help
This is support for the Cisco PCI-Express FCoE HBA.
config SCSI_IBMVFC
tristate "IBM Virtual FC support"
depends on PPC_PSERIES && SCSI
- select SCSI_FC_ATTRS
+ depends on SCSI_FC_ATTRS
help
This is the IBM POWER Virtual FC Client
config SCSI_LPFC
tristate "Emulex LightPulse Fibre Channel Support"
depends on PCI && SCSI
- select SCSI_FC_ATTRS
+ depends on SCSI_FC_ATTRS
select CRC_T10DIF
help
This lpfc driver supports the Emulex LightPulse
config ZFCP
tristate "FCP host bus adapter driver for IBM eServer zSeries"
depends on S390 && QDIO && SCSI
- select SCSI_FC_ATTRS
+ depends on SCSI_FC_ATTRS
help
If you want to access SCSI devices attached to your IBM eServer
zSeries by means of Fibre Channel interfaces say Y.
config SCSI_BFA_FC
tristate "Brocade BFA Fibre Channel Support"
depends on PCI && SCSI
- select SCSI_FC_ATTRS
+ depends on SCSI_FC_ATTRS
help
This bfa driver supports all Brocade PCIe FC/FCOE host adapters.
/**
* scsi_eh_test_devices - check if devices are responding from error recovery.
* @cmd_list: scsi commands in error recovery.
- * @work_q: queue for commands which still need more error recovery
- * @done_q: queue for commands which are finished
- * @try_stu: boolean on if a STU command should be tried in addition to TUR.
+ * @work_q: queue for commands which still need more error recovery
+ * @done_q: queue for commands which are finished
+ * @try_stu: boolean on if a STU command should be tried in addition to TUR.
*
* Decription:
* Tests if devices are in a working state. Commands to devices now in
/**
* scsi_eh_stu - send START_UNIT if needed
* @shost: &scsi host being recovered.
- * @work_q: &list_head for pending commands.
+ * @work_q: &list_head for pending commands.
* @done_q: &list_head for processed commands.
*
* Notes:
/**
* scsi_eh_bus_device_reset - send bdr if needed
* @shost: scsi host being recovered.
- * @work_q: &list_head for pending commands.
+ * @work_q: &list_head for pending commands.
* @done_q: &list_head for processed commands.
*
* Notes:
/**
* scsi_eh_target_reset - send target reset if needed
* @shost: scsi host being recovered.
- * @work_q: &list_head for pending commands.
+ * @work_q: &list_head for pending commands.
* @done_q: &list_head for processed commands.
*
* Notes:
/**
* scsi_eh_bus_reset - send a bus reset
* @shost: &scsi host being recovered.
- * @work_q: &list_head for pending commands.
+ * @work_q: &list_head for pending commands.
* @done_q: &list_head for processed commands.
*/
static int scsi_eh_bus_reset(struct Scsi_Host *shost,
/**
* scsi_eh_host_reset - send a host reset
- * @work_q: list_head for processed commands.
- * @done_q: list_head for processed commands.
+ * @shost: host to be reset.
+ * @work_q: &list_head for pending commands.
+ * @done_q: &list_head for processed commands.
*/
static int scsi_eh_host_reset(struct Scsi_Host *shost,
struct list_head *work_q,
/**
* scsi_eh_offline_sdevs - offline scsi devices that fail to recover
- * @work_q: list_head for processed commands.
- * @done_q: list_head for processed commands.
+ * @work_q: &list_head for pending commands.
+ * @done_q: &list_head for processed commands.
*/
static void scsi_eh_offline_sdevs(struct list_head *work_q,
struct list_head *done_q)
* request becomes available
*/
req = blk_get_request(sdev->request_queue, READ, GFP_KERNEL);
+ if (IS_ERR(req))
+ return;
blk_rq_set_block_pc(req);
/**
* scsi_eh_ready_devs - check device ready state and recover if not.
- * @shost: host to be recovered.
- * @work_q: &list_head for pending commands.
+ * @shost: host to be recovered.
+ * @work_q: &list_head for pending commands.
* @done_q: &list_head for processed commands.
*/
void scsi_eh_ready_devs(struct Scsi_Host *shost,
int ret = DRIVER_ERROR << 24;
req = blk_get_request(sdev->request_queue, write, __GFP_WAIT);
- if (!req)
+ if (IS_ERR(req))
return ret;
blk_rq_set_block_pc(req);
static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd)
{
struct scsi_device *sdev = cmd->device;
+ struct Scsi_Host *shost = sdev->host;
unsigned long flags;
- BUG_ON(list_empty(&cmd->list));
-
scsi_mq_free_sgtables(cmd);
scsi_uninit_cmd(cmd);
- spin_lock_irqsave(&sdev->list_lock, flags);
- list_del_init(&cmd->list);
- spin_unlock_irqrestore(&sdev->list_lock, flags);
+ if (shost->use_cmd_list) {
+ BUG_ON(list_empty(&cmd->list));
+ spin_lock_irqsave(&sdev->list_lock, flags);
+ list_del_init(&cmd->list);
+ spin_unlock_irqrestore(&sdev->list_lock, flags);
+ }
}
/*
if (req->mq_ctx) {
/*
- * In the MQ case the command gets freed by __blk_mq_end_io,
+ * In the MQ case the command gets freed by __blk_mq_end_request,
* so we have to do all cleanup that depends on it earlier.
*
* We also can't kick the queues from irq context, so we
*/
scsi_mq_uninit_cmd(cmd);
- __blk_mq_end_io(req, error);
+ __blk_mq_end_request(req, error);
if (scsi_target(sdev)->single_lun ||
!list_empty(&sdev->host->starved_list))
} else {
unsigned long flags;
+ if (bidi_bytes)
+ scsi_release_bidi_buffers(cmd);
+
spin_lock_irqsave(q->queue_lock, flags);
blk_finish_request(req, error);
spin_unlock_irqrestore(q->queue_lock, flags);
- if (bidi_bytes)
- scsi_release_bidi_buffers(cmd);
scsi_release_buffers(cmd);
scsi_next_command(cmd);
}
INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler);
cmd->jiffies_at_alloc = jiffies;
- /*
- * XXX: cmd_list lookups are only used by two drivers, try to get
- * rid of this list in common code.
- */
- spin_lock_irq(&sdev->list_lock);
- list_add_tail(&cmd->list, &sdev->cmd_list);
- spin_unlock_irq(&sdev->list_lock);
+ if (shost->use_cmd_list) {
+ spin_lock_irq(&sdev->list_lock);
+ list_add_tail(&cmd->list, &sdev->cmd_list);
+ spin_unlock_irq(&sdev->list_lock);
+ }
sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
cmd->sdb.table.sgl = sg;
next_rq->special = bidi_sdb;
}
+ blk_mq_start_request(req);
+
return scsi_setup_cmnd(sdev, req);
}
blk_mq_complete_request(cmd->request);
}
- static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
+ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
+ bool last)
{
struct request_queue *q = req->q;
struct scsi_device *sdev = q->queuedata;
if (!scsi_host_queue_ready(q, shost, sdev))
goto out_dec_target_busy;
+
if (!(req->cmd_flags & REQ_DONTPREP)) {
ret = prep_to_mq(scsi_mq_prep_fn(req));
if (ret)
goto out_dec_host_busy;
req->cmd_flags |= REQ_DONTPREP;
+ } else {
+ blk_mq_start_request(req);
}
scsi_init_cmd_errh(cmd);
return ret;
}
+ /*
+  * Installed as the .timeout callback: a reserved request simply yields
+  * BLK_EH_RESET_TIMER, every other request is handed to scsi_times_out()
+  * and its result is returned unchanged.
+  */
+ static enum blk_eh_timer_return scsi_timeout(struct request *req,
+ bool reserved)
+ {
+ return reserved ? BLK_EH_RESET_TIMER : scsi_times_out(req);
+ }
+
static int scsi_init_request(void *data, struct request *rq,
unsigned int hctx_idx, unsigned int request_idx,
unsigned int numa_node)
.map_queue = blk_mq_map_queue,
.queue_rq = scsi_queue_rq,
.complete = scsi_softirq_done,
- .timeout = scsi_times_out,
+ .timeout = scsi_timeout,
.init_request = scsi_init_request,
.exit_request = scsi_exit_request,
};
if (ct < 0)
return -EINVAL;
rcd = ct & 0x01 ? 1 : 0;
- wce = ct & 0x02 ? 1 : 0;
+ wce = (ct & 0x02) && !sdkp->write_prot ? 1 : 0;
if (sdkp->cache_override) {
sdkp->WCE = wce;
mutex_unlock(&sd_ref_mutex);
}
- static void sd_prot_op(struct scsi_cmnd *scmd, unsigned int dif)
+
+ /* Fill in prot_op, prot_type and prot_flags on @scmd from @dix/@dif and the bio's integrity flags; returns 0 when @dif is zero, otherwise 1 << 5 or 3 << 5. */
+ static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd,
+ unsigned int dix, unsigned int dif)
{
- unsigned int prot_op = SCSI_PROT_NORMAL;
- unsigned int dix = scsi_prot_sg_count(scmd);
-
- if (scmd->sc_data_direction == DMA_FROM_DEVICE) {
- if (dif && dix)
- prot_op = SCSI_PROT_READ_PASS;
- else if (dif && !dix)
- prot_op = SCSI_PROT_READ_STRIP;
- else if (!dif && dix)
- prot_op = SCSI_PROT_READ_INSERT;
- } else {
- if (dif && dix)
- prot_op = SCSI_PROT_WRITE_PASS;
- else if (dif && !dix)
- prot_op = SCSI_PROT_WRITE_INSERT;
- else if (!dif && dix)
- prot_op = SCSI_PROT_WRITE_STRIP;
+ struct bio *bio = scmd->request->bio;
+ unsigned int prot_op = sd_prot_op(rq_data_dir(scmd->request), dix, dif);
+ unsigned int protect = 0; /* return value; only changed in the "if (dif)" branch below */
+
+ if (dix) { /* DIX Type 0, 1, 2, 3 */
+ if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM))
+ scmd->prot_flags |= SCSI_PROT_IP_CHECKSUM;
+
+ if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false)
+ scmd->prot_flags |= SCSI_PROT_GUARD_CHECK;
+ }
+
+ if (dif != SD_DIF_TYPE3_PROTECTION) { /* DIX/DIF Type 0, 1, 2 */
+ scmd->prot_flags |= SCSI_PROT_REF_INCREMENT;
+
+ if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false)
+ scmd->prot_flags |= SCSI_PROT_REF_CHECK;
+ }
+
+ if (dif) { /* DIX/DIF Type 1, 2, 3 */
+ scmd->prot_flags |= SCSI_PROT_TRANSFER_PI;
+
+ if (bio_integrity_flagged(bio, BIP_DISK_NOCHECK))
+ protect = 3 << 5; /* Disable target PI checking */
+ else
+ protect = 1 << 5; /* Enable target PI checking */
}
scsi_set_prot_op(scmd, prot_op);
scsi_set_prot_type(scmd, dif);
+ scmd->prot_flags &= sd_prot_flag_mask(prot_op); /* drop every flag accumulated above that the mask for this prot_op does not include */
+
+ return protect;
}
static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
sector_t block = blk_rq_pos(rq);
sector_t threshold;
unsigned int this_count = blk_rq_sectors(rq);
- int ret, host_dif;
+ unsigned int dif, dix;
+ int ret;
unsigned char protect;
ret = scsi_init_io(SCpnt, GFP_ATOMIC);
SCpnt->cmnd[0] = WRITE_6;
if (blk_integrity_rq(rq))
- sd_dif_prepare(rq, block, sdp->sector_size);
+ sd_dif_prepare(SCpnt);
} else if (rq_data_dir(rq) == READ) {
SCpnt->cmnd[0] = READ_6;
"writing" : "reading", this_count,
blk_rq_sectors(rq)));
- /* Set RDPROTECT/WRPROTECT if disk is formatted with DIF */
- host_dif = scsi_host_dif_capable(sdp->host, sdkp->protection_type);
- if (host_dif)
- protect = 1 << 5;
+ dix = scsi_prot_sg_count(SCpnt);
+ dif = scsi_host_dif_capable(SCpnt->device->host, sdkp->protection_type);
+
+ if (dif || dix)
+ protect = sd_setup_protect_cmnd(SCpnt, dix, dif);
else
protect = 0;
- if (host_dif == SD_DIF_TYPE2_PROTECTION) {
+ if (protect && sdkp->protection_type == SD_DIF_TYPE2_PROTECTION) {
SCpnt->cmnd = mempool_alloc(sd_cdb_pool, GFP_ATOMIC);
if (unlikely(SCpnt->cmnd == NULL)) {
}
SCpnt->sdb.length = this_count * sdp->sector_size;
- /* If DIF or DIX is enabled, tell HBA how to handle request */
- if (host_dif || scsi_prot_sg_count(SCpnt))
- sd_prot_op(SCpnt, host_dif);
-
/*
* We shouldn't disconnect in the middle of a sector, so with a dumb
* host adapter, it's safe to assume that we can at least transfer
sdkp->DPOFUA = 0;
}
+ /* No cache flush allowed for write protected devices */
+ if (sdkp->WCE && sdkp->write_prot)
+ sdkp->WCE = 0;
+
if (sdkp->first_scan || old_wce != sdkp->WCE ||
old_rcd != sdkp->RCD || old_dpofua != sdkp->DPOFUA)
sd_printk(KERN_NOTICE, sdkp,
int index;
int error;
+ scsi_autopm_get_device(sdp);
error = -ENODEV;
if (sdp->type != TYPE_DISK && sdp->type != TYPE_MOD && sdp->type != TYPE_RBC)
goto out;
out_free:
kfree(sdkp);
out:
+ scsi_autopm_put_device(sdp);
return error;
}
req = blk_get_request(SRpnt->stp->device->request_queue, write,
GFP_KERNEL);
- if (!req)
+ if (IS_ERR(req))
return DRIVER_ERROR << 24;
blk_rq_set_block_pc(req);
return -ENODEV;
}
+ scsi_autopm_get_device(SDp);
i = queue_max_segments(SDp->request_queue);
if (st_max_sg_segs < i)
i = st_max_sg_segs;
out_buffer_free:
kfree(buffer);
out:
+ scsi_autopm_put_device(SDp);
return -ENODEV;
};
EXPORT_SYMBOL(I_BDEV);
/*
- * Move the inode from its current bdi to a new bdi. If the inode is dirty we
- * need to move it onto the dirty list of @dst so that the inode is always on
- * the right list.
+ * Move the inode from its current bdi to a new bdi. Make sure the inode
+ * is clean before moving so that it doesn't linger on the old bdi.
*/
static void bdev_inode_switch_bdi(struct inode *inode,
struct backing_dev_info *dst)
{
- struct backing_dev_info *old = inode->i_data.backing_dev_info;
- bool wakeup_bdi = false;
-
- if (unlikely(dst == old)) /* deadlock avoidance */
- return;
- bdi_lock_two(&old->wb, &dst->wb);
- spin_lock(&inode->i_lock);
- inode->i_data.backing_dev_info = dst;
- if (inode->i_state & I_DIRTY) {
- if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb))
- wakeup_bdi = true;
- list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+ while (true) { /* loop until the inode is seen without I_DIRTY while i_lock is held */
+ spin_lock(&inode->i_lock);
+ if (!(inode->i_state & I_DIRTY)) {
+ inode->i_data.backing_dev_info = dst; /* the switch itself happens under i_lock */
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+ spin_unlock(&inode->i_lock); /* i_lock is released before calling write_inode_now() */
+ WARN_ON_ONCE(write_inode_now(inode, true)); /* still dirty: write it out, warn once on a non-zero result, then re-check */
}
- spin_unlock(&inode->i_lock);
- spin_unlock(&old->wb.list_lock);
- spin_unlock(&dst->wb.list_lock);
-
- if (wakeup_bdi)
- bdi_wakeup_thread_delayed(dst);
}
/* Kill _all_ buffers and pagecache , dirty or not.. */
return block_read_full_page(page, blkdev_get_block);
}
+static int blkdev_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{ /* .readpages handler in def_blk_aops; @file is not used */
+ return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block); /* forwards to mpage_readpages() with blkdev_get_block as its last argument and returns its result */
+}
+
static int blkdev_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
bdi = blk_get_backing_dev_info(bdev);
- if (bdi == NULL)
- bdi = &default_backing_dev_info;
bdev_inode_switch_bdi(bdev->bd_inode, bdi);
}
static const struct address_space_operations def_blk_aops = {
.readpage = blkdev_readpage,
+ .readpages = blkdev_readpages,
.writepage = blkdev_writepage,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
static void btrfs_error_commit_super(struct btrfs_root *root);
/*
- * end_io_wq structs are used to do processing in task context when an IO is
- * complete. This is used during reads to verify checksums, and it is used
+ * btrfs_end_io_wq structs are used to do processing in task context when an IO
+ * is complete. This is used during reads to verify checksums, and it is used
* by writes to insert metadata for new file extents after IO is complete.
*/
-struct end_io_wq {
+struct btrfs_end_io_wq {
struct bio *bio; /* bio that end_workqueue_fn() finishes via bio_endio_nodec() */
bio_end_io_t *end_io; /* written back into bio->bi_end_io by end_workqueue_fn() */
void *private; /* written back into bio->bi_private by end_workqueue_fn() */
struct btrfs_fs_info *info;
int error; /* passed to bio_endio_nodec() by end_workqueue_fn() */
- int metadata;
+ enum btrfs_wq_endio_type metadata; /* compared against BTRFS_WQ_ENDIO_* in end_workqueue_bio() to pick the workqueue */
struct list_head list;
struct btrfs_work work; /* handed to btrfs_queue_work(); end_workqueue_fn() recovers the struct from it with container_of() */
};
+static struct kmem_cache *btrfs_end_io_wq_cache;
+
+/*
+ * Create the "btrfs_end_io_wq" slab cache, sized for one
+ * struct btrfs_end_io_wq per object.
+ *
+ * Returns 0 on success, -ENOMEM if kmem_cache_create() returned NULL.
+ */
+int __init btrfs_end_io_wq_init(void)
+{
+ btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
+ sizeof(struct btrfs_end_io_wq),
+ 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+
+ return btrfs_end_io_wq_cache ? 0 : -ENOMEM;
+}
+
+/* Destroy the btrfs_end_io_wq slab cache; does nothing if it was never created. */
+void btrfs_end_io_wq_exit(void)
+{
+ if (!btrfs_end_io_wq_cache)
+ return;
+
+ kmem_cache_destroy(btrfs_end_io_wq_cache);
+}
+
/*
* async submit bios are used to offload expensive checksumming
* onto the worker threads. They checksum file and metadata bios
{
struct extent_state *cached_state = NULL;
int ret;
- bool need_lock = (current->journal_info ==
- (void *)BTRFS_SEND_TRANS_STUB);
+ bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
return 0;
ret = 0;
goto out;
}
- printk_ratelimited("parent transid verify failed on %llu wanted %llu "
- "found %llu\n",
- eb->start, parent_transid, btrfs_header_generation(eb));
+ printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
+ eb->fs_info->sb->s_id, eb->start,
+ parent_transid, btrfs_header_generation(eb));
ret = 1;
/*
goto err;
eb->read_mirror = mirror;
- if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
+ if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
ret = -EIO;
goto err;
}
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
- printk_ratelimited(KERN_INFO "BTRFS: bad tree block start "
+ printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start "
"%llu %llu\n",
- found_start, eb->start);
+ eb->fs_info->sb->s_id, found_start, eb->start);
ret = -EIO;
goto err;
}
if (check_tree_block_fsid(root, eb)) {
- printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n",
- eb->start);
+ printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n",
+ eb->fs_info->sb->s_id, eb->start);
ret = -EIO;
goto err;
}
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
eb = (struct extent_buffer *)page->private;
- set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = failed_mirror;
atomic_dec(&eb->io_pages);
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
static void end_workqueue_bio(struct bio *bio, int err)
{
- struct end_io_wq *end_io_wq = bio->bi_private;
+ struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
struct btrfs_fs_info *fs_info;
struct btrfs_workqueue *wq;
btrfs_work_func_t func;
func = btrfs_endio_write_helper;
}
} else {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+ if (unlikely(end_io_wq->metadata ==
+ BTRFS_WQ_ENDIO_DIO_REPAIR)) {
+ wq = fs_info->endio_repair_workers;
+ func = btrfs_endio_repair_helper;
+ } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
wq = fs_info->endio_raid56_workers;
func = btrfs_endio_raid56_helper;
} else if (end_io_wq->metadata) {
btrfs_queue_work(wq, &end_io_wq->work);
}
-/*
- * For the metadata arg you want
- *
- * 0 - if data
- * 1 - if normal metadta
- * 2 - if writing to the free space cache area
- * 3 - raid parity work
- */
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- int metadata)
+ enum btrfs_wq_endio_type metadata)
{
- struct end_io_wq *end_io_wq;
- end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
+ struct btrfs_end_io_wq *end_io_wq;
+
+ end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
if (!end_io_wq)
return -ENOMEM;
* can happen in the async kernel threads
*/
ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
- bio, 1);
+ bio, BTRFS_WQ_ENDIO_METADATA);
if (ret)
goto out_w_error;
ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
.set_page_dirty = btree_set_page_dirty,
};
-int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
- u64 parent_transid)
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
{
struct extent_buffer *buf = NULL;
struct inode *btree_inode = root->fs_info->btree_inode;
- int ret = 0;
buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
if (!buf)
- return 0;
+ return; /* no extent buffer was returned, so there is nothing to read */
read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
buf, 0, WAIT_NONE, btree_get_extent, 0); /* called with WAIT_NONE; its return value is not checked */
free_extent_buffer(buf);
- return ret;
}
int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
}
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
- u64 bytenr, u32 blocksize)
+ u64 bytenr)
{
return find_extent_buffer(root->fs_info, bytenr); /* @root is only used to reach its fs_info; the result of find_extent_buffer() is returned as-is */
}
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize)
{
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ if (btrfs_test_is_dummy_root(root)) /* roots reported as dummy get their buffer from alloc_test_extent_buffer() */
return alloc_test_extent_buffer(root->fs_info, bytenr,
blocksize);
-#endif
return alloc_extent_buffer(root->fs_info, bytenr, blocksize); /* all other roots use alloc_extent_buffer() */
}
}
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
- u32 blocksize, u64 parent_transid)
+ u64 parent_transid)
{
struct extent_buffer *buf = NULL;
int ret;
- buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize);
if (!buf)
return NULL;
if (!writers)
return ERR_PTR(-ENOMEM);
- ret = percpu_counter_init(&writers->counter, 0);
+ ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
if (ret < 0) {
kfree(writers);
return ERR_PTR(ret);
kfree(writers);
}
-static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
- u32 stripesize, struct btrfs_root *root,
- struct btrfs_fs_info *fs_info,
+static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
+ struct btrfs_root *root, struct btrfs_fs_info *fs_info,
u64 objectid)
{
root->node = NULL;
root->commit_root = NULL;
root->sectorsize = sectorsize;
root->nodesize = nodesize;
- root->leafsize = leafsize;
root->stripesize = stripesize;
root->state = 0;
root->orphan_cleanup_state = 0;
root = btrfs_alloc_root(NULL);
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(4096, 4096, 4096, 4096, root, NULL, 1);
+ __setup_root(4096, 4096, 4096, root, NULL, 1);
set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
root->alloc_bytenr = 0;
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(tree_root->nodesize, tree_root->leafsize,
- tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, objectid);
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, root, fs_info, objectid);
root->root_key.objectid = objectid;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = 0;
- leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
- 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(tree_root->nodesize, tree_root->leafsize,
- tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, root, fs_info,
+ BTRFS_TREE_LOG_OBJECTID);
root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
* updated (along with back refs to the log tree).
*/
- leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
- BTRFS_TREE_LOG_OBJECTID, NULL,
- 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
+ NULL, 0, 0, 0);
if (IS_ERR(leaf)) {
kfree(root);
return ERR_CAST(leaf);
btrfs_set_stack_inode_generation(inode_item, 1);
btrfs_set_stack_inode_size(inode_item, 3);
btrfs_set_stack_inode_nlink(inode_item, 1);
- btrfs_set_stack_inode_nbytes(inode_item, root->leafsize);
+ btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
btrfs_set_root_node(&log_root->root_item, log_root->node);
struct btrfs_fs_info *fs_info = tree_root->fs_info;
struct btrfs_path *path;
u64 generation;
- u32 blocksize;
int ret;
path = btrfs_alloc_path();
goto alloc_fail;
}
- __setup_root(tree_root->nodesize, tree_root->leafsize,
- tree_root->sectorsize, tree_root->stripesize,
- root, fs_info, key->objectid);
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, root, fs_info, key->objectid);
ret = btrfs_find_root(tree_root, key, path,
&root->root_item, &root->root_key);
}
generation = btrfs_root_generation(&root->root_item);
- blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
- blocksize, generation);
+ generation);
if (!root->node) {
ret = -ENOMEM;
goto find_fail;
root->subv_writers = writers;
btrfs_init_free_ino_ctl(root);
- spin_lock_init(&root->cache_lock);
- init_waitqueue_head(&root->cache_wait);
+ spin_lock_init(&root->ino_cache_lock);
+ init_waitqueue_head(&root->ino_cache_wait);
ret = get_anon_bdev(&root->anon_dev);
if (ret)
if (!device->bdev)
continue;
bdi = blk_get_backing_dev_info(device->bdev);
- if (bdi && bdi_congested(bdi, bdi_bits)) {
+ if (bdi_congested(bdi, bdi_bits)) {
ret = 1;
break;
}
return ret;
}
-/*
- * If this fails, caller must call bdi_destroy() to get rid of the
- * bdi again.
- */
static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
{
int err;
static void end_workqueue_fn(struct btrfs_work *work)
{
struct bio *bio;
- struct end_io_wq *end_io_wq;
+ struct btrfs_end_io_wq *end_io_wq;
int error;
- end_io_wq = container_of(work, struct end_io_wq, work);
+ end_io_wq = container_of(work, struct btrfs_end_io_wq, work); /* @work is the member embedded in the btrfs_end_io_wq */
bio = end_io_wq->bio;
error = end_io_wq->error;
bio->bi_private = end_io_wq->private; /* put the values stored in end_io_wq back on the bio */
bio->bi_end_io = end_io_wq->end_io;
- kfree(end_io_wq);
+ kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); /* bio and error were copied to locals above, so the struct can be freed before the bio is completed */
bio_endio_nodec(bio, error);
}
}
btrfs_run_delayed_iputs(root);
+ btrfs_delete_unused_bgs(root->fs_info);
again = btrfs_clean_one_deleted_snapshot(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
btrfs_destroy_workqueue(fs_info->endio_workers);
btrfs_destroy_workqueue(fs_info->endio_meta_workers);
btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+ btrfs_destroy_workqueue(fs_info->endio_repair_workers);
btrfs_destroy_workqueue(fs_info->rmw_workers);
btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
{
u32 sectorsize;
u32 nodesize;
- u32 leafsize;
- u32 blocksize;
u32 stripesize;
u64 generation;
u64 features;
goto fail_srcu;
}
- ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
+ ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret) {
err = ret;
goto fail_bdi;
fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
(1 + ilog2(nr_cpu_ids));
- ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
+ ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
if (ret) {
err = ret;
goto fail_dirty_metadata_bytes;
}
- ret = percpu_counter_init(&fs_info->bio_counter, 0);
+ ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
if (ret) {
err = ret;
goto fail_delalloc_bytes;
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->qgroup_op_lock);
spin_lock_init(&fs_info->buffer_lock);
+ spin_lock_init(&fs_info->unused_bgs_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+ INIT_LIST_HEAD(&fs_info->unused_bgs);
btrfs_mapping_init(&fs_info->mapping_tree);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
BTRFS_BLOCK_RSV_GLOBAL);
atomic_set(&fs_info->qgroup_op_seq, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb;
- fs_info->max_inline = 8192 * 1024;
+ fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
fs_info->free_chunk_space = 0;
goto fail_alloc;
}
- __setup_root(4096, 4096, 4096, 4096, tree_root,
+ __setup_root(4096, 4096, 4096, tree_root,
fs_info, BTRFS_ROOT_TREE_OBJECTID);
invalidate_bdev(fs_devices->latest_bdev);
goto fail_alloc;
}
- if (btrfs_super_leafsize(disk_super) !=
+ /*
+ * Leafsize and nodesize were always equal, this is only a sanity check.
+ */
+ if (le32_to_cpu(disk_super->__unused_leafsize) !=
btrfs_super_nodesize(disk_super)) {
printk(KERN_ERR "BTRFS: couldn't mount because metadata "
"blocksizes don't match. node %d leaf %d\n",
btrfs_super_nodesize(disk_super),
- btrfs_super_leafsize(disk_super));
+ le32_to_cpu(disk_super->__unused_leafsize));
err = -EINVAL;
goto fail_alloc;
}
- if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
printk(KERN_ERR "BTRFS: couldn't mount because metadata "
"blocksize (%d) was too large\n",
- btrfs_super_leafsize(disk_super));
+ btrfs_super_nodesize(disk_super));
err = -EINVAL;
goto fail_alloc;
}
* flag our filesystem as having big metadata blocks if
* they are bigger than the page size
*/
- if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
+ if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) {
if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
}
nodesize = btrfs_super_nodesize(disk_super);
- leafsize = btrfs_super_leafsize(disk_super);
sectorsize = btrfs_super_sectorsize(disk_super);
stripesize = btrfs_super_stripesize(disk_super);
- fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
+ fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
/*
* extent buffers for the same range. It leads to corruptions
*/
if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
- (sectorsize != leafsize)) {
+ (sectorsize != nodesize)) {
printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes "
"are not allowed for mixed block groups on %s\n",
sb->s_id);
btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
fs_info->endio_raid56_workers =
btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+ fs_info->endio_repair_workers =
+ btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
fs_info->rmw_workers =
btrfs_alloc_workqueue("rmw", flags, max_active, 2);
fs_info->endio_write_workers =
fs_info->submit_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
fs_info->endio_meta_write_workers &&
+ fs_info->endio_repair_workers &&
fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->readahead_workers &&
fs_info->fixup_workers && fs_info->delayed_workers &&
- fs_info->fixup_workers && fs_info->extent_workers &&
+ fs_info->extent_workers &&
fs_info->qgroup_rescan_workers)) {
err = -ENOMEM;
goto fail_sb_buffer;
4 * 1024 * 1024 / PAGE_CACHE_SIZE);
tree_root->nodesize = nodesize;
- tree_root->leafsize = leafsize;
tree_root->sectorsize = sectorsize;
tree_root->stripesize = stripesize;
goto fail_sb_buffer;
}
- blocksize = btrfs_level_size(tree_root,
- btrfs_super_chunk_root_level(disk_super));
generation = btrfs_super_chunk_root_generation(disk_super);
- __setup_root(nodesize, leafsize, sectorsize, stripesize,
- chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+ __setup_root(nodesize, sectorsize, stripesize, chunk_root,
+ fs_info, BTRFS_CHUNK_TREE_OBJECTID);
chunk_root->node = read_tree_block(chunk_root,
btrfs_super_chunk_root(disk_super),
- blocksize, generation);
+ generation);
if (!chunk_root->node ||
!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n",
}
retry_root_backup:
- blocksize = btrfs_level_size(tree_root,
- btrfs_super_root_level(disk_super));
generation = btrfs_super_generation(disk_super);
tree_root->node = read_tree_block(tree_root,
btrfs_super_root(disk_super),
- blocksize, generation);
+ generation);
if (!tree_root->node ||
!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
err = -EIO;
goto fail_qgroup;
}
- blocksize =
- btrfs_level_size(tree_root,
- btrfs_super_log_root_level(disk_super));
log_tree_root = btrfs_alloc_root(fs_info);
if (!log_tree_root) {
goto fail_qgroup;
}
- __setup_root(nodesize, leafsize, sectorsize, stripesize,
+ __setup_root(nodesize, sectorsize, stripesize,
log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
log_tree_root->node = read_tree_block(tree_root, bytenr,
- blocksize,
generation + 1);
if (!log_tree_root->node ||
!extent_buffer_uptodate(log_tree_root->node)) {
fs_info->update_uuid_tree_gen = 1;
}
+ fs_info->open = 1;
+
return 0;
fail_qgroup:
for (i = 0; i < max_mirrors; i++) {
bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+ device->commit_total_bytes)
break;
if (wait) {
btrfs_set_stack_device_type(dev_item, dev->type);
btrfs_set_stack_device_id(dev_item, dev->devid);
btrfs_set_stack_device_total_bytes(dev_item,
- dev->disk_total_bytes);
- btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
+ dev->commit_total_bytes);
+ btrfs_set_stack_device_bytes_used(dev_item,
+ dev->commit_bytes_used);
btrfs_set_stack_device_io_align(dev_item, dev->io_align);
btrfs_set_stack_device_io_width(dev_item, dev->io_width);
btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
static void free_fs_root(struct btrfs_root *root)
{
- iput(root->cache_inode);
+ iput(root->ino_cache_inode);
WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
btrfs_free_block_rsv(root, root->orphan_block_rsv);
root->orphan_block_rsv = NULL;
return btrfs_commit_transaction(trans, root);
}
-int close_ctree(struct btrfs_root *root)
+void close_ctree(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
btrfs_stop_all_workers(fs_info);
+ fs_info->open = 0;
free_root_pointers(fs_info, 1);
iput(fs_info->btree_inode);
btrfs_free_block_rsv(root, root->orphan_block_rsv);
root->orphan_block_rsv = NULL;
-
- return 0;
}
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only)
{
+ struct btrfs_super_block *sb = fs_info->super_copy;
+ int ret = 0;
+
+ if (sb->root_level > BTRFS_MAX_LEVEL) {
+ printk(KERN_ERR "BTRFS: tree_root level too big: %d > %d\n",
+ sb->root_level, BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+ if (sb->chunk_root_level > BTRFS_MAX_LEVEL) {
+ printk(KERN_ERR "BTRFS: chunk_root level too big: %d > %d\n",
+ sb->chunk_root_level, BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+ if (sb->log_root_level > BTRFS_MAX_LEVEL) {
+ printk(KERN_ERR "BTRFS: log_root level too big: %d > %d\n",
+ sb->log_root_level, BTRFS_MAX_LEVEL);
+ ret = -EINVAL;
+ }
+
/*
- * Placeholder for checks
+ * The common minimum, we don't know if we can trust the nodesize/sectorsize
+ * items yet, they'll be verified later. Issue just a warning.
*/
- return 0;
+ if (!IS_ALIGNED(sb->root, 4096))
+ printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+ sb->root);
+ if (!IS_ALIGNED(sb->chunk_root, 4096))
+ printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+ sb->chunk_root);
+ if (!IS_ALIGNED(sb->log_root, 4096))
+ printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+ sb->log_root);
+
+ if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
+ printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
+ fs_info->fsid, sb->dev_item.fsid);
+ ret = -EINVAL;
+ }
+
+ /*
+ * Hint to catch really bogus numbers, bitflips or so, more exact checks are
+ * done later
+ */
+ if (sb->num_devices > (1UL << 31))
+ printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
+ sb->num_devices);
+
+ if (sb->bytenr != BTRFS_SUPER_INFO_OFFSET) {
+ printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
+ sb->bytenr, BTRFS_SUPER_INFO_OFFSET);
+ ret = -EINVAL;
+ }
+
+ /*
+ * The generation is a global counter, we'll trust it more than the others
+ * but it's still possible that it's the one that's wrong.
+ */
+ if (sb->generation < sb->chunk_root_generation)
+ printk(KERN_WARNING
+ "BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n",
+ sb->generation, sb->chunk_root_generation);
+ if (sb->generation < sb->cache_generation && sb->cache_generation != (u64)-1)
+ printk(KERN_WARNING
+ "BTRFS: suspicious: generation < cache_generation: %llu < %llu\n",
+ sb->generation, sb->cache_generation);
+
+ return ret;
}
static void btrfs_error_commit_super(struct btrfs_root *root)
clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
while (start <= end) {
- eb = btrfs_find_tree_block(root, start,
- root->leafsize);
- start += root->leafsize;
+ eb = btrfs_find_tree_block(root, start);
+ start += root->nodesize;
if (!eb)
continue;
wait_on_extent_buffer_writeback(eb);
return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
}
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
/*
* nfs_direct_cmp_commit_data_verf - compare verifier for commit data
* @dreq - direct request possibly spanning multiple servers
WARN_ON_ONCE(verfp->committed < 0);
return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
}
-#endif
/**
* nfs_direct_IO - NFS address space operation for direct I/O
#else
VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
- if (rw == READ || rw == KERNEL_READ)
- return nfs_file_direct_read(iocb, iter, pos,
- rw == READ ? true : false);
- return nfs_file_direct_write(iocb, iter, pos,
- rw == WRITE ? true : false);
+ if (rw == READ)
+ return nfs_file_direct_read(iocb, iter, pos);
+ return nfs_file_direct_write(iocb, iter, pos);
#endif /* CONFIG_NFS_SWAP */
}
* cache.
*/
ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
- loff_t pos, bool uio)
+ loff_t pos)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
return result;
}
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
struct nfs_pageio_descriptor desc;
schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
}
-#else
-static void nfs_direct_write_schedule_work(struct work_struct *work)
-{
-}
-
-static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
-{
- nfs_direct_complete(dreq, true);
-}
-#endif
-
static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
{
struct nfs_direct_req *dreq = hdr->dreq;
* is no atomic O_APPEND write facility in the NFS protocol.
*/
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
- loff_t pos, bool uio)
+ loff_t pos)
{
ssize_t result = -EINVAL;
struct file *file = iocb->ki_filp;
#include "internal.h"
#include "iostat.h"
#include "fscache.h"
+#include "pnfs.h"
#include "nfstrace.h"
ssize_t result;
if (iocb->ki_filp->f_flags & O_DIRECT)
- return nfs_file_direct_read(iocb, to, iocb->ki_pos, true);
+ return nfs_file_direct_read(iocb, to, iocb->ki_pos);
dprintk("NFS: read(%pD2, %zu@%lu)\n",
iocb->ki_filp,
unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
unsigned int end = offset + len;
+ if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
+ if (!PageUptodate(page))
+ return 1;
+ return 0;
+ }
+
if ((file->f_mode & FMODE_READ) && /* open for read? */
!PageUptodate(page) && /* Uptodate? */
!PagePrivate(page) && /* i/o request already? */
dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
- /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not
- * doing this memory reclaim for a fs-related allocation.
+ /* Always try to initiate a 'commit' if relevant, but only
+ * wait for it if __GFP_WAIT is set. Even then, only wait 1
+ * second and only if the 'bdi' is not congested.
+ * Waiting indefinitely can cause deadlocks when the NFS
+ * server is on this machine, when a new TCP connection is
+ * needed and in other rare cases. There is no particular
+ * need to wait extensively here. A short wait has the
+ * benefit that someone else can worry about the freezer.
*/
- if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL &&
- !(current->flags & PF_FSTRANS)) {
- int how = FLUSH_SYNC;
-
- /* Don't let kswapd deadlock waiting for OOM RPC calls */
- if (current_is_kswapd())
- how = 0;
- nfs_commit_inode(mapping->host, how);
+ if (mapping) {
+ struct nfs_server *nfss = NFS_SERVER(mapping->host);
+ nfs_commit_inode(mapping->host, 0);
+ if ((gfp & __GFP_WAIT) &&
+ !bdi_write_congested(&nfss->backing_dev_info)) {
+ wait_on_page_bit_killable_timeout(page, PG_private,
+ HZ);
+ if (PagePrivate(page))
+ set_bdi_congested(&nfss->backing_dev_info,
+ BLK_RW_ASYNC);
+ }
}
/* If PagePrivate() is set, then the page is not freeable */
if (PagePrivate(page))
static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
sector_t *span)
{
+ int ret;
+ struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
+
*span = sis->pages;
- return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1);
+
+ rcu_read_lock();
+ ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
+ rcu_read_unlock();
+
+ return ret;
}
static void nfs_swap_deactivate(struct file *file)
{
- xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0);
+ struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
+
+ rcu_read_lock();
+ xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
+ rcu_read_unlock();
}
#endif
return result;
if (file->f_flags & O_DIRECT)
- return nfs_file_direct_write(iocb, from, pos, true);
+ return nfs_file_direct_write(iocb, from, pos);
dprintk("NFS: write(%pD2, %zu@%Ld)\n",
file, count, (long long) pos);
}
EXPORT_SYMBOL_GPL(nfs_flock);
-/*
- * There is no protocol support for leases, so we have no way to implement
- * them correctly in the face of opens by other clients.
- */
-int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
-{
- dprintk("NFS: setlease(%pD2, arg=%ld)\n", file, arg);
- return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(nfs_setlease);
-
const struct file_operations nfs_file_operations = {
.llseek = nfs_file_llseek,
.read = new_sync_read,
.splice_read = nfs_file_splice_read,
.splice_write = iter_file_splice_write,
.check_flags = nfs_check_flags,
- .setlease = nfs_setlease,
+ .setlease = simple_nosetlease,
};
EXPORT_SYMBOL_GPL(nfs_file_operations);
bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
- xfs_buf_iorequest(bp);
- if (flags & XBF_ASYNC)
+ if (flags & XBF_ASYNC) {
+ xfs_buf_submit(bp);
return 0;
- return xfs_buf_iowait(bp);
+ }
+ return xfs_buf_submit_wait(bp);
}
xfs_buf_t *
* Read an uncached buffer from disk. Allocates and returns a locked
* buffer containing the disk contents or nothing.
*/
-struct xfs_buf *
+int
xfs_buf_read_uncached(
struct xfs_buftarg *target,
xfs_daddr_t daddr,
size_t numblks,
int flags,
+ struct xfs_buf **bpp,
const struct xfs_buf_ops *ops)
{
struct xfs_buf *bp;
+ *bpp = NULL;
+
bp = xfs_buf_get_uncached(target, numblks, flags);
if (!bp)
- return NULL;
+ return -ENOMEM;
/* set up the buffer for a read IO */
ASSERT(bp->b_map_count == 1);
- bp->b_bn = daddr;
+ bp->b_bn = XFS_BUF_DADDR_NULL; /* always null for uncached buffers */
bp->b_maps[0].bm_bn = daddr;
bp->b_flags |= XBF_READ;
bp->b_ops = ops;
- if (XFS_FORCED_SHUTDOWN(target->bt_mount)) {
+ xfs_buf_submit_wait(bp);
+ if (bp->b_error) {
+ int error = bp->b_error;
xfs_buf_relse(bp);
- return NULL;
+ return error;
}
- xfs_buf_iorequest(bp);
- xfs_buf_iowait(bp);
- return bp;
+
+ *bpp = bp;
+ return 0;
}
/*
* Buffer Utility Routines
*/
-STATIC void
-xfs_buf_iodone_work(
- struct work_struct *work)
+void
+xfs_buf_ioend(
+ struct xfs_buf *bp)
{
- struct xfs_buf *bp =
- container_of(work, xfs_buf_t, b_iodone_work);
- bool read = !!(bp->b_flags & XBF_READ);
+ bool read = bp->b_flags & XBF_READ;
+
+ trace_xfs_buf_iodone(bp, _RET_IP_);
bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
- /* only validate buffers that were read without errors */
- if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE))
+ /*
+ * Pull in IO completion errors now. We are guaranteed to be running
+ * single threaded, so we don't need the lock to read b_io_error.
+ */
+ if (!bp->b_error && bp->b_io_error)
+ xfs_buf_ioerror(bp, bp->b_io_error);
+
+ /* Only validate buffers that were read without errors */
+ if (read && !bp->b_error && bp->b_ops) {
+ ASSERT(!bp->b_iodone);
bp->b_ops->verify_read(bp);
+ }
+
+ if (!bp->b_error)
+ bp->b_flags |= XBF_DONE;
if (bp->b_iodone)
(*(bp->b_iodone))(bp);
else if (bp->b_flags & XBF_ASYNC)
xfs_buf_relse(bp);
- else {
- ASSERT(read && bp->b_ops);
+ else
complete(&bp->b_iowait);
- }
}
-void
-xfs_buf_ioend(
- struct xfs_buf *bp,
- int schedule)
+static void
+xfs_buf_ioend_work(
+ struct work_struct *work)
{
- bool read = !!(bp->b_flags & XBF_READ);
-
- trace_xfs_buf_iodone(bp, _RET_IP_);
+ struct xfs_buf *bp =
+ container_of(work, xfs_buf_t, b_iodone_work);
- if (bp->b_error == 0)
- bp->b_flags |= XBF_DONE;
+ xfs_buf_ioend(bp);
+}
- if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
- if (schedule) {
- INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
- queue_work(xfslogd_workqueue, &bp->b_iodone_work);
- } else {
- xfs_buf_iodone_work(&bp->b_iodone_work);
- }
- } else {
- bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
- complete(&bp->b_iowait);
- }
+void
+xfs_buf_ioend_async(
+ struct xfs_buf *bp)
+{
+ INIT_WORK(&bp->b_iodone_work, xfs_buf_ioend_work);
+ queue_work(xfslogd_workqueue, &bp->b_iodone_work);
}
void
(__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length);
}
-/*
- * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
- * so that the proper iodone callbacks get called.
- */
-STATIC int
-xfs_bioerror(
- xfs_buf_t *bp)
-{
-#ifdef XFSERRORDEBUG
- ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
-#endif
-
- /*
- * No need to wait until the buffer is unpinned, we aren't flushing it.
- */
- xfs_buf_ioerror(bp, -EIO);
-
- /*
- * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
- */
- XFS_BUF_UNREAD(bp);
- XFS_BUF_UNDONE(bp);
- xfs_buf_stale(bp);
-
- xfs_buf_ioend(bp, 0);
-
- return -EIO;
-}
-
-/*
- * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the xfs_buf_ioend call.
- * This is meant for userdata errors; metadata bufs come with
- * iodone functions attached, so that we can track down errors.
- */
-int
-xfs_bioerror_relse(
- struct xfs_buf *bp)
-{
- int64_t fl = bp->b_flags;
- /*
- * No need to wait until the buffer is unpinned.
- * We aren't flushing it.
- *
- * chunkhold expects B_DONE to be set, whether
- * we actually finish the I/O or not. We don't want to
- * change that interface.
- */
- XFS_BUF_UNREAD(bp);
- XFS_BUF_DONE(bp);
- xfs_buf_stale(bp);
- bp->b_iodone = NULL;
- if (!(fl & XBF_ASYNC)) {
- /*
- * Mark b_error and B_ERROR _both_.
- * Lot's of chunkcache code assumes that.
- * There's no reason to mark error for
- * ASYNC buffers.
- */
- xfs_buf_ioerror(bp, -EIO);
- complete(&bp->b_iowait);
- } else {
- xfs_buf_relse(bp);
- }
-
- return -EIO;
-}
-
-STATIC int
-xfs_bdstrat_cb(
- struct xfs_buf *bp)
-{
- if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
- trace_xfs_bdstrat_shut(bp, _RET_IP_);
- /*
- * Metadata write that didn't get logged but
- * written delayed anyway. These aren't associated
- * with a transaction, and can be ignored.
- */
- if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
- return xfs_bioerror_relse(bp);
- else
- return xfs_bioerror(bp);
- }
-
- xfs_buf_iorequest(bp);
- return 0;
-}
-
int
xfs_bwrite(
struct xfs_buf *bp)
ASSERT(xfs_buf_islocked(bp));
bp->b_flags |= XBF_WRITE;
- bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL);
+ bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
+ XBF_WRITE_FAIL | XBF_DONE);
- xfs_bdstrat_cb(bp);
-
- error = xfs_buf_iowait(bp);
+ error = xfs_buf_submit_wait(bp);
if (error) {
xfs_force_shutdown(bp->b_target->bt_mount,
SHUTDOWN_META_IO_ERROR);
}
STATIC void
-_xfs_buf_ioend(
- xfs_buf_t *bp,
- int schedule)
-{
- if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
- xfs_buf_ioend(bp, schedule);
-}
-
-STATIC void
xfs_buf_bio_end_io(
struct bio *bio,
int error)
* don't overwrite existing errors - otherwise we can lose errors on
* buffers that require multiple bios to complete.
*/
- if (!bp->b_error)
- xfs_buf_ioerror(bp, error);
+ if (error) {
+ spin_lock(&bp->b_lock);
+ if (!bp->b_io_error)
+ bp->b_io_error = error;
+ spin_unlock(&bp->b_lock);
+ }
if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
- _xfs_buf_ioend(bp, 1);
+ if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+ xfs_buf_ioend_async(bp);
bio_put(bio);
}
} else {
/*
* This is guaranteed not to be the last io reference count
- * because the caller (xfs_buf_iorequest) holds a count itself.
+ * because the caller (xfs_buf_submit) holds a count itself.
*/
atomic_dec(&bp->b_io_remaining);
xfs_buf_ioerror(bp, -EIO);
blk_finish_plug(&plug);
}
+/*
+ * Asynchronous IO submission path. This transfers the buffer lock ownership and
+ * the current reference to the IO. It is not safe to reference the buffer after
+ * a call to this function unless the caller holds an additional reference
+ * itself.
+ */
void
-xfs_buf_iorequest(
- xfs_buf_t *bp)
+xfs_buf_submit(
+ struct xfs_buf *bp)
{
- trace_xfs_buf_iorequest(bp, _RET_IP_);
+ trace_xfs_buf_submit(bp, _RET_IP_);
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+ ASSERT(bp->b_flags & XBF_ASYNC);
+
+ /* on shutdown we stale and complete the buffer immediately */
+ if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+ xfs_buf_ioerror(bp, -EIO);
+ bp->b_flags &= ~XBF_DONE;
+ xfs_buf_stale(bp);
+ xfs_buf_ioend(bp);
+ return;
+ }
if (bp->b_flags & XBF_WRITE)
xfs_buf_wait_unpin(bp);
+
+ /* clear the internal error state to avoid spurious errors */
+ bp->b_io_error = 0;
+
+ /*
+ * The caller's reference is released during I/O completion.
+ * This occurs some time after the last b_io_remaining reference is
+ * released, so after we drop our IO reference we have to have some
+ * other reference to ensure the buffer doesn't go away from underneath
+ * us. Take a direct reference to ensure we have safe access to the
+ * buffer until we are finished with it.
+ */
xfs_buf_hold(bp);
/*
- * Set the count to 1 initially, this will stop an I/O
- * completion callout which happens before we have started
- * all the I/O from calling xfs_buf_ioend too early.
+ * Set the count to 1 initially, this will stop an I/O completion
+ * callout which happens before we have started all the I/O from calling
+ * xfs_buf_ioend too early.
*/
atomic_set(&bp->b_io_remaining, 1);
_xfs_buf_ioapply(bp);
+
/*
- * If _xfs_buf_ioapply failed, we'll get back here with
- * only the reference we took above. _xfs_buf_ioend will
- * drop it to zero, so we'd better not queue it for later,
- * or we'll free it before it's done.
+ * If _xfs_buf_ioapply failed, we can get back here with only the IO
+ * reference we took above. If we drop it to zero, run completion so
+ * that we don't return to the caller with completion still pending.
*/
- _xfs_buf_ioend(bp, bp->b_error ? 0 : 1);
+ if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
+ if (bp->b_error)
+ xfs_buf_ioend(bp);
+ else
+ xfs_buf_ioend_async(bp);
+ }
xfs_buf_rele(bp);
+ /* Note: it is not safe to reference bp now we've dropped our ref */
}
/*
- * Waits for I/O to complete on the buffer supplied. It returns immediately if
- * no I/O is pending or there is already a pending error on the buffer, in which
- * case nothing will ever complete. It returns the I/O error code, if any, or
- * 0 if there was no error.
+ * Synchronous buffer IO submission path, read or write.
*/
int
-xfs_buf_iowait(
- xfs_buf_t *bp)
+xfs_buf_submit_wait(
+ struct xfs_buf *bp)
{
- trace_xfs_buf_iowait(bp, _RET_IP_);
+ int error;
- if (!bp->b_error)
- wait_for_completion(&bp->b_iowait);
+ trace_xfs_buf_submit_wait(bp, _RET_IP_);
+
+ ASSERT(!(bp->b_flags & (_XBF_DELWRI_Q | XBF_ASYNC)));
+
+ if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+ xfs_buf_ioerror(bp, -EIO);
+ xfs_buf_stale(bp);
+ bp->b_flags &= ~XBF_DONE;
+ return -EIO;
+ }
+
+ if (bp->b_flags & XBF_WRITE)
+ xfs_buf_wait_unpin(bp);
+
+ /* clear the internal error state to avoid spurious errors */
+ bp->b_io_error = 0;
+
+ /*
+ * For synchronous IO, the IO does not inherit the submitter's reference
+ * count, nor the buffer lock. Hence we cannot release the reference we
+ * are about to take until we've waited for all IO completion to occur,
+ * including any xfs_buf_ioend_async() work that may be pending.
+ */
+ xfs_buf_hold(bp);
+
+ /*
+ * Set the count to 1 initially, this will stop an I/O completion
+ * callout which happens before we have started all the I/O from calling
+ * xfs_buf_ioend too early.
+ */
+ atomic_set(&bp->b_io_remaining, 1);
+ _xfs_buf_ioapply(bp);
+
+ /*
+ * make sure we run completion synchronously if it raced with us and is
+ * already complete.
+ */
+ if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+ xfs_buf_ioend(bp);
+ /* wait for completion before gathering the error from the buffer */
+ trace_xfs_buf_iowait(bp, _RET_IP_);
+ wait_for_completion(&bp->b_iowait);
trace_xfs_buf_iowait_done(bp, _RET_IP_);
- return bp->b_error;
+ error = bp->b_error;
+
+ /*
+ * all done now, we can release the hold that keeps the buffer
+ * referenced for the entire IO.
+ */
+ xfs_buf_rele(bp);
+ return error;
}
xfs_caddr_t
btp->bt_dev = bdev->bd_dev;
btp->bt_bdev = bdev;
btp->bt_bdi = blk_get_backing_dev_info(bdev);
- if (!btp->bt_bdi)
- goto error;
if (xfs_setsize_buftarg_early(btp, bdev))
goto error;
blk_start_plug(&plug);
list_for_each_entry_safe(bp, n, io_list, b_list) {
bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
- bp->b_flags |= XBF_WRITE;
+ bp->b_flags |= XBF_WRITE | XBF_ASYNC;
- if (!wait) {
- bp->b_flags |= XBF_ASYNC;
+ /*
+ * we do all IO submission async. This means if we need to wait
+ * for IO completion we need to take an extra reference so the
+ * buffer is still valid on the other side.
+ */
+ if (wait)
+ xfs_buf_hold(bp);
+ else
list_del_init(&bp->b_list);
- }
- xfs_bdstrat_cb(bp);
+
+ xfs_buf_submit(bp);
}
blk_finish_plug(&plug);
bp = list_first_entry(&io_list, struct xfs_buf, b_list);
list_del_init(&bp->b_list);
- error2 = xfs_buf_iowait(bp);
+
+ /* locking the buffer will wait for async IO completion. */
+ xfs_buf_lock(bp);
+ error2 = bp->b_error;
xfs_buf_relse(bp);
if (!error)
error = error2;
goto out;
xfslogd_workqueue = alloc_workqueue("xfslogd",
- WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
+ WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 1);
if (!xfslogd_workqueue)
goto out_free_buf_zone;
#include <linux/blkdev.h>
struct blk_mq_tags;
+ struct blk_flush_queue;
struct blk_mq_cpu_notifier {
struct list_head list;
struct request_queue *queue;
unsigned int queue_num;
+ struct blk_flush_queue *fq;
void *driver_data;
struct list_head tag_list;
};
- typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
+ typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *, bool);
typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
+ typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
typedef int (init_request_fn)(void *, struct request *, unsigned int,
typedef void (exit_request_fn)(void *, struct request *, unsigned int,
unsigned int);
+ typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
+ bool);
+
struct blk_mq_ops {
/*
* Queue request
/*
* Called on request timeout
*/
- rq_timed_out_fn *timeout;
+ timeout_fn *timeout;
softirq_done_fn *complete;
/*
* Called for every command allocated by the block layer to allow
* the driver to set up driver specific data.
+ *
+ * A tag greater than or equal to queue_depth is used for setting up
+ * the flush request.
+ *
* Ditto for exit/teardown.
*/
init_request_fn *init_request;
};
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
+void blk_mq_finish_init(struct request_queue *q);
int blk_mq_register_disk(struct gendisk *);
void blk_mq_unregister_disk(struct gendisk *);
struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
- void blk_mq_end_io(struct request *rq, int error);
- void __blk_mq_end_io(struct request *rq, int error);
+ void blk_mq_start_request(struct request *rq);
+ void blk_mq_end_request(struct request *rq, int error);
+ void __blk_mq_end_request(struct request *rq, int error);
void blk_mq_requeue_request(struct request *rq);
void blk_mq_add_to_requeue_list(struct request *rq, bool at_head);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
- void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
+ void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
+ void *priv);
/*
* Driver command data is immediately after the request. So subtract request
struct sg_io_hdr;
struct bsg_job;
struct blkcg_gq;
+ struct blk_flush_queue;
#define BLKDEV_MIN_RQ 4
#define BLKDEV_MAX_RQ 128 /* Default maximum */
*/
unsigned int flush_flags;
unsigned int flush_not_queueable:1;
- unsigned int flush_queue_delayed:1;
- unsigned int flush_pending_idx:1;
- unsigned int flush_running_idx:1;
- unsigned long flush_pending_since;
- struct list_head flush_queue[2];
- struct list_head flush_data_in_flight;
- struct request *flush_rq;
- spinlock_t mq_flush_lock;
+ struct blk_flush_queue *fq;
struct list_head requeue_list;
spinlock_t requeue_lock;
static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
{
- return bdev->bd_disk->queue;
+ return bdev->bd_disk->queue; /* this is never NULL */
}
/*
static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t sector)
{
unsigned int granularity = max(lim->physical_block_size, lim->io_min);
- unsigned int alignment = (sector << 9) & (granularity - 1);
+ unsigned int alignment = sector_div(sector, granularity >> 9) << 9;
- return (granularity + lim->alignment_offset - alignment)
- & (granularity - 1);
+ return (granularity + lim->alignment_offset - alignment) % granularity;
}
static inline int bdev_alignment_offset(struct block_device *bdev)
#if defined(CONFIG_BLK_DEV_INTEGRITY)
- #define INTEGRITY_FLAG_READ 2 /* verify data integrity on read */
- #define INTEGRITY_FLAG_WRITE 4 /* generate data integrity on write */
+ enum blk_integrity_flags {
+ BLK_INTEGRITY_VERIFY = 1 << 0,
+ BLK_INTEGRITY_GENERATE = 1 << 1,
+ BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2,
+ BLK_INTEGRITY_IP_CHECKSUM = 1 << 3,
+ };
- struct blk_integrity_exchg {
+ struct blk_integrity_iter {
void *prot_buf;
void *data_buf;
- sector_t sector;
+ sector_t seed;
unsigned int data_size;
- unsigned short sector_size;
+ unsigned short interval;
const char *disk_name;
};
- typedef void (integrity_gen_fn) (struct blk_integrity_exchg *);
- typedef int (integrity_vrfy_fn) (struct blk_integrity_exchg *);
- typedef void (integrity_set_tag_fn) (void *, void *, unsigned int);
- typedef void (integrity_get_tag_fn) (void *, void *, unsigned int);
+ typedef int (integrity_processing_fn) (struct blk_integrity_iter *);
struct blk_integrity {
- integrity_gen_fn *generate_fn;
- integrity_vrfy_fn *verify_fn;
- integrity_set_tag_fn *set_tag_fn;
- integrity_get_tag_fn *get_tag_fn;
+ integrity_processing_fn *generate_fn;
+ integrity_processing_fn *verify_fn;
unsigned short flags;
unsigned short tuple_size;
- unsigned short sector_size;
+ unsigned short interval;
unsigned short tag_size;
const char *name;
extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
struct scatterlist *);
extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
- extern int blk_integrity_merge_rq(struct request_queue *, struct request *,
- struct request *);
- extern int blk_integrity_merge_bio(struct request_queue *, struct request *,
- struct bio *);
+ extern bool blk_integrity_merge_rq(struct request_queue *, struct request *,
+ struct request *);
+ extern bool blk_integrity_merge_bio(struct request_queue *, struct request *,
+ struct bio *);
static inline
struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
return disk->integrity;
}
- static inline int blk_integrity_rq(struct request *rq)
+ static inline bool blk_integrity_rq(struct request *rq)
{
- if (rq->bio == NULL)
- return 0;
-
- return bio_integrity(rq->bio);
+ return rq->cmd_flags & REQ_INTEGRITY;
}
static inline void blk_queue_max_integrity_segments(struct request_queue *q,
}
static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
{
- return 0;
+ return NULL;
}
static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
{
{
return 0;
}
- static inline int blk_integrity_merge_rq(struct request_queue *rq,
- struct request *r1,
- struct request *r2)
+ static inline bool blk_integrity_merge_rq(struct request_queue *rq,
+ struct request *r1,
+ struct request *r2)
{
return 0;
}
- static inline int blk_integrity_merge_bio(struct request_queue *rq,
- struct request *r,
- struct bio *b)
+ static inline bool blk_integrity_merge_bio(struct request_queue *rq,
+ struct request *r,
+ struct bio *b)
{
return 0;
}
#define READ 0
#define WRITE RW_MASK
#define READA RWA_MASK
- #define KERNEL_READ (READ|REQ_KERNEL)
- #define KERNEL_WRITE (WRITE|REQ_KERNEL)
#define READ_SYNC (READ | REQ_SYNC)
#define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE)
*/
#define FILE_LOCK_DEFERRED 1
-/*
- * The POSIX file lock owner is determined by
- * the "struct files_struct" in the thread group
- * (or NULL for no owner - BSD locks).
- *
- * Lockd stuffs a "host" pointer into this.
- */
+/* legacy typedef, should eventually be removed */
typedef void *fl_owner_t;
struct file_lock_operations {
struct lock_manager_operations {
int (*lm_compare_owner)(struct file_lock *, struct file_lock *);
unsigned long (*lm_owner_key)(struct file_lock *);
+ void (*lm_get_owner)(struct file_lock *, struct file_lock *);
+ void (*lm_put_owner)(struct file_lock *);
void (*lm_notify)(struct file_lock *); /* unblock callback */
- int (*lm_grant)(struct file_lock *, struct file_lock *, int);
- void (*lm_break)(struct file_lock *);
- int (*lm_change)(struct file_lock **, int);
+ int (*lm_grant)(struct file_lock *, int);
+ bool (*lm_break)(struct file_lock *);
+ int (*lm_change)(struct file_lock **, int, struct list_head *);
+ void (*lm_setup)(struct file_lock *, void **);
};
struct lock_manager {
extern void locks_init_lock(struct file_lock *);
extern struct file_lock * locks_alloc_lock(void);
extern void locks_copy_lock(struct file_lock *, struct file_lock *);
-extern void __locks_copy_lock(struct file_lock *, const struct file_lock *);
+extern void locks_copy_conflock(struct file_lock *, struct file_lock *);
extern void locks_remove_posix(struct file *, fl_owner_t);
extern void locks_remove_file(struct file *);
extern void locks_release_private(struct file_lock *);
extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl);
extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
extern void lease_get_mtime(struct inode *, struct timespec *time);
-extern int generic_setlease(struct file *, long, struct file_lock **);
-extern int vfs_setlease(struct file *, long, struct file_lock **);
-extern int lease_modify(struct file_lock **, int);
-extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
-extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
+extern int generic_setlease(struct file *, long, struct file_lock **, void **priv);
+extern int vfs_setlease(struct file *, long, struct file_lock **, void **);
+extern int lease_modify(struct file_lock **, int, struct list_head *);
#else /* !CONFIG_FILE_LOCKING */
static inline int fcntl_getlk(struct file *file, unsigned int cmd,
struct flock __user *user)
#endif
static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
{
- return 0;
+ return -EINVAL;
}
static inline int fcntl_getlease(struct file *filp)
{
- return 0;
+ return F_UNLCK;
}
static inline void locks_init_lock(struct file_lock *fl)
return;
}
-static inline void __locks_copy_lock(struct file_lock *new, struct file_lock *fl)
+static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
{
return;
}
}
static inline int generic_setlease(struct file *filp, long arg,
- struct file_lock **flp)
+ struct file_lock **flp, void **priv)
{
return -EINVAL;
}
static inline int vfs_setlease(struct file *filp, long arg,
- struct file_lock **lease)
+ struct file_lock **lease, void **priv)
{
return -EINVAL;
}
-static inline int lease_modify(struct file_lock **before, int arg)
+static inline int lease_modify(struct file_lock **before, int arg,
+ struct list_head *dispose)
{
return -EINVAL;
}
-
-static inline int lock_may_read(struct inode *inode, loff_t start,
- unsigned long len)
-{
- return 1;
-}
-
-static inline int lock_may_write(struct inode *inode, loff_t start,
- unsigned long len)
-{
- return 1;
-}
#endif /* !CONFIG_FILE_LOCKING */
/* can be called from interrupts */
extern void kill_fasync(struct fasync_struct **, int, int);
-extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
-extern int f_setown(struct file *filp, unsigned long arg, int force);
+extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
+extern void f_setown(struct file *filp, unsigned long arg, int force);
extern void f_delown(struct file *filp);
extern pid_t f_getown(struct file *filp);
extern int send_sigurg(struct fown_struct *fown);
int (*flock) (struct file *, int, struct file_lock *);
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
- int (*setlease)(struct file *, long, struct file_lock **);
+ int (*setlease)(struct file *, long, struct file_lock **, void **);
long (*fallocate)(struct file *file, int mode, loff_t offset,
loff_t len);
int (*show_fdinfo)(struct seq_file *m, struct file *f);
extern void kern_unmount(struct vfsmount *mnt);
extern int may_umount_tree(struct vfsmount *);
extern int may_umount(struct vfsmount *);
-extern long do_mount(const char *, const char *, const char *, unsigned long, void *);
+extern long do_mount(const char *, const char __user *,
+ const char *, unsigned long, void *);
extern struct vfsmount *collect_mounts(struct path *);
extern void drop_collected_mounts(struct vfsmount *);
extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
extern void ihold(struct inode * inode);
extern void iput(struct inode *);
-static inline struct inode *file_inode(struct file *f)
+static inline struct inode *file_inode(const struct file *f)
{
return f->f_inode;
}
struct page *page, void *fsdata);
extern int always_delete_dentry(const struct dentry *);
extern struct inode *alloc_anon_inode(struct super_block *);
+extern int simple_nosetlease(struct file *, long, struct file_lock **, void **);
extern const struct dentry_operations simple_dentry_operations;
extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags);
}
/*
- * linux/fs/nfs/xattr.c
- */
-#ifdef CONFIG_NFS_V3_ACL
-extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
-extern ssize_t nfs3_getxattr(struct dentry *, const char *, void *, size_t);
-extern int nfs3_setxattr(struct dentry *, const char *,
- const void *, size_t, int);
-extern int nfs3_removexattr (struct dentry *, const char *name);
-#else
-# define nfs3_listxattr NULL
-# define nfs3_getxattr NULL
-# define nfs3_setxattr NULL
-# define nfs3_removexattr NULL
-#endif
-
-/*
* linux/fs/nfs/direct.c
*/
extern ssize_t nfs_direct_IO(int, struct kiocb *, struct iov_iter *, loff_t);
extern ssize_t nfs_file_direct_read(struct kiocb *iocb,
struct iov_iter *iter,
- loff_t pos, bool uio);
+ loff_t pos);
extern ssize_t nfs_file_direct_write(struct kiocb *iocb,
struct iov_iter *iter,
- loff_t pos, bool uio);
+ loff_t pos);
/*
* linux/fs/nfs/dir.c
extern int nfs_wb_all(struct inode *inode);
extern int nfs_wb_page(struct inode *inode, struct page* page);
extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
extern int nfs_commit_inode(struct inode *, int);
extern struct nfs_commit_data *nfs_commitdata_alloc(void);
extern void nfs_commit_free(struct nfs_commit_data *data);
-#else
-static inline int
-nfs_commit_inode(struct inode *inode, int how)
-{
- return 0;
-}
-#endif
static inline int
nfs_have_writebacks(struct inode *inode)
struct page *);
/*
- * linux/fs/nfs3proc.c
- */
-#ifdef CONFIG_NFS_V3_ACL
-extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type);
-extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
- struct posix_acl *dfacl);
-extern const struct xattr_handler *nfs3_xattr_handlers[];
-#else
-static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
- struct posix_acl *dfacl)
-{
- return 0;
-}
-#endif /* CONFIG_NFS_V3_ACL */
-
-/*
* inline functions
*/
/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;
- void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
+ static void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
{
if (wb1 < wb2) {
spin_lock(&wb1->list_lock);
mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
flush_delayed_work(&bdi->wb.dwork);
WARN_ON(!list_empty(&bdi->work_list));
-
- /*
- * This shouldn't be necessary unless @bdi for some reason has
- * unflushed dirty IO after work_list is drained. Do it anyway
- * just in case.
- */
- cancel_delayed_work_sync(&bdi->wb.dwork);
+ WARN_ON(delayed_work_pending(&bdi->wb.dwork));
}
/*
void bdi_unregister(struct backing_dev_info *bdi)
{
- struct device *dev = bdi->dev;
-
- if (dev) {
+ if (bdi->dev) {
bdi_set_min_ratio(bdi, 0);
trace_writeback_bdi_unregister(bdi);
bdi_prune_sb(bdi);
bdi_wb_shutdown(bdi);
bdi_debug_unregister(bdi);
-
- spin_lock_bh(&bdi->wb_lock);
+ device_unregister(bdi->dev);
bdi->dev = NULL;
- spin_unlock_bh(&bdi->wb_lock);
-
- device_unregister(dev);
}
}
EXPORT_SYMBOL(bdi_unregister);
bdi_wb_init(&bdi->wb, bdi);
for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
- err = percpu_counter_init(&bdi->bdi_stat[i], 0);
+ err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL);
if (err)
goto err;
}
bdi->write_bandwidth = INIT_BW;
bdi->avg_write_bandwidth = INIT_BW;
- err = fprop_local_init_percpu(&bdi->completions);
+ err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
if (err) {
err:
int i;
/*
- * Splice our entries to the default_backing_dev_info, if this
- * bdi disappears
+ * Splice our entries to the default_backing_dev_info. This
+ * condition shouldn't happen. @wb must be empty at this point and
+ * dirty inodes on it might cause other issues. This workaround is
+ * added by ce5f8e779519 ("writeback: splice dirty inode entries to
+ * default bdi on bdi_destroy()") without root-causing the issue.
+ *
+ * http://lkml.kernel.org/g/1253038617-30204-11-git-send-email-jens.axboe@oracle.com
+ * http://thread.gmane.org/gmane.linux.file-systems/35341/focus=35350
+ *
+ * We should probably add WARN_ON() to find out whether it still
+ * happens and track it down if so.
*/
if (bdi_has_dirty_io(bdi)) {
struct bdi_writeback *dst = &default_backing_dev_info.wb;
bdi_unregister(bdi);
- /*
- * If bdi_unregister() had already been called earlier, the dwork
- * could still be pending because bdi_prune_sb() can race with the
- * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty().
- */
- cancel_delayed_work_sync(&bdi->wb.dwork);
+ WARN_ON(delayed_work_pending(&bdi->wb.dwork));
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);
* of sleeping on the congestion queue
*/
if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
- !zone_is_reclaim_congested(zone)) {
+ !test_bit(ZONE_CONGESTED, &zone->flags)) {
cond_resched();
/* In case we scheduled, work out time remaining */