Merge branch 'for-3.18/core' of git://git.kernel.dk/linux-block

author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 18 Oct 2014 18:53:51 +0000 (11:53 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 18 Oct 2014 18:53:51 +0000 (11:53 -0700)
diff --combined block/blk-core.c

index 9c888bd,d6ec7db..0421b53
--- 1/block/blk-core.c
--- 2/block/blk-core.c
+++ b/block/blk-core.c
@@@ -83,18 -83,14 +83,14 @@@ void blk_queue_congestion_threshold(str
    * @bdev:     device
    *
    * Locates the passed device's request queue and returns the address of its
-  * backing_dev_info
-  *
-  * Will return NULL if the request queue cannot be located.
+  * backing_dev_info.  This function can only be called if @bdev is opened
+  * and the return value is never NULL.
    */
   struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
   {
-       struct backing_dev_info *ret = NULL;
         struct request_queue *q = bdev_get_queue(bdev);
   
-       if (q)
-               ret = &q->backing_dev_info;
-       return ret;
+       return &q->backing_dev_info;
   }
   EXPORT_SYMBOL(blk_get_backing_dev_info);
   
@@@ -240,7 -236,7 +236,7 @@@ EXPORT_SYMBOL(blk_stop_queue)
    *     this function.
    *
    *     This function does not cancel any asynchronous activity arising
- - *     out of elevator or throttling code. That would require elevaotor_exit()
+ + *     out of elevator or throttling code. That would require elevator_exit()
    *     and blkcg_exit_queue() to be called with queue lock initialized.
    *
    */
@@@ -394,11 -390,13 +390,13 @@@ static void __blk_drain_queue(struct re
                  * be drained.  Check all the queues and counters.
                  */
                 if (drain_all) {
+                       struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
                         drain |= !list_empty(&q->queue_head);
                         for (i = 0; i < 2; i++) {
                                 drain |= q->nr_rqs[i];
                                 drain |= q->in_flight[i];
-                               drain |= !list_empty(&q->flush_queue[i]);
+                               if (fq)
+                                   drain |= !list_empty(&fq->flush_queue[i]);
                         }
                 }
   
@@@ -604,9 -602,6 +602,6 @@@ struct request_queue *blk_alloc_queue_n
   #ifdef CONFIG_BLK_CGROUP
         INIT_LIST_HEAD(&q->blkg_list);
   #endif
-       INIT_LIST_HEAD(&q->flush_queue[0]);
-       INIT_LIST_HEAD(&q->flush_queue[1]);
-       INIT_LIST_HEAD(&q->flush_data_in_flight);
         INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
   
         kobject_init(&q->kobj, &blk_queue_ktype);
@@@ -709,8 -704,8 +704,8 @@@ blk_init_allocated_queue(struct request
         if (!q)
                 return NULL;
   
-       q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL);
-       if (!q->flush_rq)
+       q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, 0);
+       if (!q->fq)
                 return NULL;
   
         if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
@@@ -746,7 -741,7 +741,7 @@@
         return q;
   
   fail:
-       kfree(q->flush_rq);
+       blk_free_flush_queue(q->fq);
         return NULL;
   }
   EXPORT_SYMBOL(blk_init_allocated_queue);
@@@ -934,8 -929,8 +929,8 @@@ static struct io_context *rq_ioc(struc
    * pressure or if @q is dead.
    *
    * Must be called with @q->queue_lock held and,
-  * Returns %NULL on failure, with @q->queue_lock held.
-  * Returns !%NULL on success, with @q->queue_lock *not held*.
+  * Returns ERR_PTR on failure, with @q->queue_lock held.
+  * Returns request pointer on success, with @q->queue_lock *not held*.
    */
   static struct request *__get_request(struct request_list *rl, int rw_flags,
                                      struct bio *bio, gfp_t gfp_mask)
@@@ -949,7 -944,7 +944,7 @@@
         int may_queue;
   
         if (unlikely(blk_queue_dying(q)))
-               return NULL;
+               return ERR_PTR(-ENODEV);
   
         may_queue = elv_may_queue(q, rw_flags);
         if (may_queue == ELV_MQUEUE_NO)
@@@ -974,7 -969,7 +969,7 @@@
                                          * process is not a "batcher", and not
                                          * exempted by the IO scheduler
                                          */
-                                       return NULL;
+                                       return ERR_PTR(-ENOMEM);
                                 }
                         }
                 }
@@@ -992,7 -987,7 +987,7 @@@
          * allocated with any setting of ->nr_requests
          */
         if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
-               return NULL;
+               return ERR_PTR(-ENOMEM);
   
         q->nr_rqs[is_sync]++;
         rl->count[is_sync]++;
@@@ -1065,8 -1060,8 +1060,8 @@@ fail_elvpriv
          * shouldn't stall IO.  Treat this request as !elvpriv.  This will
          * disturb iosched and blkcg but weird is bettern than dead.
          */
-       printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n",
-                          dev_name(q->backing_dev_info.dev));
+       printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n",
+                          __func__, dev_name(q->backing_dev_info.dev));
   
         rq->cmd_flags &= ~REQ_ELVPRIV;
         rq->elv.icq = NULL;
@@@ -1097,7 -1092,7 +1092,7 @@@ fail_alloc
   rq_starved:
         if (unlikely(rl->count[is_sync] == 0))
                 rl->starved[is_sync] = 1;
-       return NULL;
+       return ERR_PTR(-ENOMEM);
   }
   
   /**
@@@ -1111,8 -1106,8 +1106,8 @@@
    * function keeps retrying under memory pressure and fails iff @q is dead.
    *
    * Must be called with @q->queue_lock held and,
-  * Returns %NULL on failure, with @q->queue_lock held.
-  * Returns !%NULL on success, with @q->queue_lock *not held*.
+  * Returns ERR_PTR on failure, with @q->queue_lock held.
+  * Returns request pointer on success, with @q->queue_lock *not held*.
    */
   static struct request *get_request(struct request_queue *q, int rw_flags,
                                    struct bio *bio, gfp_t gfp_mask)
@@@ -1125,12 -1120,12 +1120,12 @@@
         rl = blk_get_rl(q, bio);        /* transferred to @rq on success */
   retry:
         rq = __get_request(rl, rw_flags, bio, gfp_mask);
-       if (rq)
+       if (!IS_ERR(rq))
                 return rq;
   
         if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) {
                 blk_put_rl(rl);
-               return NULL;
+               return rq;
         }
   
         /* wait on @rl and retry */
@@@ -1167,7 -1162,7 +1162,7 @@@ static struct request *blk_old_get_requ
   
         spin_lock_irq(q->queue_lock);
         rq = get_request(q, rw, NULL, gfp_mask);
-       if (!rq)
+       if (IS_ERR(rq))
                 spin_unlock_irq(q->queue_lock);
         /* q->queue_lock is unlocked at this point */
   
@@@ -1219,8 -1214,8 +1214,8 @@@ struct request *blk_make_request(struc
   {
         struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask);
   
-       if (unlikely(!rq))
-               return ERR_PTR(-ENOMEM);
+       if (IS_ERR(rq))
+               return rq;
   
         blk_rq_set_block_pc(rq);
   
@@@ -1241,7 -1236,7 +1236,7 @@@
   EXPORT_SYMBOL(blk_make_request);
   
   /**
- - * blk_rq_set_block_pc - initialize a requeest to type BLOCK_PC
+ + * blk_rq_set_block_pc - initialize a request to type BLOCK_PC
    * @rq:               request to be initialized
    *
    */
@@@ -1614,8 -1609,8 +1609,8 @@@ get_rq
          * Returns with the queue unlocked.
          */
         req = get_request(q, rw_flags, bio, GFP_NOIO);
-       if (unlikely(!req)) {
-               bio_endio(bio, -ENODEV);        /* @q is dead */
+       if (IS_ERR(req)) {
+               bio_endio(bio, PTR_ERR(req));   /* @q is dead */
                 goto out_unlock;
         }
   
@@@ -2405,11 -2400,11 +2400,11 @@@ bool blk_update_request(struct request 
   {
         int total_bytes;
   
+       trace_block_rq_complete(req->q, req, nr_bytes);
+ 
         if (!req->bio)
                 return false;
   
-       trace_block_rq_complete(req->q, req, nr_bytes);
- 
         /*
          * For fs requests, rq is just carrier of independent bio's
          * and each partial completion should be handled separately.
@@@ -2449,8 -2444,8 +2444,8 @@@
                         error_type = "I/O";
                         break;
                 }
-               printk_ratelimited(KERN_ERR "end_request: %s error, dev %s, sector %llu\n",
-                                  error_type, req->rq_disk ?
+               printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n",
+                                  __func__, error_type, req->rq_disk ?
                                    req->rq_disk->disk_name : "?",
                                    (unsigned long long)blk_rq_pos(req));
   
@@@ -2931,7 -2926,7 +2926,7 @@@ int blk_rq_prep_clone(struct request *r
         blk_rq_init(NULL, rq);
   
         __rq_for_each_bio(bio_src, rq_src) {
-               bio = bio_clone_bioset(bio_src, gfp_mask, bs);
+               bio = bio_clone_fast(bio_src, gfp_mask, bs);
                 if (!bio)
                         goto free_and_out;
   
diff --combined block/blk-mq.c

index 38f4a16,79aa11b..68929ba
--- 1/block/blk-mq.c
--- 2/block/blk-mq.c
+++ b/block/blk-mq.c
@@@ -20,6 -20,7 +20,7 @@@
   #include <linux/cache.h>
   #include <linux/sched/sysctl.h>
   #include <linux/delay.h>
+ #include <linux/crash_dump.h>
   
   #include <trace/events/block.h>
   
@@@ -223,9 -224,11 +224,11 @@@ struct request *blk_mq_alloc_request(st
         struct blk_mq_hw_ctx *hctx;
         struct request *rq;
         struct blk_mq_alloc_data alloc_data;
+       int ret;
   
-       if (blk_mq_queue_enter(q))
-               return NULL;
+       ret = blk_mq_queue_enter(q);
+       if (ret)
+               return ERR_PTR(ret);
   
         ctx = blk_mq_get_ctx(q);
         hctx = q->mq_ops->map_queue(q, ctx->cpu);
@@@ -245,6 -248,8 +248,8 @@@
                 ctx = alloc_data.ctx;
         }
         blk_mq_put_ctx(ctx);
+       if (!rq)
+               return ERR_PTR(-EWOULDBLOCK);
         return rq;
   }
   EXPORT_SYMBOL(blk_mq_alloc_request);
@@@ -276,27 -281,7 +281,7 @@@ void blk_mq_free_request(struct reques
         __blk_mq_free_request(hctx, ctx, rq);
   }
   
- /*
-  * Clone all relevant state from a request that has been put on hold in
-  * the flush state machine into the preallocated flush request that hangs
-  * off the request queue.
-  *
-  * For a driver the flush request should be invisible, that's why we are
-  * impersonating the original request here.
-  */
- void blk_mq_clone_flush_request(struct request *flush_rq,
-               struct request *orig_rq)
- {
-       struct blk_mq_hw_ctx *hctx =
-               orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu);
- 
-       flush_rq->mq_ctx = orig_rq->mq_ctx;
-       flush_rq->tag = orig_rq->tag;
-       memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq),
-               hctx->cmd_size);
- }
- 
- inline void __blk_mq_end_io(struct request *rq, int error)
+ inline void __blk_mq_end_request(struct request *rq, int error)
   {
         blk_account_io_done(rq);
   
@@@ -308,15 -293,15 +293,15 @@@
                 blk_mq_free_request(rq);
         }
   }
- EXPORT_SYMBOL(__blk_mq_end_io);
+ EXPORT_SYMBOL(__blk_mq_end_request);
   
- void blk_mq_end_io(struct request *rq, int error)
+ void blk_mq_end_request(struct request *rq, int error)
   {
         if (blk_update_request(rq, error, blk_rq_bytes(rq)))
                 BUG();
-       __blk_mq_end_io(rq, error);
+       __blk_mq_end_request(rq, error);
   }
- EXPORT_SYMBOL(blk_mq_end_io);
+ EXPORT_SYMBOL(blk_mq_end_request);
   
   static void __blk_mq_complete_request_remote(void *data)
   {
@@@ -356,7 -341,7 +341,7 @@@ void __blk_mq_complete_request(struct r
         struct request_queue *q = rq->q;
   
         if (!q->softirq_done_fn)
-               blk_mq_end_io(rq, rq->errors);
+               blk_mq_end_request(rq, rq->errors);
         else
                 blk_mq_ipi_complete_request(rq);
   }
@@@ -380,7 -365,7 +365,7 @@@ void blk_mq_complete_request(struct req
   }
   EXPORT_SYMBOL(blk_mq_complete_request);
   
- static void blk_mq_start_request(struct request *rq, bool last)
+ void blk_mq_start_request(struct request *rq)
   {
         struct request_queue *q = rq->q;
   
@@@ -417,35 -402,24 +402,24 @@@
                  */
                 rq->nr_phys_segments++;
         }
- 
-       /*
-        * Flag the last request in the series so that drivers know when IO
-        * should be kicked off, if they don't do it on a per-request basis.
-        *
-        * Note: the flag isn't the only condition drivers should do kick off.
-        * If drive is busy, the last request might not have the bit set.
-        */
-       if (last)
-               rq->cmd_flags |= REQ_END;
   }
+ EXPORT_SYMBOL(blk_mq_start_request);
   
   static void __blk_mq_requeue_request(struct request *rq)
   {
         struct request_queue *q = rq->q;
   
         trace_block_rq_requeue(q, rq);
-       clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
- 
-       rq->cmd_flags &= ~REQ_END;
   
-       if (q->dma_drain_size && blk_rq_bytes(rq))
-               rq->nr_phys_segments--;
+       if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
+               if (q->dma_drain_size && blk_rq_bytes(rq))
+                       rq->nr_phys_segments--;
+       }
   }
   
   void blk_mq_requeue_request(struct request *rq)
   {
         __blk_mq_requeue_request(rq);
-       blk_clear_rq_complete(rq);
   
         BUG_ON(blk_queued_rq(rq));
         blk_mq_add_to_requeue_list(rq, true);
@@@ -514,78 -488,35 +488,35 @@@ void blk_mq_kick_requeue_list(struct re
   }
   EXPORT_SYMBOL(blk_mq_kick_requeue_list);
   
- static inline bool is_flush_request(struct request *rq, unsigned int tag)
+ static inline bool is_flush_request(struct request *rq,
+               struct blk_flush_queue *fq, unsigned int tag)
   {
         return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
-                       rq->q->flush_rq->tag == tag);
+                       fq->flush_rq->tag == tag);
   }
   
   struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
   {
         struct request *rq = tags->rqs[tag];
+       /* mq_ctx of flush rq is always cloned from the corresponding req */
+       struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx);
   
-       if (!is_flush_request(rq, tag))
+       if (!is_flush_request(rq, fq, tag))
                 return rq;
   
-       return rq->q->flush_rq;
+       return fq->flush_rq;
   }
   EXPORT_SYMBOL(blk_mq_tag_to_rq);
   
   struct blk_mq_timeout_data {
-       struct blk_mq_hw_ctx *hctx;
-       unsigned long *next;
-       unsigned int *next_set;
+       unsigned long next;
+       unsigned int next_set;
   };
   
- static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
+ void blk_mq_rq_timed_out(struct request *req, bool reserved)
   {
-       struct blk_mq_timeout_data *data = __data;
-       struct blk_mq_hw_ctx *hctx = data->hctx;
-       unsigned int tag;
- 
-        /* It may not be in flight yet (this is where
-        * the REQ_ATOMIC_STARTED flag comes in). The requests are
-        * statically allocated, so we know it's always safe to access the
-        * memory associated with a bit offset into ->rqs[].
-        */
-       tag = 0;
-       do {
-               struct request *rq;
- 
-               tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag);
-               if (tag >= hctx->tags->nr_tags)
-                       break;
- 
-               rq = blk_mq_tag_to_rq(hctx->tags, tag++);
-               if (rq->q != hctx->queue)
-                       continue;
-               if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
-                       continue;
- 
-               blk_rq_check_expired(rq, data->next, data->next_set);
-       } while (1);
- }
- 
- static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
-                                       unsigned long *next,
-                                       unsigned int *next_set)
- {
-       struct blk_mq_timeout_data data = {
-               .hctx           = hctx,
-               .next           = next,
-               .next_set       = next_set,
-       };
- 
-       /*
-        * Ask the tagging code to iterate busy requests, so we can
-        * check them for timeout.
-        */
-       blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
- }
- 
- static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq)
- {
-       struct request_queue *q = rq->q;
+       struct blk_mq_ops *ops = req->q->mq_ops;
+       enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
   
         /*
          * We know that complete is set at this point. If STARTED isn't set
@@@ -596,21 -527,54 +527,54 @@@
          * we both flags will get cleared. So check here again, and ignore
          * a timeout event with a request that isn't active.
          */
-       if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
-               return BLK_EH_NOT_HANDLED;
+       if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
+               return;
+ 
+       if (ops->timeout)
+               ret = ops->timeout(req, reserved);
+ 
+       switch (ret) {
+       case BLK_EH_HANDLED:
+               __blk_mq_complete_request(req);
+               break;
+       case BLK_EH_RESET_TIMER:
+               blk_add_timer(req);
+               blk_clear_rq_complete(req);
+               break;
+       case BLK_EH_NOT_HANDLED:
+               break;
+       default:
+               printk(KERN_ERR "block: bad eh return: %d\n", ret);
+               break;
+       }
+ }
+               
+ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
+               struct request *rq, void *priv, bool reserved)
+ {
+       struct blk_mq_timeout_data *data = priv;
   
-       if (!q->mq_ops->timeout)
-               return BLK_EH_RESET_TIMER;
+       if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
+               return;
   
-       return q->mq_ops->timeout(rq);
+       if (time_after_eq(jiffies, rq->deadline)) {
+               if (!blk_mark_rq_complete(rq))
+                       blk_mq_rq_timed_out(rq, reserved);
+       } else if (!data->next_set || time_after(data->next, rq->deadline)) {
+               data->next = rq->deadline;
+               data->next_set = 1;
+       }
   }
   
- static void blk_mq_rq_timer(unsigned long data)
+ static void blk_mq_rq_timer(unsigned long priv)
   {
-       struct request_queue *q = (struct request_queue *) data;
+       struct request_queue *q = (struct request_queue *)priv;
+       struct blk_mq_timeout_data data = {
+               .next           = 0,
+               .next_set       = 0,
+       };
         struct blk_mq_hw_ctx *hctx;
-       unsigned long next = 0;
-       int i, next_set = 0;
+       int i;
   
         queue_for_each_hw_ctx(q, hctx, i) {
                 /*
@@@ -620,12 -584,12 +584,12 @@@
                 if (!hctx->nr_ctx || !hctx->tags)
                         continue;
   
-               blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
+               blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
         }
   
-       if (next_set) {
-               next = blk_rq_timeout(round_jiffies_up(next));
-               mod_timer(&q->timeout, next);
+       if (data.next_set) {
+               data.next = blk_rq_timeout(round_jiffies_up(data.next));
+               mod_timer(&q->timeout, data.next);
         } else {
                 queue_for_each_hw_ctx(q, hctx, i)
                         blk_mq_tag_idle(hctx);
@@@ -751,9 -715,7 +715,7 @@@ static void __blk_mq_run_hw_queue(struc
                 rq = list_first_entry(&rq_list, struct request, queuelist);
                 list_del_init(&rq->queuelist);
   
-               blk_mq_start_request(rq, list_empty(&rq_list));
- 
-               ret = q->mq_ops->queue_rq(hctx, rq);
+               ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list));
                 switch (ret) {
                 case BLK_MQ_RQ_QUEUE_OK:
                         queued++;
@@@ -766,7 -728,7 +728,7 @@@
                         pr_err("blk-mq: bad return on queue: %d\n", ret);
                 case BLK_MQ_RQ_QUEUE_ERROR:
                         rq->errors = -EIO;
-                       blk_mq_end_io(rq, rq->errors);
+                       blk_mq_end_request(rq, rq->errors);
                         break;
                 }
   
@@@ -1194,14 -1156,13 +1156,13 @@@ static void blk_mq_make_request(struct 
                 int ret;
   
                 blk_mq_bio_to_request(rq, bio);
-               blk_mq_start_request(rq, true);
   
                 /*
                  * For OK queue, we are done. For error, kill it. Any other
                  * error (busy), just add it to our list as we previously
                  * would have done
                  */
-               ret = q->mq_ops->queue_rq(data.hctx, rq);
+               ret = q->mq_ops->queue_rq(data.hctx, rq, true);
                 if (ret == BLK_MQ_RQ_QUEUE_OK)
                         goto done;
                 else {
@@@ -1209,7 -1170,7 +1170,7 @@@
   
                         if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
                                 rq->errors = -EIO;
-                               blk_mq_end_io(rq, rq->errors);
+                               blk_mq_end_request(rq, rq->errors);
                                 goto done;
                         }
                 }
@@@ -1531,6 -1492,28 +1492,28 @@@ static int blk_mq_hctx_notify(void *dat
         return NOTIFY_OK;
   }
   
+ static void blk_mq_exit_hctx(struct request_queue *q,
+               struct blk_mq_tag_set *set,
+               struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+ {
+       unsigned flush_start_tag = set->queue_depth;
+ 
+       blk_mq_tag_idle(hctx);
+ 
+       if (set->ops->exit_request)
+               set->ops->exit_request(set->driver_data,
+                                      hctx->fq->flush_rq, hctx_idx,
+                                      flush_start_tag + hctx_idx);
+ 
+       if (set->ops->exit_hctx)
+               set->ops->exit_hctx(hctx, hctx_idx);
+ 
+       blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
+       blk_free_flush_queue(hctx->fq);
+       kfree(hctx->ctxs);
+       blk_mq_free_bitmap(&hctx->ctx_map);
+ }
+ 
   static void blk_mq_exit_hw_queues(struct request_queue *q,
                 struct blk_mq_tag_set *set, int nr_queue)
   {
@@@ -1540,17 -1523,8 +1523,8 @@@
         queue_for_each_hw_ctx(q, hctx, i) {
                 if (i == nr_queue)
                         break;
- 
-               blk_mq_tag_idle(hctx);
- 
-               if (set->ops->exit_hctx)
-                       set->ops->exit_hctx(hctx, i);
- 
-               blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
-               kfree(hctx->ctxs);
-               blk_mq_free_bitmap(&hctx->ctx_map);
+               blk_mq_exit_hctx(q, set, hctx, i);
         }
- 
   }
   
   static void blk_mq_free_hw_queues(struct request_queue *q,
@@@ -1565,53 -1539,88 +1539,88 @@@
         }
   }
   
- static int blk_mq_init_hw_queues(struct request_queue *q,
-               struct blk_mq_tag_set *set)
+ static int blk_mq_init_hctx(struct request_queue *q,
+               struct blk_mq_tag_set *set,
+               struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
   {
-       struct blk_mq_hw_ctx *hctx;
-       unsigned int i;
+       int node;
+       unsigned flush_start_tag = set->queue_depth;
+ 
+       node = hctx->numa_node;
+       if (node == NUMA_NO_NODE)
+               node = hctx->numa_node = set->numa_node;
+ 
+       INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
+       INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
+       spin_lock_init(&hctx->lock);
+       INIT_LIST_HEAD(&hctx->dispatch);
+       hctx->queue = q;
+       hctx->queue_num = hctx_idx;
+       hctx->flags = set->flags;
+       hctx->cmd_size = set->cmd_size;
+ 
+       blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
+                                       blk_mq_hctx_notify, hctx);
+       blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
+ 
+       hctx->tags = set->tags[hctx_idx];
   
         /*
-        * Initialize hardware queues
+        * Allocate space for all possible cpus to avoid allocation at
+        * runtime
          */
-       queue_for_each_hw_ctx(q, hctx, i) {
-               int node;
+       hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
+                                       GFP_KERNEL, node);
+       if (!hctx->ctxs)
+               goto unregister_cpu_notifier;
   
-               node = hctx->numa_node;
-               if (node == NUMA_NO_NODE)
-                       node = hctx->numa_node = set->numa_node;
+       if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
+               goto free_ctxs;
   
-               INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
-               INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
-               spin_lock_init(&hctx->lock);
-               INIT_LIST_HEAD(&hctx->dispatch);
-               hctx->queue = q;
-               hctx->queue_num = i;
-               hctx->flags = set->flags;
-               hctx->cmd_size = set->cmd_size;
+       hctx->nr_ctx = 0;
   
-               blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
-                                               blk_mq_hctx_notify, hctx);
-               blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
+       if (set->ops->init_hctx &&
+           set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
+               goto free_bitmap;
   
-               hctx->tags = set->tags[i];
+       hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
+       if (!hctx->fq)
+               goto exit_hctx;
   
-               /*
-                * Allocate space for all possible cpus to avoid allocation at
-                * runtime
-                */
-               hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
-                                               GFP_KERNEL, node);
-               if (!hctx->ctxs)
-                       break;
+       if (set->ops->init_request &&
+           set->ops->init_request(set->driver_data,
+                                  hctx->fq->flush_rq, hctx_idx,
+                                  flush_start_tag + hctx_idx, node))
+               goto free_fq;
   
-               if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
-                       break;
+       return 0;
   
-               hctx->nr_ctx = 0;
+  free_fq:
+       kfree(hctx->fq);
+  exit_hctx:
+       if (set->ops->exit_hctx)
+               set->ops->exit_hctx(hctx, hctx_idx);
+  free_bitmap:
+       blk_mq_free_bitmap(&hctx->ctx_map);
+  free_ctxs:
+       kfree(hctx->ctxs);
+  unregister_cpu_notifier:
+       blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
+ 
+       return -1;
+ }
+ 
+ static int blk_mq_init_hw_queues(struct request_queue *q,
+               struct blk_mq_tag_set *set)
+ {
+       struct blk_mq_hw_ctx *hctx;
+       unsigned int i;
   
-               if (set->ops->init_hctx &&
-                   set->ops->init_hctx(hctx, set->driver_data, i))
+       /*
+        * Initialize hardware queues
+        */
+       queue_for_each_hw_ctx(q, hctx, i) {
+               if (blk_mq_init_hctx(q, set, hctx, i))
                         break;
         }
   
@@@ -1765,6 -1774,16 +1774,16 @@@ struct request_queue *blk_mq_init_queue
         if (!ctx)
                 return ERR_PTR(-ENOMEM);
   
+       /*
+        * If a crashdump is active, then we are potentially in a very
+        * memory constrained environment. Limit us to 1 queue and
+        * 64 tags to prevent using too much memory.
+        */
+       if (is_kdump_kernel()) {
+               set->nr_hw_queues = 1;
+               set->queue_depth = min(64U, set->queue_depth);
+       }
+ 
         hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
                         set->numa_node);
   
@@@ -1783,7 -1802,8 +1802,8 @@@
                 if (!hctxs[i])
                         goto err_hctxs;
   
-               if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
+               if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
+                                               node))
                         goto err_hctxs;
   
                 atomic_set(&hctxs[i]->nr_active, 0);
@@@ -1795,12 -1815,7 +1815,12 @@@
         if (!q)
                 goto err_hctxs;
   
- -      if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release))
+ +      /*
+ +       * Init percpu_ref in atomic mode so that it's faster to shutdown.
+ +       * See blk_register_queue() for details.
+ +       */
+ +      if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
+ +                          PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
                 goto err_map;
   
         setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
@@@ -1830,7 -1845,6 +1850,6 @@@
         else
                 blk_queue_make_request(q, blk_sq_make_request);
   
-       blk_queue_rq_timed_out(q, blk_mq_rq_timed_out);
         if (set->timeout)
                 blk_queue_rq_timeout(q, set->timeout);
   
@@@ -1842,17 -1856,10 +1861,10 @@@
         if (set->ops->complete)
                 blk_queue_softirq_done(q, set->ops->complete);
   
-       blk_mq_init_flush(q);
         blk_mq_init_cpu_queues(q, set->nr_hw_queues);
   
-       q->flush_rq = kzalloc(round_up(sizeof(struct request) +
-                               set->cmd_size, cache_line_size()),
-                               GFP_KERNEL);
-       if (!q->flush_rq)
-               goto err_hw;
- 
         if (blk_mq_init_hw_queues(q, set))
-               goto err_flush_rq;
+               goto err_hw;
   
         mutex_lock(&all_q_mutex);
         list_add_tail(&q->all_q_node, &all_q_list);
@@@ -1864,8 -1871,6 +1876,6 @@@
   
         return q;
   
- err_flush_rq:
-       kfree(q->flush_rq);
   err_hw:
         blk_cleanup_queue(q);
   err_hctxs:
diff --combined block/blk-sysfs.c

index 521ae90,e8f38a3..1fac434
--- 1/block/blk-sysfs.c
--- 2/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@@ -519,8 -519,8 +519,8 @@@ static void blk_release_queue(struct ko
   
         if (q->mq_ops)
                 blk_mq_free_queue(q);
- 
-       kfree(q->flush_rq);
+       else
+               blk_free_flush_queue(q->fq);
   
         blk_trace_shutdown(q);
   
@@@ -551,19 -551,12 +551,19 @@@ int blk_register_queue(struct gendisk *
                 return -ENXIO;
   
         /*
- -       * Initialization must be complete by now.  Finish the initial
- -       * bypass from queue allocation.
+ +       * SCSI probing may synchronously create and destroy a lot of
+ +       * request_queues for non-existent devices.  Shutting down a fully
+ +       * functional queue takes measureable wallclock time as RCU grace
+ +       * periods are involved.  To avoid excessive latency in these
+ +       * cases, a request_queue starts out in a degraded mode which is
+ +       * faster to shut down and is made fully functional here as
+ +       * request_queues for non-existent devices never get registered.
          */
         if (!blk_queue_init_done(q)) {
                 queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
                 blk_queue_bypass_end(q);
+ +              if (q->mq_ops)
+ +                      blk_mq_finish_init(q);
         }
   
         ret = blk_trace_init_sysfs(dev);
diff --combined drivers/block/virtio_blk.c

index 930fee8,f751fc3..c6a27d5
--- 1/drivers/block/virtio_blk.c
--- 2/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@@ -41,6 -41,12 +41,6 @@@ struct virtio_bl
         /* Process context for config space updates */
         struct work_struct config_work;
   
- -      /* Lock for config space updates */
- -      struct mutex config_lock;
- -
- -      /* enable config space updates */
- -      bool config_enable;
- -
         /* What host tells us, plus 2 for header & tailer. */
         unsigned int sg_elems;
   
@@@ -129,7 -135,7 +129,7 @@@ static inline void virtblk_request_done
                 req->errors = (error != 0);
         }
   
-       blk_mq_end_io(req, error);
+       blk_mq_end_request(req, error);
   }
   
   static void virtblk_done(struct virtqueue *vq)
@@@ -158,14 -164,14 +158,14 @@@
         spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
   }
   
- static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
+ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
+               bool last)
   {
         struct virtio_blk *vblk = hctx->queue->queuedata;
         struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
         unsigned long flags;
         unsigned int num;
         int qid = hctx->queue_num;
-       const bool last = (req->cmd_flags & REQ_END) != 0;
         int err;
         bool notify = false;
   
@@@ -199,6 -205,8 +199,8 @@@
                 }
         }
   
+       blk_mq_start_request(req);
+ 
         num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg);
         if (num) {
                 if (rq_data_dir(vbr->req) == WRITE)
@@@ -341,6 -349,10 +343,6 @@@ static void virtblk_config_changed_work
         char *envp[] = { "RESIZE=1", NULL };
         u64 capacity, size;
   
- -      mutex_lock(&vblk->config_lock);
- -      if (!vblk->config_enable)
- -              goto done;
- -
         /* Host must always specify the capacity. */
         virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity);
   
@@@ -364,6 -376,8 +366,6 @@@
         set_capacity(vblk->disk, capacity);
         revalidate_disk(vblk->disk);
         kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp);
- -done:
- -      mutex_unlock(&vblk->config_lock);
   }
   
   static void virtblk_config_changed(struct virtio_device *vdev)
@@@ -594,8 -608,10 +596,8 @@@ static int virtblk_probe(struct virtio_
   
         vblk->vdev = vdev;
         vblk->sg_elems = sg_elems;
- -      mutex_init(&vblk->config_lock);
   
         INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
- -      vblk->config_enable = true;
   
         err = init_vq(vblk);
         if (err)
@@@ -719,8 -735,6 +721,8 @@@
         if (!err && opt_io_size)
                 blk_queue_io_opt(q, blk_size * opt_io_size);
   
+ +      virtio_device_ready(vdev);
+ +
         add_disk(vblk->disk);
         err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial);
         if (err)
@@@ -759,8 -773,10 +761,8 @@@ static void virtblk_remove(struct virti
         int index = vblk->index;
         int refc;
   
- -      /* Prevent config work handler from accessing the device. */
- -      mutex_lock(&vblk->config_lock);
- -      vblk->config_enable = false;
- -      mutex_unlock(&vblk->config_lock);
+ +      /* Make sure no work handler is accessing the device. */
+ +      flush_work(&vblk->config_work);
   
         del_gendisk(vblk->disk);
         blk_cleanup_queue(vblk->disk->queue);
@@@ -770,6 -786,8 +772,6 @@@
         /* Stop all the virtqueues. */
         vdev->config->reset(vdev);
   
- -      flush_work(&vblk->config_work);
- -
         refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
         put_disk(vblk->disk);
         vdev->config->del_vqs(vdev);
@@@ -789,7 -807,11 +791,7 @@@ static int virtblk_freeze(struct virtio
         /* Ensure we don't receive any more interrupts */
         vdev->config->reset(vdev);
   
- -      /* Prevent config work handler from accessing the device. */
- -      mutex_lock(&vblk->config_lock);
- -      vblk->config_enable = false;
- -      mutex_unlock(&vblk->config_lock);
- -
+ +      /* Make sure no work handler is accessing the device. */
         flush_work(&vblk->config_work);
   
         blk_mq_stop_hw_queues(vblk->disk->queue);
@@@ -803,14 -825,12 +805,14 @@@ static int virtblk_restore(struct virti
         struct virtio_blk *vblk = vdev->priv;
         int ret;
   
- -      vblk->config_enable = true;
         ret = init_vq(vdev->priv);
- -      if (!ret)
- -              blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
+ +      if (ret)
+ +              return ret;
   
- -      return ret;
+ +      virtio_device_ready(vdev);
+ +
+ +      blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
+ +      return 0;
   }
   #endif
   
diff --combined drivers/scsi/Kconfig

index 296619b,9ece13f..3a820f6
--- 1/drivers/scsi/Kconfig
--- 2/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@@ -43,18 -43,7 +43,18 @@@ config SCSI_DM
   config SCSI_NETLINK
         bool
         default n
- -      select NET
+ +      depends on NET
+ +
+ +config SCSI_MQ_DEFAULT
+ +      bool "SCSI: use blk-mq I/O path by default"
+ +      depends on SCSI
+ +      ---help---
+ +        This option enables the new blk-mq based I/O path for SCSI
+ +        devices by default.  With the option the scsi_mod.use_blk_mq
+ +        module/boot option defaults to Y, without it to N, but it can
+ +        still be overridden either way.
+ +
+ +        If unsure say N.
   
   config SCSI_PROC_FS
         bool "legacy /proc/scsi/ support"
@@@ -73,7 -62,6 +73,6 @@@ comment "SCSI support type (disk, tape
   config BLK_DEV_SD
         tristate "SCSI disk support"
         depends on SCSI
-       select CRC_T10DIF if BLK_DEV_INTEGRITY
         ---help---
           If you want to use SCSI hard disks, Fibre Channel disks,
           Serial ATA (SATA) or Parallel ATA (PATA) hard disks,
@@@ -268,7 -256,7 +267,7 @@@ config SCSI_SPI_ATTR
   
   config SCSI_FC_ATTRS
         tristate "FiberChannel Transport Attributes"
- -      depends on SCSI
+ +      depends on SCSI && NET
         select SCSI_NETLINK
         help
           If you wish to export transport-specific information about
@@@ -587,16 -575,6 +586,16 @@@ config VMWARE_PVSCS
           To compile this driver as a module, choose M here: the
           module will be called vmw_pvscsi.
   
+ +config XEN_SCSI_FRONTEND
+ +      tristate "XEN SCSI frontend driver"
+ +      depends on SCSI && XEN
+ +      select XEN_XENBUS_FRONTEND
+ +      help
+ +        The XEN SCSI frontend driver allows the kernel to access SCSI Devices
+ +        within another guest OS (usually Dom0).
+ +        Only needed if the kernel is running in a XEN guest and generic
+ +        SCSI access to a device is needed.
+ +
   config HYPERV_STORAGE
         tristate "Microsoft Hyper-V virtual storage driver"
         depends on SCSI && HYPERV
@@@ -606,28 -584,28 +605,28 @@@
   
   config LIBFC
         tristate "LibFC module"
- -      select SCSI_FC_ATTRS
+ +      depends on SCSI_FC_ATTRS
         select CRC32
         ---help---
           Fibre Channel library module
   
   config LIBFCOE
         tristate "LibFCoE module"
- -      select LIBFC
+ +      depends on LIBFC
         ---help---
           Library for Fibre Channel over Ethernet module
   
   config FCOE
         tristate "FCoE module"
         depends on PCI
- -      select LIBFCOE
+ +      depends on LIBFCOE
         ---help---
           Fibre Channel over Ethernet module
   
   config FCOE_FNIC
         tristate "Cisco FNIC Driver"
         depends on PCI && X86
- -      select LIBFCOE
+ +      depends on LIBFCOE
         help
           This is support for the Cisco PCI-Express FCoE HBA.
   
@@@ -837,7 -815,7 +836,7 @@@ config SCSI_IBMVSCS
   config SCSI_IBMVFC
         tristate "IBM Virtual FC support"
         depends on PPC_PSERIES && SCSI
- -      select SCSI_FC_ATTRS
+ +      depends on SCSI_FC_ATTRS
         help
           This is the IBM POWER Virtual FC Client
   
@@@ -1287,7 -1265,7 +1286,7 @@@ source "drivers/scsi/qla4xxx/Kconfig
   config SCSI_LPFC
         tristate "Emulex LightPulse Fibre Channel Support"
         depends on PCI && SCSI
- -      select SCSI_FC_ATTRS
+ +      depends on SCSI_FC_ATTRS
         select CRC_T10DIF
         help
             This lpfc driver supports the Emulex LightPulse
@@@ -1697,7 -1675,7 +1696,7 @@@ config SCSI_SUNES
   config ZFCP
         tristate "FCP host bus adapter driver for IBM eServer zSeries"
         depends on S390 && QDIO && SCSI
- -      select SCSI_FC_ATTRS
+ +      depends on SCSI_FC_ATTRS
         help
             If you want to access SCSI devices attached to your IBM eServer
             zSeries by means of Fibre Channel interfaces say Y.
@@@ -1725,7 -1703,7 +1724,7 @@@ config SCSI_PM800
   config SCSI_BFA_FC
         tristate "Brocade BFA Fibre Channel Support"
         depends on PCI && SCSI
- -      select SCSI_FC_ATTRS
+ +      depends on SCSI_FC_ATTRS
         help
           This bfa driver supports all Brocade PCIe FC/FCOE host adapters.
   
diff --combined drivers/scsi/scsi_error.c

index 6b20ef3,a2c3d3d..9a6f846
--- 1/drivers/scsi/scsi_error.c
--- 2/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@@ -1238,9 -1238,9 +1238,9 @@@ retry_tur
   /**
    * scsi_eh_test_devices - check if devices are responding from error recovery.
    * @cmd_list: scsi commands in error recovery.
- - * @work_q:     queue for commands which still need more error recovery
- - * @done_q:     queue for commands which are finished
- - * @try_stu:    boolean on if a STU command should be tried in addition to TUR.
+ + * @work_q:   queue for commands which still need more error recovery
+ + * @done_q:   queue for commands which are finished
+ + * @try_stu:  boolean on if a STU command should be tried in addition to TUR.
    *
    * Description:
    *    Tests if devices are in a working state.  Commands to devices now in
@@@ -1373,7 -1373,7 +1373,7 @@@ static int scsi_eh_try_stu(struct scsi_
    /**
    * scsi_eh_stu - send START_UNIT if needed
    * @shost:    &scsi host being recovered.
- - * @work_q:     &list_head for pending commands.
+ + * @work_q:   &list_head for pending commands.
    * @done_q:   &list_head for processed commands.
    *
    * Notes:
@@@ -1436,7 -1436,7 +1436,7 @@@ static int scsi_eh_stu(struct Scsi_Hos
   /**
    * scsi_eh_bus_device_reset - send bdr if needed
    * @shost:    scsi host being recovered.
- - * @work_q:     &list_head for pending commands.
+ + * @work_q:   &list_head for pending commands.
    * @done_q:   &list_head for processed commands.
    *
    * Notes:
@@@ -1502,7 -1502,7 +1502,7 @@@ static int scsi_eh_bus_device_reset(str
   /**
    * scsi_eh_target_reset - send target reset if needed
    * @shost:    scsi host being recovered.
- - * @work_q:     &list_head for pending commands.
+ + * @work_q:   &list_head for pending commands.
    * @done_q:   &list_head for processed commands.
    *
    * Notes:
@@@ -1567,7 -1567,7 +1567,7 @@@ static int scsi_eh_target_reset(struct 
   /**
    * scsi_eh_bus_reset - send a bus reset
    * @shost:    &scsi host being recovered.
- - * @work_q:     &list_head for pending commands.
+ + * @work_q:   &list_head for pending commands.
    * @done_q:   &list_head for processed commands.
    */
   static int scsi_eh_bus_reset(struct Scsi_Host *shost,
@@@ -1638,9 -1638,8 +1638,9 @@@
   
   /**
    * scsi_eh_host_reset - send a host reset
- - * @work_q:   list_head for processed commands.
- - * @done_q:   list_head for processed commands.
+ + * @shost:    host to be reset.
+ + * @work_q:   &list_head for pending commands.
+ + * @done_q:   &list_head for processed commands.
    */
   static int scsi_eh_host_reset(struct Scsi_Host *shost,
                               struct list_head *work_q,
@@@ -1678,8 -1677,8 +1678,8 @@@
   
   /**
    * scsi_eh_offline_sdevs - offline scsi devices that fail to recover
- - * @work_q:   list_head for processed commands.
- - * @done_q:   list_head for processed commands.
+ + * @work_q:   &list_head for pending commands.
+ + * @done_q:   &list_head for processed commands.
    */
   static void scsi_eh_offline_sdevs(struct list_head *work_q,
                                   struct list_head *done_q)
@@@ -1961,6 -1960,8 +1961,8 @@@ static void scsi_eh_lock_door(struct sc
          * request becomes available
          */
         req = blk_get_request(sdev->request_queue, READ, GFP_KERNEL);
+       if (IS_ERR(req))
+               return;
   
         blk_rq_set_block_pc(req);
   
@@@ -2044,8 -2045,8 +2046,8 @@@ static void scsi_restart_operations(str
   
   /**
    * scsi_eh_ready_devs - check device ready state and recover if not.
- - * @shost:    host to be recovered.
- - * @work_q:     &list_head for pending commands.
+ + * @shost:    host to be recovered.
+ + * @work_q:   &list_head for pending commands.
    * @done_q:   &list_head for processed commands.
    */
   void scsi_eh_ready_devs(struct Scsi_Host *shost,
diff --combined drivers/scsi/scsi_lib.c

index db8c449,5c5617e..9eff8a3
--- 1/drivers/scsi/scsi_lib.c
--- 2/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@@ -221,7 -221,7 +221,7 @@@ int scsi_execute(struct scsi_device *sd
         int ret = DRIVER_ERROR << 24;
   
         req = blk_get_request(sdev->request_queue, write, __GFP_WAIT);
-       if (!req)
+       if (IS_ERR(req))
                 return ret;
         blk_rq_set_block_pc(req);
   
@@@ -645,18 -645,16 +645,18 @@@ static void scsi_mq_free_sgtables(struc
   static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd)
   {
         struct scsi_device *sdev = cmd->device;
+ +      struct Scsi_Host *shost = sdev->host;
         unsigned long flags;
   
- -      BUG_ON(list_empty(&cmd->list));
- -
         scsi_mq_free_sgtables(cmd);
         scsi_uninit_cmd(cmd);
   
- -      spin_lock_irqsave(&sdev->list_lock, flags);
- -      list_del_init(&cmd->list);
- -      spin_unlock_irqrestore(&sdev->list_lock, flags);
+ +      if (shost->use_cmd_list) {
+ +              BUG_ON(list_empty(&cmd->list));
+ +              spin_lock_irqsave(&sdev->list_lock, flags);
+ +              list_del_init(&cmd->list);
+ +              spin_unlock_irqrestore(&sdev->list_lock, flags);
+ +      }
   }
   
   /*
@@@ -715,7 -713,7 +715,7 @@@ static bool scsi_end_request(struct req
   
         if (req->mq_ctx) {
                 /*
-                * In the MQ case the command gets freed by __blk_mq_end_io,
+                * In the MQ case the command gets freed by __blk_mq_end_request,
                  * so we have to do all cleanup that depends on it earlier.
                  *
                  * We also can't kick the queues from irq context, so we
@@@ -723,7 -721,7 +723,7 @@@
                  */
                 scsi_mq_uninit_cmd(cmd);
   
-               __blk_mq_end_io(req, error);
+               __blk_mq_end_request(req, error);
   
                 if (scsi_target(sdev)->single_lun ||
                     !list_empty(&sdev->host->starved_list))
@@@ -735,13 -733,12 +735,13 @@@
         } else {
                 unsigned long flags;
   
+ +              if (bidi_bytes)
+ +                      scsi_release_bidi_buffers(cmd);
+ +
                 spin_lock_irqsave(q->queue_lock, flags);
                 blk_finish_request(req, error);
                 spin_unlock_irqrestore(q->queue_lock, flags);
   
- -              if (bidi_bytes)
- -                      scsi_release_bidi_buffers(cmd);
                 scsi_release_buffers(cmd);
                 scsi_next_command(cmd);
         }
@@@ -1818,11 -1815,13 +1818,11 @@@ static int scsi_mq_prep_fn(struct reque
         INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler);
         cmd->jiffies_at_alloc = jiffies;
   
- -      /*
- -       * XXX: cmd_list lookups are only used by two drivers, try to get
- -       * rid of this list in common code.
- -       */
- -      spin_lock_irq(&sdev->list_lock);
- -      list_add_tail(&cmd->list, &sdev->cmd_list);
- -      spin_unlock_irq(&sdev->list_lock);
+ +      if (shost->use_cmd_list) {
+ +              spin_lock_irq(&sdev->list_lock);
+ +              list_add_tail(&cmd->list, &sdev->cmd_list);
+ +              spin_unlock_irq(&sdev->list_lock);
+ +      }
   
         sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
         cmd->sdb.table.sgl = sg;
@@@ -1847,6 -1846,8 +1847,8 @@@
                 next_rq->special = bidi_sdb;
         }
   
+       blk_mq_start_request(req);
+ 
         return scsi_setup_cmnd(sdev, req);
   }
   
@@@ -1856,7 -1857,8 +1858,8 @@@ static void scsi_mq_done(struct scsi_cm
         blk_mq_complete_request(cmd->request);
   }
   
- static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
+ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
+               bool last)
   {
         struct request_queue *q = req->q;
         struct scsi_device *sdev = q->queuedata;
@@@ -1880,11 -1882,14 +1883,14 @@@
         if (!scsi_host_queue_ready(q, shost, sdev))
                 goto out_dec_target_busy;
   
+ 
         if (!(req->cmd_flags & REQ_DONTPREP)) {
                 ret = prep_to_mq(scsi_mq_prep_fn(req));
                 if (ret)
                         goto out_dec_host_busy;
                 req->cmd_flags |= REQ_DONTPREP;
+       } else {
+               blk_mq_start_request(req);
         }
   
         scsi_init_cmd_errh(cmd);
@@@ -1931,6 -1936,14 +1937,14 @@@ out
         return ret;
   }
   
+ static enum blk_eh_timer_return scsi_timeout(struct request *req,
+               bool reserved)
+ {
+       if (reserved)
+               return BLK_EH_RESET_TIMER;
+       return scsi_times_out(req);
+ }
+ 
   static int scsi_init_request(void *data, struct request *rq,
                 unsigned int hctx_idx, unsigned int request_idx,
                 unsigned int numa_node)
@@@ -2042,7 -2055,7 +2056,7 @@@ static struct blk_mq_ops scsi_mq_ops = 
         .map_queue      = blk_mq_map_queue,
         .queue_rq       = scsi_queue_rq,
         .complete       = scsi_softirq_done,
-       .timeout        = scsi_times_out,
+       .timeout        = scsi_timeout,
         .init_request   = scsi_init_request,
         .exit_request   = scsi_exit_request,
   };
diff --combined drivers/scsi/sd.c

index 0cb5c9f,9f7099f..7ee8602
--- 1/drivers/scsi/sd.c
--- 2/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@@ -185,7 -185,7 +185,7 @@@ cache_type_store(struct device *dev, st
         if (ct < 0)
                 return -EINVAL;
         rcd = ct & 0x01 ? 1 : 0;
- -      wce = ct & 0x02 ? 1 : 0;
+ +      wce = (ct & 0x02) && !sdkp->write_prot ? 1 : 0;
   
         if (sdkp->cache_override) {
                 sdkp->WCE = wce;
@@@ -610,29 -610,44 +610,44 @@@ static void scsi_disk_put(struct scsi_d
         mutex_unlock(&sd_ref_mutex);
   }
   
- static void sd_prot_op(struct scsi_cmnd *scmd, unsigned int dif)
+ 
+ 
+ static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd,
+                                          unsigned int dix, unsigned int dif)
   {
-       unsigned int prot_op = SCSI_PROT_NORMAL;
-       unsigned int dix = scsi_prot_sg_count(scmd);
- 
-       if (scmd->sc_data_direction == DMA_FROM_DEVICE) {
-               if (dif && dix)
-                       prot_op = SCSI_PROT_READ_PASS;
-               else if (dif && !dix)
-                       prot_op = SCSI_PROT_READ_STRIP;
-               else if (!dif && dix)
-                       prot_op = SCSI_PROT_READ_INSERT;
-       } else {
-               if (dif && dix)
-                       prot_op = SCSI_PROT_WRITE_PASS;
-               else if (dif && !dix)
-                       prot_op = SCSI_PROT_WRITE_INSERT;
-               else if (!dif && dix)
-                       prot_op = SCSI_PROT_WRITE_STRIP;
+       struct bio *bio = scmd->request->bio;
+       unsigned int prot_op = sd_prot_op(rq_data_dir(scmd->request), dix, dif);
+       unsigned int protect = 0;
+ 
+       if (dix) {                              /* DIX Type 0, 1, 2, 3 */
+               if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM))
+                       scmd->prot_flags |= SCSI_PROT_IP_CHECKSUM;
+ 
+               if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false)
+                       scmd->prot_flags |= SCSI_PROT_GUARD_CHECK;
+       }
+ 
+       if (dif != SD_DIF_TYPE3_PROTECTION) {   /* DIX/DIF Type 0, 1, 2 */
+               scmd->prot_flags |= SCSI_PROT_REF_INCREMENT;
+ 
+               if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false)
+                       scmd->prot_flags |= SCSI_PROT_REF_CHECK;
+       }
+ 
+       if (dif) {                              /* DIX/DIF Type 1, 2, 3 */
+               scmd->prot_flags |= SCSI_PROT_TRANSFER_PI;
+ 
+               if (bio_integrity_flagged(bio, BIP_DISK_NOCHECK))
+                       protect = 3 << 5;       /* Disable target PI checking */
+               else
+                       protect = 1 << 5;       /* Enable target PI checking */
         }
   
         scsi_set_prot_op(scmd, prot_op);
         scsi_set_prot_type(scmd, dif);
+       scmd->prot_flags &= sd_prot_flag_mask(prot_op);
+ 
+       return protect;
   }
   
   static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
@@@ -893,7 -908,8 +908,8 @@@ static int sd_setup_read_write_cmnd(str
         sector_t block = blk_rq_pos(rq);
         sector_t threshold;
         unsigned int this_count = blk_rq_sectors(rq);
-       int ret, host_dif;
+       unsigned int dif, dix;
+       int ret;
         unsigned char protect;
   
         ret = scsi_init_io(SCpnt, GFP_ATOMIC);
@@@ -995,7 -1011,7 +1011,7 @@@
                 SCpnt->cmnd[0] = WRITE_6;
   
                 if (blk_integrity_rq(rq))
-                       sd_dif_prepare(rq, block, sdp->sector_size);
+                       sd_dif_prepare(SCpnt);
   
         } else if (rq_data_dir(rq) == READ) {
                 SCpnt->cmnd[0] = READ_6;
@@@ -1010,14 -1026,15 +1026,15 @@@
                                         "writing" : "reading", this_count,
                                         blk_rq_sectors(rq)));
   
-       /* Set RDPROTECT/WRPROTECT if disk is formatted with DIF */
-       host_dif = scsi_host_dif_capable(sdp->host, sdkp->protection_type);
-       if (host_dif)
-               protect = 1 << 5;
+       dix = scsi_prot_sg_count(SCpnt);
+       dif = scsi_host_dif_capable(SCpnt->device->host, sdkp->protection_type);
+ 
+       if (dif || dix)
+               protect = sd_setup_protect_cmnd(SCpnt, dix, dif);
         else
                 protect = 0;
   
-       if (host_dif == SD_DIF_TYPE2_PROTECTION) {
+       if (protect && sdkp->protection_type == SD_DIF_TYPE2_PROTECTION) {
                 SCpnt->cmnd = mempool_alloc(sd_cdb_pool, GFP_ATOMIC);
   
                 if (unlikely(SCpnt->cmnd == NULL)) {
@@@ -1102,10 -1119,6 +1119,6 @@@
         }
         SCpnt->sdb.length = this_count * sdp->sector_size;
   
-       /* If DIF or DIX is enabled, tell HBA how to handle request */
-       if (host_dif || scsi_prot_sg_count(SCpnt))
-               sd_prot_op(SCpnt, host_dif);
- 
         /*
          * We shouldn't disconnect in the middle of a sector, so with a dumb
          * host adapter, it's safe to assume that we can at least transfer
@@@ -2490,10 -2503,6 +2503,10 @@@ sd_read_cache_type(struct scsi_disk *sd
                         sdkp->DPOFUA = 0;
                 }
   
+ +              /* No cache flush allowed for write protected devices */
+ +              if (sdkp->WCE && sdkp->write_prot)
+ +                      sdkp->WCE = 0;
+ +
                 if (sdkp->first_scan || old_wce != sdkp->WCE ||
                     old_rcd != sdkp->RCD || old_dpofua != sdkp->DPOFUA)
                         sd_printk(KERN_NOTICE, sdkp,
@@@ -2965,7 -2974,6 +2978,7 @@@ static int sd_probe(struct device *dev
         int index;
         int error;
   
+ +      scsi_autopm_get_device(sdp);
         error = -ENODEV;
         if (sdp->type != TYPE_DISK && sdp->type != TYPE_MOD && sdp->type != TYPE_RBC)
                 goto out;
@@@ -3042,7 -3050,6 +3055,7 @@@
    out_free:
         kfree(sdkp);
    out:
+ +      scsi_autopm_put_device(sdp);
         return error;
   }
   
diff --combined drivers/scsi/st.c

index d3fd6e8,59db5bf..4daa372
--- 1/drivers/scsi/st.c
--- 2/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@@ -490,7 -490,7 +490,7 @@@ static int st_scsi_execute(struct st_re
   
         req = blk_get_request(SRpnt->stp->device->request_queue, write,
                               GFP_KERNEL);
-       if (!req)
+       if (IS_ERR(req))
                 return DRIVER_ERROR << 24;
   
         blk_rq_set_block_pc(req);
@@@ -4105,7 -4105,6 +4105,7 @@@ static int st_probe(struct device *dev
                 return -ENODEV;
         }
   
+ +      scsi_autopm_get_device(SDp);
         i = queue_max_segments(SDp->request_queue);
         if (st_max_sg_segs < i)
                 i = st_max_sg_segs;
@@@ -4245,7 -4244,6 +4245,7 @@@ out_put_disk
   out_buffer_free:
         kfree(buffer);
   out:
+ +      scsi_autopm_put_device(SDp);
         return -ENODEV;
   };
   
diff --combined fs/block_dev.c

index e2f3ad0,cc8d68a..cc9d411
--- 1/fs/block_dev.c
--- 2/fs/block_dev.c
+++ b/fs/block_dev.c
@@@ -50,32 -50,22 +50,22 @@@ inline struct block_device *I_BDEV(stru
   EXPORT_SYMBOL(I_BDEV);
   
   /*
-  * Move the inode from its current bdi to a new bdi. If the inode is dirty we
-  * need to move it onto the dirty list of @dst so that the inode is always on
-  * the right list.
+  * Move the inode from its current bdi to a new bdi.  Make sure the inode
+  * is clean before moving so that it doesn't linger on the old bdi.
    */
   static void bdev_inode_switch_bdi(struct inode *inode,
                         struct backing_dev_info *dst)
   {
-       struct backing_dev_info *old = inode->i_data.backing_dev_info;
-       bool wakeup_bdi = false;
- 
-       if (unlikely(dst == old))               /* deadlock avoidance */
-               return;
-       bdi_lock_two(&old->wb, &dst->wb);
-       spin_lock(&inode->i_lock);
-       inode->i_data.backing_dev_info = dst;
-       if (inode->i_state & I_DIRTY) {
-               if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb))
-                       wakeup_bdi = true;
-               list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+       while (true) {
+               spin_lock(&inode->i_lock);
+               if (!(inode->i_state & I_DIRTY)) {
+                       inode->i_data.backing_dev_info = dst;
+                       spin_unlock(&inode->i_lock);
+                       return;
+               }
+               spin_unlock(&inode->i_lock);
+               WARN_ON_ONCE(write_inode_now(inode, true));
         }
-       spin_unlock(&inode->i_lock);
-       spin_unlock(&old->wb.list_lock);
-       spin_unlock(&dst->wb.list_lock);
- 
-       if (wakeup_bdi)
-               bdi_wakeup_thread_delayed(dst);
   }
   
   /* Kill _all_ buffers and pagecache , dirty or not.. */
@@@ -304,12 -294,6 +294,12 @@@ static int blkdev_readpage(struct file 
         return block_read_full_page(page, blkdev_get_block);
   }
   
+ +static int blkdev_readpages(struct file *file, struct address_space *mapping,
+ +                      struct list_head *pages, unsigned nr_pages)
+ +{
+ +      return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
+ +}
+ +
   static int blkdev_write_begin(struct file *file, struct address_space *mapping,
                         loff_t pos, unsigned len, unsigned flags,
                         struct page **pagep, void **fsdata)
@@@ -1179,8 -1163,6 +1169,6 @@@ static int __blkdev_get(struct block_de
                         if (!ret) {
                                 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
                                 bdi = blk_get_backing_dev_info(bdev);
-                               if (bdi == NULL)
-                                       bdi = &default_backing_dev_info;
                                 bdev_inode_switch_bdi(bdev->bd_inode, bdi);
                         }
   
@@@ -1628,7 -1610,6 +1616,7 @@@ static int blkdev_releasepage(struct pa
   
   static const struct address_space_operations def_blk_aops = {
         .readpage       = blkdev_readpage,
+ +      .readpages      = blkdev_readpages,
         .writepage      = blkdev_writepage,
         .write_begin    = blkdev_write_begin,
         .write_end      = blkdev_write_end,
diff --combined fs/btrfs/disk-io.c

index fa45e3c,7e221b0..1ad0f47
--- 1/fs/btrfs/disk-io.c
--- 2/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@@ -72,41 -72,21 +72,41 @@@ static int btrfs_cleanup_transaction(st
   static void btrfs_error_commit_super(struct btrfs_root *root);
   
   /*
- - * end_io_wq structs are used to do processing in task context when an IO is
- - * complete.  This is used during reads to verify checksums, and it is used
+ + * btrfs_end_io_wq structs are used to do processing in task context when an IO
+ + * is complete.  This is used during reads to verify checksums, and it is used
    * by writes to insert metadata for new file extents after IO is complete.
    */
- -struct end_io_wq {
+ +struct btrfs_end_io_wq {
         struct bio *bio;
         bio_end_io_t *end_io;
         void *private;
         struct btrfs_fs_info *info;
         int error;
- -      int metadata;
+ +      enum btrfs_wq_endio_type metadata;
         struct list_head list;
         struct btrfs_work work;
   };
   
+ +static struct kmem_cache *btrfs_end_io_wq_cache;
+ +
+ +int __init btrfs_end_io_wq_init(void)
+ +{
+ +      btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
+ +                                      sizeof(struct btrfs_end_io_wq),
+ +                                      0,
+ +                                      SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ +                                      NULL);
+ +      if (!btrfs_end_io_wq_cache)
+ +              return -ENOMEM;
+ +      return 0;
+ +}
+ +
+ +void btrfs_end_io_wq_exit(void)
+ +{
+ +      if (btrfs_end_io_wq_cache)
+ +              kmem_cache_destroy(btrfs_end_io_wq_cache);
+ +}
+ +
   /*
    * async submit bios are used to offload expensive checksumming
    * onto the worker threads.  They checksum file and metadata bios
@@@ -347,7 -327,8 +347,7 @@@ static int verify_parent_transid(struc
   {
         struct extent_state *cached_state = NULL;
         int ret;
- -      bool need_lock = (current->journal_info ==
- -                        (void *)BTRFS_SEND_TRANS_STUB);
+ +      bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
   
         if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
                 return 0;
@@@ -367,9 -348,9 +367,9 @@@
                 ret = 0;
                 goto out;
         }
- -      printk_ratelimited("parent transid verify failed on %llu wanted %llu "
- -                     "found %llu\n",
- -                     eb->start, parent_transid, btrfs_header_generation(eb));
+ +      printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
+ +                      eb->fs_info->sb->s_id, eb->start,
+ +                      parent_transid, btrfs_header_generation(eb));
         ret = 1;
   
         /*
@@@ -626,22 -607,22 +626,22 @@@ static int btree_readpage_end_io_hook(s
                 goto err;
   
         eb->read_mirror = mirror;
- -      if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
+ +      if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
                 ret = -EIO;
                 goto err;
         }
   
         found_start = btrfs_header_bytenr(eb);
         if (found_start != eb->start) {
- -              printk_ratelimited(KERN_INFO "BTRFS: bad tree block start "
+ +              printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start "
                                "%llu %llu\n",
- -                             found_start, eb->start);
+ +                             eb->fs_info->sb->s_id, found_start, eb->start);
                 ret = -EIO;
                 goto err;
         }
         if (check_tree_block_fsid(root, eb)) {
- -              printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n",
- -                             eb->start);
+ +              printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n",
+ +                             eb->fs_info->sb->s_id, eb->start);
                 ret = -EIO;
                 goto err;
         }
@@@ -699,7 -680,7 +699,7 @@@ static int btree_io_failed_hook(struct 
         struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
   
         eb = (struct extent_buffer *)page->private;
- -      set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+ +      set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
         eb->read_mirror = failed_mirror;
         atomic_dec(&eb->io_pages);
         if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
@@@ -709,7 -690,7 +709,7 @@@
   
   static void end_workqueue_bio(struct bio *bio, int err)
   {
- -      struct end_io_wq *end_io_wq = bio->bi_private;
+ +      struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
         struct btrfs_fs_info *fs_info;
         struct btrfs_workqueue *wq;
         btrfs_work_func_t func;
@@@ -732,11 -713,7 +732,11 @@@
                         func = btrfs_endio_write_helper;
                 }
         } else {
- -              if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+ +              if (unlikely(end_io_wq->metadata ==
+ +                           BTRFS_WQ_ENDIO_DIO_REPAIR)) {
+ +                      wq = fs_info->endio_repair_workers;
+ +                      func = btrfs_endio_repair_helper;
+ +              } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
                         wq = fs_info->endio_raid56_workers;
                         func = btrfs_endio_raid56_helper;
                 } else if (end_io_wq->metadata) {
@@@ -752,12 -729,19 +752,12 @@@
         btrfs_queue_work(wq, &end_io_wq->work);
   }
   
- -/*
- - * For the metadata arg you want
- - *
- - * 0 - if data
- - * 1 - if normal metadta
- - * 2 - if writing to the free space cache area
- - * 3 - raid parity work
- - */
   int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- -                      int metadata)
+ +                      enum btrfs_wq_endio_type metadata)
   {
- -      struct end_io_wq *end_io_wq;
- -      end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
+ +      struct btrfs_end_io_wq *end_io_wq;
+ +
+ +      end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
         if (!end_io_wq)
                 return -ENOMEM;
   
@@@ -941,7 -925,7 +941,7 @@@ static int btree_submit_bio_hook(struc
                  * can happen in the async kernel threads
                  */
                 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
- -                                        bio, 1);
+ +                                        bio, BTRFS_WQ_ENDIO_METADATA);
                 if (ret)
                         goto out_w_error;
                 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
@@@ -1073,17 -1057,20 +1073,17 @@@ static const struct address_space_opera
         .set_page_dirty = btree_set_page_dirty,
   };
   
- -int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
- -                       u64 parent_transid)
+ +void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
   {
         struct extent_buffer *buf = NULL;
         struct inode *btree_inode = root->fs_info->btree_inode;
- -      int ret = 0;
   
         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
         if (!buf)
- -              return 0;
+ +              return;
         read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
                                  buf, 0, WAIT_NONE, btree_get_extent, 0);
         free_extent_buffer(buf);
- -      return ret;
   }
   
   int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@@ -1119,7 -1106,7 +1119,7 @@@
   }
   
   struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
- -                                          u64 bytenr, u32 blocksize)
+ +                                          u64 bytenr)
   {
         return find_extent_buffer(root->fs_info, bytenr);
   }
@@@ -1127,9 -1114,11 +1127,9 @@@
   struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
                                                  u64 bytenr, u32 blocksize)
   {
- -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- -      if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ +      if (btrfs_test_is_dummy_root(root))
                 return alloc_test_extent_buffer(root->fs_info, bytenr,
                                                 blocksize);
- -#endif
         return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
   }
   
@@@ -1147,12 -1136,12 +1147,12 @@@ int btrfs_wait_tree_block_writeback(str
   }
   
   struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
- -                                    u32 blocksize, u64 parent_transid)
+ +                                    u64 parent_transid)
   {
         struct extent_buffer *buf = NULL;
         int ret;
   
- -      buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ +      buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize);
         if (!buf)
                 return NULL;
   
@@@ -1194,7 -1183,7 +1194,7 @@@ static struct btrfs_subvolume_writers *
         if (!writers)
                 return ERR_PTR(-ENOMEM);
   
- -      ret = percpu_counter_init(&writers->counter, 0);
+ +      ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
         if (ret < 0) {
                 kfree(writers);
                 return ERR_PTR(ret);
@@@ -1211,14 -1200,16 +1211,14 @@@ btrfs_free_subvolume_writers(struct btr
         kfree(writers);
   }
   
- -static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
- -                       u32 stripesize, struct btrfs_root *root,
- -                       struct btrfs_fs_info *fs_info,
+ +static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
+ +                       struct btrfs_root *root, struct btrfs_fs_info *fs_info,
                          u64 objectid)
   {
         root->node = NULL;
         root->commit_root = NULL;
         root->sectorsize = sectorsize;
         root->nodesize = nodesize;
- -      root->leafsize = leafsize;
         root->stripesize = stripesize;
         root->state = 0;
         root->orphan_cleanup_state = 0;
@@@ -1304,7 -1295,7 +1304,7 @@@ struct btrfs_root *btrfs_alloc_dummy_ro
         root = btrfs_alloc_root(NULL);
         if (!root)
                 return ERR_PTR(-ENOMEM);
- -      __setup_root(4096, 4096, 4096, 4096, root, NULL, 1);
+ +      __setup_root(4096, 4096, 4096, root, NULL, 1);
         set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
         root->alloc_bytenr = 0;
   
@@@ -1327,13 -1318,15 +1327,13 @@@ struct btrfs_root *btrfs_create_tree(st
         if (!root)
                 return ERR_PTR(-ENOMEM);
   
- -      __setup_root(tree_root->nodesize, tree_root->leafsize,
- -                   tree_root->sectorsize, tree_root->stripesize,
- -                   root, fs_info, objectid);
+ +      __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ +              tree_root->stripesize, root, fs_info, objectid);
         root->root_key.objectid = objectid;
         root->root_key.type = BTRFS_ROOT_ITEM_KEY;
         root->root_key.offset = 0;
   
- -      leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
- -                                    0, objectid, NULL, 0, 0, 0);
+ +      leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
         if (IS_ERR(leaf)) {
                 ret = PTR_ERR(leaf);
                 leaf = NULL;
@@@ -1403,9 -1396,9 +1403,9 @@@ static struct btrfs_root *alloc_log_tre
         if (!root)
                 return ERR_PTR(-ENOMEM);
   
- -      __setup_root(tree_root->nodesize, tree_root->leafsize,
- -                   tree_root->sectorsize, tree_root->stripesize,
- -                   root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+ +      __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ +                   tree_root->stripesize, root, fs_info,
+ +                   BTRFS_TREE_LOG_OBJECTID);
   
         root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
         root->root_key.type = BTRFS_ROOT_ITEM_KEY;
@@@ -1420,8 -1413,9 +1420,8 @@@
          * updated (along with back refs to the log tree).
          */
   
- -      leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
- -                                    BTRFS_TREE_LOG_OBJECTID, NULL,
- -                                    0, 0, 0);
+ +      leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
+ +                      NULL, 0, 0, 0);
         if (IS_ERR(leaf)) {
                 kfree(root);
                 return ERR_CAST(leaf);
@@@ -1471,7 -1465,7 +1471,7 @@@ int btrfs_add_log_tree(struct btrfs_tra
         btrfs_set_stack_inode_generation(inode_item, 1);
         btrfs_set_stack_inode_size(inode_item, 3);
         btrfs_set_stack_inode_nlink(inode_item, 1);
- -      btrfs_set_stack_inode_nbytes(inode_item, root->leafsize);
+ +      btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
         btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
   
         btrfs_set_root_node(&log_root->root_item, log_root->node);
@@@ -1491,6 -1485,7 +1491,6 @@@ static struct btrfs_root *btrfs_read_tr
         struct btrfs_fs_info *fs_info = tree_root->fs_info;
         struct btrfs_path *path;
         u64 generation;
- -      u32 blocksize;
         int ret;
   
         path = btrfs_alloc_path();
@@@ -1503,8 -1498,9 +1503,8 @@@
                 goto alloc_fail;
         }
   
- -      __setup_root(tree_root->nodesize, tree_root->leafsize,
- -                   tree_root->sectorsize, tree_root->stripesize,
- -                   root, fs_info, key->objectid);
+ +      __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ +              tree_root->stripesize, root, fs_info, key->objectid);
   
         ret = btrfs_find_root(tree_root, key, path,
                               &root->root_item, &root->root_key);
@@@ -1515,8 -1511,9 +1515,8 @@@
         }
   
         generation = btrfs_root_generation(&root->root_item);
- -      blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
         root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
- -                                   blocksize, generation);
+ +                                   generation);
         if (!root->node) {
                 ret = -ENOMEM;
                 goto find_fail;
@@@ -1576,8 -1573,8 +1576,8 @@@ int btrfs_init_fs_root(struct btrfs_roo
         root->subv_writers = writers;
   
         btrfs_init_free_ino_ctl(root);
- -      spin_lock_init(&root->cache_lock);
- -      init_waitqueue_head(&root->cache_wait);
+ +      spin_lock_init(&root->ino_cache_lock);
+ +      init_waitqueue_head(&root->ino_cache_wait);
   
         ret = get_anon_bdev(&root->anon_dev);
         if (ret)
@@@ -1702,7 -1699,7 +1702,7 @@@ static int btrfs_congested_fn(void *con
                 if (!device->bdev)
                         continue;
                 bdi = blk_get_backing_dev_info(device->bdev);
-               if (bdi && bdi_congested(bdi, bdi_bits)) {
+               if (bdi_congested(bdi, bdi_bits)) {
                         ret = 1;
                         break;
                 }
@@@ -1711,6 -1708,10 +1711,6 @@@
         return ret;
   }
   
- -/*
- - * If this fails, caller must call bdi_destroy() to get rid of the
- - * bdi again.
- - */
   static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
   {
         int err;
@@@ -1733,16 -1734,16 +1733,16 @@@
   static void end_workqueue_fn(struct btrfs_work *work)
   {
         struct bio *bio;
- -      struct end_io_wq *end_io_wq;
+ +      struct btrfs_end_io_wq *end_io_wq;
         int error;
   
- -      end_io_wq = container_of(work, struct end_io_wq, work);
+ +      end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
         bio = end_io_wq->bio;
   
         error = end_io_wq->error;
         bio->bi_private = end_io_wq->private;
         bio->bi_end_io = end_io_wq->end_io;
- -      kfree(end_io_wq);
+ +      kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
         bio_endio_nodec(bio, error);
   }
   
@@@ -1771,7 -1772,6 +1771,7 @@@ static int cleaner_kthread(void *arg
                 }
   
                 btrfs_run_delayed_iputs(root);
+ +              btrfs_delete_unused_bgs(root->fs_info);
                 again = btrfs_clean_one_deleted_snapshot(root);
                 mutex_unlock(&root->fs_info->cleaner_mutex);
   
@@@ -2063,7 -2063,6 +2063,7 @@@ static void btrfs_stop_all_workers(stru
         btrfs_destroy_workqueue(fs_info->endio_workers);
         btrfs_destroy_workqueue(fs_info->endio_meta_workers);
         btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+ +      btrfs_destroy_workqueue(fs_info->endio_repair_workers);
         btrfs_destroy_workqueue(fs_info->rmw_workers);
         btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
         btrfs_destroy_workqueue(fs_info->endio_write_workers);
@@@ -2144,6 -2143,8 +2144,6 @@@ int open_ctree(struct super_block *sb
   {
         u32 sectorsize;
         u32 nodesize;
- -      u32 leafsize;
- -      u32 blocksize;
         u32 stripesize;
         u64 generation;
         u64 features;
@@@ -2187,7 -2188,7 +2187,7 @@@
                 goto fail_srcu;
         }
   
- -      ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
+ +      ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
         if (ret) {
                 err = ret;
                 goto fail_bdi;
@@@ -2195,13 -2196,13 +2195,13 @@@
         fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
                                         (1 + ilog2(nr_cpu_ids));
   
- -      ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
+ +      ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
         if (ret) {
                 err = ret;
                 goto fail_dirty_metadata_bytes;
         }
   
- -      ret = percpu_counter_init(&fs_info->bio_counter, 0);
+ +      ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
         if (ret) {
                 err = ret;
                 goto fail_delalloc_bytes;
@@@ -2232,7 -2233,6 +2232,7 @@@
         spin_lock_init(&fs_info->super_lock);
         spin_lock_init(&fs_info->qgroup_op_lock);
         spin_lock_init(&fs_info->buffer_lock);
+ +      spin_lock_init(&fs_info->unused_bgs_lock);
         rwlock_init(&fs_info->tree_mod_log_lock);
         mutex_init(&fs_info->reloc_mutex);
         mutex_init(&fs_info->delalloc_root_mutex);
@@@ -2242,7 -2242,6 +2242,7 @@@
         INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
         INIT_LIST_HEAD(&fs_info->space_info);
         INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+ +      INIT_LIST_HEAD(&fs_info->unused_bgs);
         btrfs_mapping_init(&fs_info->mapping_tree);
         btrfs_init_block_rsv(&fs_info->global_block_rsv,
                              BTRFS_BLOCK_RSV_GLOBAL);
@@@ -2261,7 -2260,7 +2261,7 @@@
         atomic_set(&fs_info->qgroup_op_seq, 0);
         atomic64_set(&fs_info->tree_mod_seq, 0);
         fs_info->sb = sb;
- -      fs_info->max_inline = 8192 * 1024;
+ +      fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
         fs_info->metadata_ratio = 0;
         fs_info->defrag_inodes = RB_ROOT;
         fs_info->free_chunk_space = 0;
@@@ -2390,7 -2389,7 +2390,7 @@@
                 goto fail_alloc;
         }
   
- -      __setup_root(4096, 4096, 4096, 4096, tree_root,
+ +      __setup_root(4096, 4096, 4096, tree_root,
                      fs_info, BTRFS_ROOT_TREE_OBJECTID);
   
         invalidate_bdev(fs_devices->latest_bdev);
@@@ -2470,22 -2469,19 +2470,22 @@@
                 goto fail_alloc;
         }
   
- -      if (btrfs_super_leafsize(disk_super) !=
+ +      /*
+ +       * Leafsize and nodesize were always equal, this is only a sanity check.
+ +       */
+ +      if (le32_to_cpu(disk_super->__unused_leafsize) !=
             btrfs_super_nodesize(disk_super)) {
                 printk(KERN_ERR "BTRFS: couldn't mount because metadata "
                        "blocksizes don't match.  node %d leaf %d\n",
                        btrfs_super_nodesize(disk_super),
- -                     btrfs_super_leafsize(disk_super));
+ +                     le32_to_cpu(disk_super->__unused_leafsize));
                 err = -EINVAL;
                 goto fail_alloc;
         }
- -      if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ +      if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
                 printk(KERN_ERR "BTRFS: couldn't mount because metadata "
                        "blocksize (%d) was too large\n",
- -                     btrfs_super_leafsize(disk_super));
+ +                     btrfs_super_nodesize(disk_super));
                 err = -EINVAL;
                 goto fail_alloc;
         }
@@@ -2502,16 -2498,17 +2502,16 @@@
          * flag our filesystem as having big metadata blocks if
          * they are bigger than the page size
          */
- -      if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
+ +      if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) {
                 if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
                         printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
                 features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
         }
   
         nodesize = btrfs_super_nodesize(disk_super);
- -      leafsize = btrfs_super_leafsize(disk_super);
         sectorsize = btrfs_super_sectorsize(disk_super);
         stripesize = btrfs_super_stripesize(disk_super);
- -      fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
+ +      fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
         fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
   
         /*
@@@ -2519,7 -2516,7 +2519,7 @@@
          * extent buffers for the same range.  It leads to corruptions
          */
         if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
- -          (sectorsize != leafsize)) {
+ +          (sectorsize != nodesize)) {
                 printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes "
                                 "are not allowed for mixed block groups on %s\n",
                                 sb->s_id);
@@@ -2582,8 -2579,6 +2582,8 @@@
                 btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
         fs_info->endio_raid56_workers =
                 btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+ +      fs_info->endio_repair_workers =
+ +              btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
         fs_info->rmw_workers =
                 btrfs_alloc_workqueue("rmw", flags, max_active, 2);
         fs_info->endio_write_workers =
@@@ -2605,12 -2600,11 +2605,12 @@@
               fs_info->submit_workers && fs_info->flush_workers &&
               fs_info->endio_workers && fs_info->endio_meta_workers &&
               fs_info->endio_meta_write_workers &&
+ +            fs_info->endio_repair_workers &&
               fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
               fs_info->endio_freespace_worker && fs_info->rmw_workers &&
               fs_info->caching_workers && fs_info->readahead_workers &&
               fs_info->fixup_workers && fs_info->delayed_workers &&
- -            fs_info->fixup_workers && fs_info->extent_workers &&
+ +            fs_info->extent_workers &&
               fs_info->qgroup_rescan_workers)) {
                 err = -ENOMEM;
                 goto fail_sb_buffer;
@@@ -2621,6 -2615,7 +2621,6 @@@
                                     4 * 1024 * 1024 / PAGE_CACHE_SIZE);
   
         tree_root->nodesize = nodesize;
- -      tree_root->leafsize = leafsize;
         tree_root->sectorsize = sectorsize;
         tree_root->stripesize = stripesize;
   
@@@ -2647,14 -2642,16 +2647,14 @@@
                 goto fail_sb_buffer;
         }
   
- -      blocksize = btrfs_level_size(tree_root,
- -                                   btrfs_super_chunk_root_level(disk_super));
         generation = btrfs_super_chunk_root_generation(disk_super);
   
- -      __setup_root(nodesize, leafsize, sectorsize, stripesize,
- -                   chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+ +      __setup_root(nodesize, sectorsize, stripesize, chunk_root,
+ +                   fs_info, BTRFS_CHUNK_TREE_OBJECTID);
   
         chunk_root->node = read_tree_block(chunk_root,
                                            btrfs_super_chunk_root(disk_super),
- -                                         blocksize, generation);
+ +                                         generation);
         if (!chunk_root->node ||
             !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
                 printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n",
@@@ -2687,11 -2684,13 +2687,11 @@@
         }
   
   retry_root_backup:
- -      blocksize = btrfs_level_size(tree_root,
- -                                   btrfs_super_root_level(disk_super));
         generation = btrfs_super_generation(disk_super);
   
         tree_root->node = read_tree_block(tree_root,
                                           btrfs_super_root(disk_super),
- -                                        blocksize, generation);
+ +                                        generation);
         if (!tree_root->node ||
             !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
                 printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
@@@ -2860,6 -2859,9 +2860,6 @@@
                         err = -EIO;
                         goto fail_qgroup;
                 }
- -              blocksize =
- -                   btrfs_level_size(tree_root,
- -                                    btrfs_super_log_root_level(disk_super));
   
                 log_tree_root = btrfs_alloc_root(fs_info);
                 if (!log_tree_root) {
@@@ -2867,10 -2869,11 +2867,10 @@@
                         goto fail_qgroup;
                 }
   
- -              __setup_root(nodesize, leafsize, sectorsize, stripesize,
+ +              __setup_root(nodesize, sectorsize, stripesize,
                              log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
   
                 log_tree_root->node = read_tree_block(tree_root, bytenr,
- -                                                    blocksize,
                                                       generation + 1);
                 if (!log_tree_root->node ||
                     !extent_buffer_uptodate(log_tree_root->node)) {
@@@ -2977,8 -2980,6 +2977,8 @@@
                 fs_info->update_uuid_tree_gen = 1;
         }
   
+ +      fs_info->open = 1;
+ +
         return 0;
   
   fail_qgroup:
@@@ -3138,8 -3139,7 +3138,8 @@@ static int write_dev_supers(struct btrf
   
         for (i = 0; i < max_mirrors; i++) {
                 bytenr = btrfs_sb_offset(i);
- -              if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+ +              if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+ +                  device->commit_total_bytes)
                         break;
   
                 if (wait) {
@@@ -3456,9 -3456,8 +3456,9 @@@ static int write_all_supers(struct btrf
                 btrfs_set_stack_device_type(dev_item, dev->type);
                 btrfs_set_stack_device_id(dev_item, dev->devid);
                 btrfs_set_stack_device_total_bytes(dev_item,
- -                                                 dev->disk_total_bytes);
- -              btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
+ +                                                 dev->commit_total_bytes);
+ +              btrfs_set_stack_device_bytes_used(dev_item,
+ +                                                dev->commit_bytes_used);
                 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
                 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
                 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
@@@ -3533,7 -3532,7 +3533,7 @@@ void btrfs_drop_and_free_fs_root(struc
   
   static void free_fs_root(struct btrfs_root *root)
   {
- -      iput(root->cache_inode);
+ +      iput(root->ino_cache_inode);
         WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
         btrfs_free_block_rsv(root, root->orphan_block_rsv);
         root->orphan_block_rsv = NULL;
@@@ -3624,7 -3623,7 +3624,7 @@@ int btrfs_commit_super(struct btrfs_roo
         return btrfs_commit_transaction(trans, root);
   }
   
- -int close_ctree(struct btrfs_root *root)
+ +void close_ctree(struct btrfs_root *root)
   {
         struct btrfs_fs_info *fs_info = root->fs_info;
         int ret;
@@@ -3690,7 -3689,6 +3690,7 @@@
         invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
         btrfs_stop_all_workers(fs_info);
   
+ +      fs_info->open = 0;
         free_root_pointers(fs_info, 1);
   
         iput(fs_info->btree_inode);
@@@ -3713,6 -3711,8 +3713,6 @@@
   
         btrfs_free_block_rsv(root, root->orphan_block_rsv);
         root->orphan_block_rsv = NULL;
- -
- -      return 0;
   }
   
   int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@@ -3814,73 -3814,10 +3814,73 @@@ int btrfs_read_buffer(struct extent_buf
   static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
                               int read_only)
   {
+ +      struct btrfs_super_block *sb = fs_info->super_copy;
+ +      int ret = 0;
+ +
+ +      if (sb->root_level > BTRFS_MAX_LEVEL) {
+ +              printk(KERN_ERR "BTRFS: tree_root level too big: %d > %d\n",
+ +                              sb->root_level, BTRFS_MAX_LEVEL);
+ +              ret = -EINVAL;
+ +      }
+ +      if (sb->chunk_root_level > BTRFS_MAX_LEVEL) {
+ +              printk(KERN_ERR "BTRFS: chunk_root level too big: %d > %d\n",
+ +                              sb->chunk_root_level, BTRFS_MAX_LEVEL);
+ +              ret = -EINVAL;
+ +      }
+ +      if (sb->log_root_level > BTRFS_MAX_LEVEL) {
+ +              printk(KERN_ERR "BTRFS: log_root level too big: %d > %d\n",
+ +                              sb->log_root_level, BTRFS_MAX_LEVEL);
+ +              ret = -EINVAL;
+ +      }
+ +
         /*
- -       * Placeholder for checks
+ +       * The common minimum, we don't know if we can trust the nodesize/sectorsize
+ +       * items yet, they'll be verified later. Issue just a warning.
          */
- -      return 0;
+ +      if (!IS_ALIGNED(sb->root, 4096))
+ +              printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+ +                              sb->root);
+ +      if (!IS_ALIGNED(sb->chunk_root, 4096))
+ +              printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+ +                              sb->chunk_root);
+ +      if (!IS_ALIGNED(sb->log_root, 4096))
+ +              printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+ +                              sb->log_root);
+ +
+ +      if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
+ +              printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
+ +                              fs_info->fsid, sb->dev_item.fsid);
+ +              ret = -EINVAL;
+ +      }
+ +
+ +      /*
+ +       * Hint to catch really bogus numbers, bitflips or so, more exact checks are
+ +       * done later
+ +       */
+ +      if (sb->num_devices > (1UL << 31))
+ +              printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
+ +                              sb->num_devices);
+ +
+ +      if (sb->bytenr != BTRFS_SUPER_INFO_OFFSET) {
+ +              printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
+ +                              sb->bytenr, BTRFS_SUPER_INFO_OFFSET);
+ +              ret = -EINVAL;
+ +      }
+ +
+ +      /*
+ +       * The generation is a global counter, we'll trust it more than the others
+ +       * but it's still possible that it's the one that's wrong.
+ +       */
+ +      if (sb->generation < sb->chunk_root_generation)
+ +              printk(KERN_WARNING
+ +                      "BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n",
+ +                      sb->generation, sb->chunk_root_generation);
+ +      if (sb->generation < sb->cache_generation && sb->cache_generation != (u64)-1)
+ +              printk(KERN_WARNING
+ +                      "BTRFS: suspicious: generation < cache_generation: %llu < %llu\n",
+ +                      sb->generation, sb->cache_generation);
+ +
+ +      return ret;
   }
   
   static void btrfs_error_commit_super(struct btrfs_root *root)
@@@ -4072,8 -4009,9 +4072,8 @@@ static int btrfs_destroy_marked_extents
   
                 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
                 while (start <= end) {
- -                      eb = btrfs_find_tree_block(root, start,
- -                                                 root->leafsize);
- -                      start += root->leafsize;
+ +                      eb = btrfs_find_tree_block(root, start);
+ +                      start += root->nodesize;
                         if (!eb)
                                 continue;
                         wait_on_extent_buffer_writeback(eb);
diff --combined fs/nfs/direct.c

index dda4b86,891f7dd..20cffc8
--- 1/fs/nfs/direct.c
--- 2/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@@ -178,6 -178,7 +178,6 @@@ static int nfs_direct_set_or_cmp_hdr_ve
         return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
   }
   
- -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
   /*
    * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
    * @dreq - direct request possibly spanning multiple servers
@@@ -196,6 -197,7 +196,6 @@@ static int nfs_direct_cmp_commit_data_v
         WARN_ON_ONCE(verfp->committed < 0);
         return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
   }
- -#endif
   
   /**
    * nfs_direct_IO - NFS address space operation for direct I/O
@@@ -220,11 -222,9 +220,9 @@@ ssize_t nfs_direct_IO(int rw, struct ki
   #else
         VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
   
-       if (rw == READ || rw == KERNEL_READ)
-               return nfs_file_direct_read(iocb, iter, pos,
-                               rw == READ ? true : false);
-       return nfs_file_direct_write(iocb, iter, pos,
-                               rw == WRITE ? true : false);
+       if (rw == READ)
+               return nfs_file_direct_read(iocb, iter, pos);
+       return nfs_file_direct_write(iocb, iter, pos);
   #endif /* CONFIG_NFS_SWAP */
   }
   
@@@ -510,7 -510,7 +508,7 @@@ static ssize_t nfs_direct_read_schedule
    * cache.
    */
   ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
-                               loff_t pos, bool uio)
+                               loff_t pos)
   {
         struct file *file = iocb->ki_filp;
         struct address_space *mapping = file->f_mapping;
@@@ -574,6 -574,7 +572,6 @@@ out
         return result;
   }
   
- -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
   static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
   {
         struct nfs_pageio_descriptor desc;
@@@ -697,6 -698,17 +695,6 @@@ static void nfs_direct_write_complete(s
         schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
   }
   
- -#else
- -static void nfs_direct_write_schedule_work(struct work_struct *work)
- -{
- -}
- -
- -static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
- -{
- -      nfs_direct_complete(dreq, true);
- -}
- -#endif
- -
   static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
   {
         struct nfs_direct_req *dreq = hdr->dreq;
@@@ -879,7 -891,7 +877,7 @@@ static ssize_t nfs_direct_write_schedul
    * is no atomic O_APPEND write facility in the NFS protocol.
    */
   ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
-                               loff_t pos, bool uio)
+                               loff_t pos)
   {
         ssize_t result = -EINVAL;
         struct file *file = iocb->ki_filp;
diff --combined fs/nfs/file.c

index 4ea92ce,3b42cb8..2ab6f00
--- 1/fs/nfs/file.c
--- 2/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@@ -36,7 -36,6 +36,7 @@@
   #include "internal.h"
   #include "iostat.h"
   #include "fscache.h"
+ +#include "pnfs.h"
   
   #include "nfstrace.h"
   
@@@ -172,7 -171,7 +172,7 @@@ nfs_file_read(struct kiocb *iocb, struc
         ssize_t result;
   
         if (iocb->ki_filp->f_flags & O_DIRECT)
-               return nfs_file_direct_read(iocb, to, iocb->ki_pos, true);
+               return nfs_file_direct_read(iocb, to, iocb->ki_pos);
   
         dprintk("NFS: read(%pD2, %zu@%lu)\n",
                 iocb->ki_filp,
@@@ -328,12 -327,6 +328,12 @@@ static int nfs_want_read_modify_write(s
         unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
         unsigned int end = offset + len;
   
+ +      if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
+ +              if (!PageUptodate(page))
+ +                      return 1;
+ +              return 0;
+ +      }
+ +
         if ((file->f_mode & FMODE_READ) &&      /* open for read? */
             !PageUptodate(page) &&              /* Uptodate? */
             !PagePrivate(page) &&               /* i/o request already? */
@@@ -475,26 -468,17 +475,26 @@@ static int nfs_release_page(struct pag
   
         dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
   
- -      /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not
- -       * doing this memory reclaim for a fs-related allocation.
+ +      /* Always try to initiate a 'commit' if relevant, but only
+ +       * wait for it if __GFP_WAIT is set.  Even then, only wait 1
+ +       * second and only if the 'bdi' is not congested.
+ +       * Waiting indefinitely can cause deadlocks when the NFS
+ +       * server is on this machine, when a new TCP connection is
+ +       * needed and in other rare cases.  There is no particular
+ +       * need to wait extensively here.  A short wait has the
+ +       * benefit that someone else can worry about the freezer.
          */
- -      if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL &&
- -          !(current->flags & PF_FSTRANS)) {
- -              int how = FLUSH_SYNC;
- -
- -              /* Don't let kswapd deadlock waiting for OOM RPC calls */
- -              if (current_is_kswapd())
- -                      how = 0;
- -              nfs_commit_inode(mapping->host, how);
+ +      if (mapping) {
+ +              struct nfs_server *nfss = NFS_SERVER(mapping->host);
+ +              nfs_commit_inode(mapping->host, 0);
+ +              if ((gfp & __GFP_WAIT) &&
+ +                  !bdi_write_congested(&nfss->backing_dev_info)) {
+ +                      wait_on_page_bit_killable_timeout(page, PG_private,
+ +                                                        HZ);
+ +                      if (PagePrivate(page))
+ +                              set_bdi_congested(&nfss->backing_dev_info,
+ +                                                BLK_RW_ASYNC);
+ +              }
         }
         /* If PagePrivate() is set, then the page is not freeable */
         if (PagePrivate(page))
@@@ -555,25 -539,13 +555,25 @@@ static int nfs_launder_page(struct pag
   static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
                                                 sector_t *span)
   {
+ +      int ret;
+ +      struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
+ +
         *span = sis->pages;
- -      return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1);
+ +
+ +      rcu_read_lock();
+ +      ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
+ +      rcu_read_unlock();
+ +
+ +      return ret;
   }
   
   static void nfs_swap_deactivate(struct file *file)
   {
- -      xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0);
+ +      struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
+ +
+ +      rcu_read_lock();
+ +      xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
+ +      rcu_read_unlock();
   }
   #endif
   
@@@ -676,7 -648,7 +676,7 @@@ ssize_t nfs_file_write(struct kiocb *io
                 return result;
   
         if (file->f_flags & O_DIRECT)
-               return nfs_file_direct_write(iocb, from, pos, true);
+               return nfs_file_direct_write(iocb, from, pos);
   
         dprintk("NFS: write(%pD2, %zu@%Ld)\n",
                 file, count, (long long) pos);
@@@ -919,6 -891,17 +919,6 @@@ int nfs_flock(struct file *filp, int cm
   }
   EXPORT_SYMBOL_GPL(nfs_flock);
   
- -/*
- - * There is no protocol support for leases, so we have no way to implement
- - * them correctly in the face of opens by other clients.
- - */
- -int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
- -{
- -      dprintk("NFS: setlease(%pD2, arg=%ld)\n", file, arg);
- -      return -EINVAL;
- -}
- -EXPORT_SYMBOL_GPL(nfs_setlease);
- -
   const struct file_operations nfs_file_operations = {
         .llseek         = nfs_file_llseek,
         .read           = new_sync_read,
@@@ -935,6 -918,6 +935,6 @@@
         .splice_read    = nfs_file_splice_read,
         .splice_write   = iter_file_splice_write,
         .check_flags    = nfs_check_flags,
- -      .setlease       = nfs_setlease,
+ +      .setlease       = simple_nosetlease,
   };
   EXPORT_SYMBOL_GPL(nfs_file_operations);
diff --combined fs/xfs/xfs_buf.c

index 017b6af,497fcde..24b4ebe
--- 1/fs/xfs/xfs_buf.c
--- 2/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@@ -623,11 -623,10 +623,11 @@@ _xfs_buf_read
         bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
         bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
   
- -      xfs_buf_iorequest(bp);
- -      if (flags & XBF_ASYNC)
+ +      if (flags & XBF_ASYNC) {
+ +              xfs_buf_submit(bp);
                 return 0;
- -      return xfs_buf_iowait(bp);
+ +      }
+ +      return xfs_buf_submit_wait(bp);
   }
   
   xfs_buf_t *
@@@ -688,39 -687,34 +688,39 @@@ xfs_buf_readahead_map
    * Read an uncached buffer from disk. Allocates and returns a locked
    * buffer containing the disk contents or nothing.
    */
- -struct xfs_buf *
+ +int
   xfs_buf_read_uncached(
         struct xfs_buftarg      *target,
         xfs_daddr_t             daddr,
         size_t                  numblks,
         int                     flags,
+ +      struct xfs_buf          **bpp,
         const struct xfs_buf_ops *ops)
   {
         struct xfs_buf          *bp;
   
+ +      *bpp = NULL;
+ +
         bp = xfs_buf_get_uncached(target, numblks, flags);
         if (!bp)
- -              return NULL;
+ +              return -ENOMEM;
   
         /* set up the buffer for a read IO */
         ASSERT(bp->b_map_count == 1);
- -      bp->b_bn = daddr;
+ +      bp->b_bn = XFS_BUF_DADDR_NULL;  /* always null for uncached buffers */
         bp->b_maps[0].bm_bn = daddr;
         bp->b_flags |= XBF_READ;
         bp->b_ops = ops;
   
- -      if (XFS_FORCED_SHUTDOWN(target->bt_mount)) {
+ +      xfs_buf_submit_wait(bp);
+ +      if (bp->b_error) {
+ +              int     error = bp->b_error;
                 xfs_buf_relse(bp);
- -              return NULL;
+ +              return error;
         }
- -      xfs_buf_iorequest(bp);
- -      xfs_buf_iowait(bp);
- -      return bp;
+ +
+ +      *bpp = bp;
+ +      return 0;
   }
   
   /*
@@@ -1004,56 -998,53 +1004,56 @@@ xfs_buf_wait_unpin
    *    Buffer Utility Routines
    */
   
- -STATIC void
- -xfs_buf_iodone_work(
- -      struct work_struct      *work)
+ +void
+ +xfs_buf_ioend(
+ +      struct xfs_buf  *bp)
   {
- -      struct xfs_buf          *bp =
- -              container_of(work, xfs_buf_t, b_iodone_work);
- -      bool                    read = !!(bp->b_flags & XBF_READ);
+ +      bool            read = bp->b_flags & XBF_READ;
+ +
+ +      trace_xfs_buf_iodone(bp, _RET_IP_);
   
         bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
   
- -      /* only validate buffers that were read without errors */
- -      if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE))
+ +      /*
+ +       * Pull in IO completion errors now. We are guaranteed to be running
+ +       * single threaded, so we don't need the lock to read b_io_error.
+ +       */
+ +      if (!bp->b_error && bp->b_io_error)
+ +              xfs_buf_ioerror(bp, bp->b_io_error);
+ +
+ +      /* Only validate buffers that were read without errors */
+ +      if (read && !bp->b_error && bp->b_ops) {
+ +              ASSERT(!bp->b_iodone);
                 bp->b_ops->verify_read(bp);
+ +      }
+ +
+ +      if (!bp->b_error)
+ +              bp->b_flags |= XBF_DONE;
   
         if (bp->b_iodone)
                 (*(bp->b_iodone))(bp);
         else if (bp->b_flags & XBF_ASYNC)
                 xfs_buf_relse(bp);
- -      else {
- -              ASSERT(read && bp->b_ops);
+ +      else
                 complete(&bp->b_iowait);
- -      }
   }
   
- -void
- -xfs_buf_ioend(
- -      struct xfs_buf  *bp,
- -      int             schedule)
+ +static void
+ +xfs_buf_ioend_work(
+ +      struct work_struct      *work)
   {
- -      bool            read = !!(bp->b_flags & XBF_READ);
- -
- -      trace_xfs_buf_iodone(bp, _RET_IP_);
+ +      struct xfs_buf          *bp =
+ +              container_of(work, xfs_buf_t, b_iodone_work);
   
- -      if (bp->b_error == 0)
- -              bp->b_flags |= XBF_DONE;
+ +      xfs_buf_ioend(bp);
+ +}
   
- -      if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
- -              if (schedule) {
- -                      INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
- -                      queue_work(xfslogd_workqueue, &bp->b_iodone_work);
- -              } else {
- -                      xfs_buf_iodone_work(&bp->b_iodone_work);
- -              }
- -      } else {
- -              bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
- -              complete(&bp->b_iowait);
- -      }
+ +void
+ +xfs_buf_ioend_async(
+ +      struct xfs_buf  *bp)
+ +{
+ +      INIT_WORK(&bp->b_iodone_work, xfs_buf_ioend_work);
+ +      queue_work(xfslogd_workqueue, &bp->b_iodone_work);
   }
   
   void
@@@ -1076,6 -1067,96 +1076,6 @@@ xfs_buf_ioerror_alert
                 (__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length);
   }
   
- -/*
- - * Called when we want to stop a buffer from getting written or read.
- - * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
- - * so that the proper iodone callbacks get called.
- - */
- -STATIC int
- -xfs_bioerror(
- -      xfs_buf_t *bp)
- -{
- -#ifdef XFSERRORDEBUG
- -      ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
- -#endif
- -
- -      /*
- -       * No need to wait until the buffer is unpinned, we aren't flushing it.
- -       */
- -      xfs_buf_ioerror(bp, -EIO);
- -
- -      /*
- -       * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
- -       */
- -      XFS_BUF_UNREAD(bp);
- -      XFS_BUF_UNDONE(bp);
- -      xfs_buf_stale(bp);
- -
- -      xfs_buf_ioend(bp, 0);
- -
- -      return -EIO;
- -}
- -
- -/*
- - * Same as xfs_bioerror, except that we are releasing the buffer
- - * here ourselves, and avoiding the xfs_buf_ioend call.
- - * This is meant for userdata errors; metadata bufs come with
- - * iodone functions attached, so that we can track down errors.
- - */
- -int
- -xfs_bioerror_relse(
- -      struct xfs_buf  *bp)
- -{
- -      int64_t         fl = bp->b_flags;
- -      /*
- -       * No need to wait until the buffer is unpinned.
- -       * We aren't flushing it.
- -       *
- -       * chunkhold expects B_DONE to be set, whether
- -       * we actually finish the I/O or not. We don't want to
- -       * change that interface.
- -       */
- -      XFS_BUF_UNREAD(bp);
- -      XFS_BUF_DONE(bp);
- -      xfs_buf_stale(bp);
- -      bp->b_iodone = NULL;
- -      if (!(fl & XBF_ASYNC)) {
- -              /*
- -               * Mark b_error and B_ERROR _both_.
- -               * Lot's of chunkcache code assumes that.
- -               * There's no reason to mark error for
- -               * ASYNC buffers.
- -               */
- -              xfs_buf_ioerror(bp, -EIO);
- -              complete(&bp->b_iowait);
- -      } else {
- -              xfs_buf_relse(bp);
- -      }
- -
- -      return -EIO;
- -}
- -
- -STATIC int
- -xfs_bdstrat_cb(
- -      struct xfs_buf  *bp)
- -{
- -      if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
- -              trace_xfs_bdstrat_shut(bp, _RET_IP_);
- -              /*
- -               * Metadata write that didn't get logged but
- -               * written delayed anyway. These aren't associated
- -               * with a transaction, and can be ignored.
- -               */
- -              if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
- -                      return xfs_bioerror_relse(bp);
- -              else
- -                      return xfs_bioerror(bp);
- -      }
- -
- -      xfs_buf_iorequest(bp);
- -      return 0;
- -}
- -
   int
   xfs_bwrite(
         struct xfs_buf          *bp)
@@@ -1085,10 -1166,11 +1085,10 @@@
         ASSERT(xfs_buf_islocked(bp));
   
         bp->b_flags |= XBF_WRITE;
- -      bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL);
+ +      bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
+ +                       XBF_WRITE_FAIL | XBF_DONE);
   
- -      xfs_bdstrat_cb(bp);
- -
- -      error = xfs_buf_iowait(bp);
+ +      error = xfs_buf_submit_wait(bp);
         if (error) {
                 xfs_force_shutdown(bp->b_target->bt_mount,
                                    SHUTDOWN_META_IO_ERROR);
@@@ -1097,6 -1179,15 +1097,6 @@@
   }
   
   STATIC void
- -_xfs_buf_ioend(
- -      xfs_buf_t               *bp,
- -      int                     schedule)
- -{
- -      if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
- -              xfs_buf_ioend(bp, schedule);
- -}
- -
- -STATIC void
   xfs_buf_bio_end_io(
         struct bio              *bio,
         int                     error)
@@@ -1107,18 -1198,13 +1107,18 @@@
          * don't overwrite existing errors - otherwise we can lose errors on
          * buffers that require multiple bios to complete.
          */
- -      if (!bp->b_error)
- -              xfs_buf_ioerror(bp, error);
+ +      if (error) {
+ +              spin_lock(&bp->b_lock);
+ +              if (!bp->b_io_error)
+ +                      bp->b_io_error = error;
+ +              spin_unlock(&bp->b_lock);
+ +      }
   
         if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
                 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
   
- -      _xfs_buf_ioend(bp, 1);
+ +      if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+ +              xfs_buf_ioend_async(bp);
         bio_put(bio);
   }
   
@@@ -1197,7 -1283,7 +1197,7 @@@ next_chunk
         } else {
                 /*
                  * This is guaranteed not to be the last io reference count
- -               * because the caller (xfs_buf_iorequest) holds a count itself.
+ +               * because the caller (xfs_buf_submit) holds a count itself.
                  */
                 atomic_dec(&bp->b_io_remaining);
                 xfs_buf_ioerror(bp, -EIO);
@@@ -1287,131 -1373,53 +1287,131 @@@ _xfs_buf_ioapply
         blk_finish_plug(&plug);
   }
   
+ +/*
+ + * Asynchronous IO submission path. This transfers the buffer lock ownership and
+ + * the current reference to the IO. It is not safe to reference the buffer after
+ + * a call to this function unless the caller holds an additional reference
+ + * itself.
+ + */
   void
- -xfs_buf_iorequest(
- -      xfs_buf_t               *bp)
+ +xfs_buf_submit(
+ +      struct xfs_buf  *bp)
   {
- -      trace_xfs_buf_iorequest(bp, _RET_IP_);
+ +      trace_xfs_buf_submit(bp, _RET_IP_);
   
         ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+ +      ASSERT(bp->b_flags & XBF_ASYNC);
+ +
+ +      /* on shutdown we stale and complete the buffer immediately */
+ +      if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+ +              xfs_buf_ioerror(bp, -EIO);
+ +              bp->b_flags &= ~XBF_DONE;
+ +              xfs_buf_stale(bp);
+ +              xfs_buf_ioend(bp);
+ +              return;
+ +      }
   
         if (bp->b_flags & XBF_WRITE)
                 xfs_buf_wait_unpin(bp);
+ +
+ +      /* clear the internal error state to avoid spurious errors */
+ +      bp->b_io_error = 0;
+ +
+ +      /*
+ +       * The caller's reference is released during I/O completion.
+ +       * This occurs some time after the last b_io_remaining reference is
+ +       * released, so after we drop our Io reference we have to have some
+ +       * other reference to ensure the buffer doesn't go away from underneath
+ +       * us. Take a direct reference to ensure we have safe access to the
+ +       * buffer until we are finished with it.
+ +       */
         xfs_buf_hold(bp);
   
         /*
- -       * Set the count to 1 initially, this will stop an I/O
- -       * completion callout which happens before we have started
- -       * all the I/O from calling xfs_buf_ioend too early.
+ +       * Set the count to 1 initially, this will stop an I/O completion
+ +       * callout which happens before we have started all the I/O from calling
+ +       * xfs_buf_ioend too early.
          */
         atomic_set(&bp->b_io_remaining, 1);
         _xfs_buf_ioapply(bp);
+ +
         /*
- -       * If _xfs_buf_ioapply failed, we'll get back here with
- -       * only the reference we took above.  _xfs_buf_ioend will
- -       * drop it to zero, so we'd better not queue it for later,
- -       * or we'll free it before it's done.
+ +       * If _xfs_buf_ioapply failed, we can get back here with only the IO
+ +       * reference we took above. If we drop it to zero, run completion so
+ +       * that we don't return to the caller with completion still pending.
          */
- -      _xfs_buf_ioend(bp, bp->b_error ? 0 : 1);
+ +      if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
+ +              if (bp->b_error)
+ +                      xfs_buf_ioend(bp);
+ +              else
+ +                      xfs_buf_ioend_async(bp);
+ +      }
   
         xfs_buf_rele(bp);
+ +      /* Note: it is not safe to reference bp now we've dropped our ref */
   }
   
   /*
- - * Waits for I/O to complete on the buffer supplied.  It returns immediately if
- - * no I/O is pending or there is already a pending error on the buffer, in which
- - * case nothing will ever complete.  It returns the I/O error code, if any, or
- - * 0 if there was no error.
+ + * Synchronous buffer IO submission path, read or write.
    */
   int
- -xfs_buf_iowait(
- -      xfs_buf_t               *bp)
+ +xfs_buf_submit_wait(
+ +      struct xfs_buf  *bp)
   {
- -      trace_xfs_buf_iowait(bp, _RET_IP_);
+ +      int             error;
   
- -      if (!bp->b_error)
- -              wait_for_completion(&bp->b_iowait);
+ +      trace_xfs_buf_submit_wait(bp, _RET_IP_);
+ +
+ +      ASSERT(!(bp->b_flags & (_XBF_DELWRI_Q | XBF_ASYNC)));
+ +
+ +      if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+ +              xfs_buf_ioerror(bp, -EIO);
+ +              xfs_buf_stale(bp);
+ +              bp->b_flags &= ~XBF_DONE;
+ +              return -EIO;
+ +      }
+ +
+ +      if (bp->b_flags & XBF_WRITE)
+ +              xfs_buf_wait_unpin(bp);
+ +
+ +      /* clear the internal error state to avoid spurious errors */
+ +      bp->b_io_error = 0;
+ +
+ +      /*
+ +       * For synchronous IO, the IO does not inherit the submitters reference
+ +       * count, nor the buffer lock. Hence we cannot release the reference we
+ +       * are about to take until we've waited for all IO completion to occur,
+ +       * including any xfs_buf_ioend_async() work that may be pending.
+ +       */
+ +      xfs_buf_hold(bp);
+ +
+ +      /*
+ +       * Set the count to 1 initially, this will stop an I/O completion
+ +       * callout which happens before we have started all the I/O from calling
+ +       * xfs_buf_ioend too early.
+ +       */
+ +      atomic_set(&bp->b_io_remaining, 1);
+ +      _xfs_buf_ioapply(bp);
+ +
+ +      /*
+ +       * make sure we run completion synchronously if it raced with us and is
+ +       * already complete.
+ +       */
+ +      if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+ +              xfs_buf_ioend(bp);
   
+ +      /* wait for completion before gathering the error from the buffer */
+ +      trace_xfs_buf_iowait(bp, _RET_IP_);
+ +      wait_for_completion(&bp->b_iowait);
         trace_xfs_buf_iowait_done(bp, _RET_IP_);
- -      return bp->b_error;
+ +      error = bp->b_error;
+ +
+ +      /*
+ +       * all done now, we can release the hold that keeps the buffer
+ +       * referenced for the entire IO.
+ +       */
+ +      xfs_buf_rele(bp);
+ +      return error;
   }
   
   xfs_caddr_t
@@@ -1670,8 -1678,6 +1670,6 @@@ xfs_alloc_buftarg
         btp->bt_dev =  bdev->bd_dev;
         btp->bt_bdev = bdev;
         btp->bt_bdi = blk_get_backing_dev_info(bdev);
-       if (!btp->bt_bdi)
-               goto error;
   
         if (xfs_setsize_buftarg_early(btp, bdev))
                 goto error;
@@@ -1805,19 -1811,13 +1803,19 @@@ __xfs_buf_delwri_submit
         blk_start_plug(&plug);
         list_for_each_entry_safe(bp, n, io_list, b_list) {
                 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
- -              bp->b_flags |= XBF_WRITE;
+ +              bp->b_flags |= XBF_WRITE | XBF_ASYNC;
   
- -              if (!wait) {
- -                      bp->b_flags |= XBF_ASYNC;
+ +              /*
+ +               * we do all Io submission async. This means if we need to wait
+ +               * for IO completion we need to take an extra reference so the
+ +               * buffer is still valid on the other side.
+ +               */
+ +              if (wait)
+ +                      xfs_buf_hold(bp);
+ +              else
                         list_del_init(&bp->b_list);
- -              }
- -              xfs_bdstrat_cb(bp);
+ +
+ +              xfs_buf_submit(bp);
         }
         blk_finish_plug(&plug);
   
@@@ -1864,10 -1864,7 +1862,10 @@@ xfs_buf_delwri_submit
                 bp = list_first_entry(&io_list, struct xfs_buf, b_list);
   
                 list_del_init(&bp->b_list);
- -              error2 = xfs_buf_iowait(bp);
+ +
+ +              /* locking the buffer will wait for async IO completion. */
+ +              xfs_buf_lock(bp);
+ +              error2 = bp->b_error;
                 xfs_buf_relse(bp);
                 if (!error)
                         error = error2;
@@@ -1885,7 -1882,7 +1883,7 @@@ xfs_buf_init(void
                 goto out;
   
         xfslogd_workqueue = alloc_workqueue("xfslogd",
- -                                      WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
+ +                              WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 1);
         if (!xfslogd_workqueue)
                 goto out_free_buf_zone;
   
diff --combined include/linux/blk-mq.h

index c13a0c0,02c5d95..c9be158
--- 1/include/linux/blk-mq.h
--- 2/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@@ -4,6 -4,7 +4,7 @@@
   #include <linux/blkdev.h>
   
   struct blk_mq_tags;
+ struct blk_flush_queue;
   
   struct blk_mq_cpu_notifier {
         struct list_head list;
@@@ -34,6 -35,7 +35,7 @@@ struct blk_mq_hw_ctx 
   
         struct request_queue    *queue;
         unsigned int            queue_num;
+       struct blk_flush_queue  *fq;
   
         void                    *driver_data;
   
@@@ -77,8 -79,9 +79,9 @@@ struct blk_mq_tag_set 
         struct list_head        tag_list;
   };
   
- typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
+ typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *, bool);
   typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
+ typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
   typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
   typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
   typedef int (init_request_fn)(void *, struct request *, unsigned int,
@@@ -86,6 -89,9 +89,9 @@@
   typedef void (exit_request_fn)(void *, struct request *, unsigned int,
                 unsigned int);
   
+ typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
+               bool);
+ 
   struct blk_mq_ops {
         /*
          * Queue request
@@@ -100,7 -106,7 +106,7 @@@
         /*
          * Called on request timeout
          */
-       rq_timed_out_fn         *timeout;
+       timeout_fn              *timeout;
   
         softirq_done_fn         *complete;
   
@@@ -115,6 -121,10 +121,10 @@@
         /*
          * Called for every command allocated by the block layer to allow
          * the driver to set up driver specific data.
+        *
+        * Tag greater than or equal to queue_depth is for setting up
+        * flush request.
+        *
          * Ditto for exit/teardown.
          */
         init_request_fn         *init_request;
@@@ -140,7 -150,6 +150,7 @@@ enum 
   };
   
   struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
+ +void blk_mq_finish_init(struct request_queue *q);
   int blk_mq_register_disk(struct gendisk *);
   void blk_mq_unregister_disk(struct gendisk *);
   
@@@ -160,8 -169,9 +170,9 @@@ struct request *blk_mq_tag_to_rq(struc
   struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
   struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
   
- void blk_mq_end_io(struct request *rq, int error);
- void __blk_mq_end_io(struct request *rq, int error);
+ void blk_mq_start_request(struct request *rq);
+ void blk_mq_end_request(struct request *rq, int error);
+ void __blk_mq_end_request(struct request *rq, int error);
   
   void blk_mq_requeue_request(struct request *rq);
   void blk_mq_add_to_requeue_list(struct request *rq, bool at_head);
@@@ -174,7 -184,8 +185,8 @@@ void blk_mq_stop_hw_queues(struct reque
   void blk_mq_start_hw_queues(struct request_queue *q);
   void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
   void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
- void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
+ void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
+               void *priv);
   
   /*
    * Driver command data is immediately after the request. So subtract request
diff --combined include/linux/blkdev.h

index 87be398,5546392..0207a78
--- 1/include/linux/blkdev.h
--- 2/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@@ -36,6 -36,7 +36,7 @@@ struct request
   struct sg_io_hdr;
   struct bsg_job;
   struct blkcg_gq;
+ struct blk_flush_queue;
   
   #define BLKDEV_MIN_RQ 4
   #define BLKDEV_MAX_RQ 128     /* Default maximum */
@@@ -455,14 -456,7 +456,7 @@@ struct request_queue 
          */
         unsigned int            flush_flags;
         unsigned int            flush_not_queueable:1;
-       unsigned int            flush_queue_delayed:1;
-       unsigned int            flush_pending_idx:1;
-       unsigned int            flush_running_idx:1;
-       unsigned long           flush_pending_since;
-       struct list_head        flush_queue[2];
-       struct list_head        flush_data_in_flight;
-       struct request          *flush_rq;
-       spinlock_t              mq_flush_lock;
+       struct blk_flush_queue  *fq;
   
         struct list_head        requeue_list;
         spinlock_t              requeue_lock;
@@@ -865,7 -859,7 +859,7 @@@ extern void blk_execute_rq_nowait(struc
   
   static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
   {
-       return bdev->bd_disk->queue;
+       return bdev->bd_disk->queue;    /* this is never NULL */
   }
   
   /*
@@@ -1285,10 -1279,9 +1279,9 @@@ static inline int queue_alignment_offse
   static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t sector)
   {
         unsigned int granularity = max(lim->physical_block_size, lim->io_min);
-       unsigned int alignment = (sector << 9) & (granularity - 1);
+       unsigned int alignment = sector_div(sector, granularity >> 9) << 9;
   
-       return (granularity + lim->alignment_offset - alignment)
-               & (granularity - 1);
+       return (granularity + lim->alignment_offset - alignment) % granularity;
   }
   
   static inline int bdev_alignment_offset(struct block_device *bdev)
@@@ -1464,32 -1457,31 +1457,31 @@@ static inline uint64_t rq_io_start_time
   
   #if defined(CONFIG_BLK_DEV_INTEGRITY)
   
- #define INTEGRITY_FLAG_READ   2       /* verify data integrity on read */
- #define INTEGRITY_FLAG_WRITE  4       /* generate data integrity on write */
+ enum blk_integrity_flags {
+       BLK_INTEGRITY_VERIFY            = 1 << 0,
+       BLK_INTEGRITY_GENERATE          = 1 << 1,
+       BLK_INTEGRITY_DEVICE_CAPABLE    = 1 << 2,
+       BLK_INTEGRITY_IP_CHECKSUM       = 1 << 3,
+ };
   
- struct blk_integrity_exchg {
+ struct blk_integrity_iter {
         void                    *prot_buf;
         void                    *data_buf;
-       sector_t                sector;
+       sector_t                seed;
         unsigned int            data_size;
-       unsigned short          sector_size;
+       unsigned short          interval;
         const char              *disk_name;
   };
   
- typedef void (integrity_gen_fn) (struct blk_integrity_exchg *);
- typedef int (integrity_vrfy_fn) (struct blk_integrity_exchg *);
- typedef void (integrity_set_tag_fn) (void *, void *, unsigned int);
- typedef void (integrity_get_tag_fn) (void *, void *, unsigned int);
+ typedef int (integrity_processing_fn) (struct blk_integrity_iter *);
   
   struct blk_integrity {
-       integrity_gen_fn        *generate_fn;
-       integrity_vrfy_fn       *verify_fn;
-       integrity_set_tag_fn    *set_tag_fn;
-       integrity_get_tag_fn    *get_tag_fn;
+       integrity_processing_fn *generate_fn;
+       integrity_processing_fn *verify_fn;
   
         unsigned short          flags;
         unsigned short          tuple_size;
-       unsigned short          sector_size;
+       unsigned short          interval;
         unsigned short          tag_size;
   
         const char              *name;
@@@ -1504,10 -1496,10 +1496,10 @@@ extern int blk_integrity_compare(struc
   extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
                                    struct scatterlist *);
   extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
- extern int blk_integrity_merge_rq(struct request_queue *, struct request *,
-                                 struct request *);
- extern int blk_integrity_merge_bio(struct request_queue *, struct request *,
-                                  struct bio *);
+ extern bool blk_integrity_merge_rq(struct request_queue *, struct request *,
+                                  struct request *);
+ extern bool blk_integrity_merge_bio(struct request_queue *, struct request *,
+                                   struct bio *);
   
   static inline
   struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
@@@ -1520,12 -1512,9 +1512,9 @@@ static inline struct blk_integrity *blk
         return disk->integrity;
   }
   
- static inline int blk_integrity_rq(struct request *rq)
+ static inline bool blk_integrity_rq(struct request *rq)
   {
-       if (rq->bio == NULL)
-               return 0;
- 
-       return bio_integrity(rq->bio);
+       return rq->cmd_flags & REQ_INTEGRITY;
   }
   
   static inline void blk_queue_max_integrity_segments(struct request_queue *q,
@@@ -1564,7 -1553,7 +1553,7 @@@ static inline int blk_rq_map_integrity_
   }
   static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
   {
- -      return 0;
+ +      return NULL;
   }
   static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
   {
@@@ -1590,15 -1579,15 +1579,15 @@@ static inline unsigned short queue_max_
   {
         return 0;
   }
- static inline int blk_integrity_merge_rq(struct request_queue *rq,
-                                        struct request *r1,
-                                        struct request *r2)
+ static inline bool blk_integrity_merge_rq(struct request_queue *rq,
+                                         struct request *r1,
+                                         struct request *r2)
   {
         return 0;
   }
- static inline int blk_integrity_merge_bio(struct request_queue *rq,
-                                         struct request *r,
-                                         struct bio *b)
+ static inline bool blk_integrity_merge_bio(struct request_queue *rq,
+                                          struct request *r,
+                                          struct bio *b)
   {
         return 0;
   }
diff --combined include/linux/fs.h

index ab4f1a1,9b5bc1c..a957d43
--- 1/include/linux/fs.h
--- 2/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -192,8 -192,6 +192,6 @@@ typedef void (dio_iodone_t)(struct kioc
   #define READ                  0
   #define WRITE                 RW_MASK
   #define READA                 RWA_MASK
- #define KERNEL_READ           (READ|REQ_KERNEL)
- #define KERNEL_WRITE          (WRITE|REQ_KERNEL)
   
   #define READ_SYNC             (READ | REQ_SYNC)
   #define WRITE_SYNC            (WRITE | REQ_SYNC | REQ_NOIDLE)
@@@ -851,7 -849,13 +849,7 @@@ static inline struct file *get_file(str
    */
   #define FILE_LOCK_DEFERRED 1
   
- -/*
- - * The POSIX file lock owner is determined by
- - * the "struct files_struct" in the thread group
- - * (or NULL for no owner - BSD locks).
- - *
- - * Lockd stuffs a "host" pointer into this.
- - */
+ +/* legacy typedef, should eventually be removed */
   typedef void *fl_owner_t;
   
   struct file_lock_operations {
@@@ -862,13 -866,10 +860,13 @@@
   struct lock_manager_operations {
         int (*lm_compare_owner)(struct file_lock *, struct file_lock *);
         unsigned long (*lm_owner_key)(struct file_lock *);
+ +      void (*lm_get_owner)(struct file_lock *, struct file_lock *);
+ +      void (*lm_put_owner)(struct file_lock *);
         void (*lm_notify)(struct file_lock *);  /* unblock callback */
- -      int (*lm_grant)(struct file_lock *, struct file_lock *, int);
- -      void (*lm_break)(struct file_lock *);
- -      int (*lm_change)(struct file_lock **, int);
+ +      int (*lm_grant)(struct file_lock *, int);
+ +      bool (*lm_break)(struct file_lock *);
+ +      int (*lm_change)(struct file_lock **, int, struct list_head *);
+ +      void (*lm_setup)(struct file_lock *, void **);
   };
   
   struct lock_manager {
@@@ -963,7 -964,7 +961,7 @@@ void locks_free_lock(struct file_lock *
   extern void locks_init_lock(struct file_lock *);
   extern struct file_lock * locks_alloc_lock(void);
   extern void locks_copy_lock(struct file_lock *, struct file_lock *);
- -extern void __locks_copy_lock(struct file_lock *, const struct file_lock *);
+ +extern void locks_copy_conflock(struct file_lock *, struct file_lock *);
   extern void locks_remove_posix(struct file *, fl_owner_t);
   extern void locks_remove_file(struct file *);
   extern void locks_release_private(struct file_lock *);
@@@ -977,9 -978,11 +975,9 @@@ extern int vfs_cancel_lock(struct file 
   extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl);
   extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
   extern void lease_get_mtime(struct inode *, struct timespec *time);
- -extern int generic_setlease(struct file *, long, struct file_lock **);
- -extern int vfs_setlease(struct file *, long, struct file_lock **);
- -extern int lease_modify(struct file_lock **, int);
- -extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
- -extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
+ +extern int generic_setlease(struct file *, long, struct file_lock **, void **priv);
+ +extern int vfs_setlease(struct file *, long, struct file_lock **, void **);
+ +extern int lease_modify(struct file_lock **, int, struct list_head *);
   #else /* !CONFIG_FILE_LOCKING */
   static inline int fcntl_getlk(struct file *file, unsigned int cmd,
                               struct flock __user *user)
@@@ -1008,12 -1011,12 +1006,12 @@@ static inline int fcntl_setlk64(unsigne
   #endif
   static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
   {
- -      return 0;
+ +      return -EINVAL;
   }
   
   static inline int fcntl_getlease(struct file *filp)
   {
- -      return 0;
+ +      return F_UNLCK;
   }
   
   static inline void locks_init_lock(struct file_lock *fl)
@@@ -1021,7 -1024,7 +1019,7 @@@
         return;
   }
   
- -static inline void __locks_copy_lock(struct file_lock *new, struct file_lock *fl)
+ +static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
   {
         return;
   }
@@@ -1095,22 -1098,33 +1093,22 @@@ static inline void lease_get_mtime(stru
   }
   
   static inline int generic_setlease(struct file *filp, long arg,
- -                                  struct file_lock **flp)
+ +                                  struct file_lock **flp, void **priv)
   {
         return -EINVAL;
   }
   
   static inline int vfs_setlease(struct file *filp, long arg,
- -                             struct file_lock **lease)
+ +                             struct file_lock **lease, void **priv)
   {
         return -EINVAL;
   }
   
- -static inline int lease_modify(struct file_lock **before, int arg)
+ +static inline int lease_modify(struct file_lock **before, int arg,
+ +                             struct list_head *dispose)
   {
         return -EINVAL;
   }
- -
- -static inline int lock_may_read(struct inode *inode, loff_t start,
- -                              unsigned long len)
- -{
- -      return 1;
- -}
- -
- -static inline int lock_may_write(struct inode *inode, loff_t start,
- -                               unsigned long len)
- -{
- -      return 1;
- -}
   #endif /* !CONFIG_FILE_LOCKING */
   
   
@@@ -1135,8 -1149,8 +1133,8 @@@ extern void fasync_free(struct fasync_s
   /* can be called from interrupts */
   extern void kill_fasync(struct fasync_struct **, int, int);
   
- -extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
- -extern int f_setown(struct file *filp, unsigned long arg, int force);
+ +extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
+ +extern void f_setown(struct file *filp, unsigned long arg, int force);
   extern void f_delown(struct file *filp);
   extern pid_t f_getown(struct file *filp);
   extern int send_sigurg(struct fown_struct *fown);
@@@ -1490,7 -1504,7 +1488,7 @@@ struct file_operations 
         int (*flock) (struct file *, int, struct file_lock *);
         ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
         ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
- -      int (*setlease)(struct file *, long, struct file_lock **);
+ +      int (*setlease)(struct file *, long, struct file_lock **, void **);
         long (*fallocate)(struct file *file, int mode, loff_t offset,
                           loff_t len);
         int (*show_fdinfo)(struct seq_file *m, struct file *f);
@@@ -1839,8 -1853,7 +1837,8 @@@ extern struct vfsmount *kern_mount_data
   extern void kern_unmount(struct vfsmount *mnt);
   extern int may_umount_tree(struct vfsmount *);
   extern int may_umount(struct vfsmount *);
- -extern long do_mount(const char *, const char *, const char *, unsigned long, void *);
+ +extern long do_mount(const char *, const char __user *,
+ +                   const char *, unsigned long, void *);
   extern struct vfsmount *collect_mounts(struct path *);
   extern void drop_collected_mounts(struct vfsmount *);
   extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
@@@ -1859,7 -1872,7 +1857,7 @@@ extern int current_umask(void)
   extern void ihold(struct inode * inode);
   extern void iput(struct inode *);
   
- -static inline struct inode *file_inode(struct file *f)
+ +static inline struct inode *file_inode(const struct file *f)
   {
         return f->f_inode;
   }
@@@ -2596,7 -2609,6 +2594,7 @@@ extern int simple_write_end(struct fil
                         struct page *page, void *fsdata);
   extern int always_delete_dentry(const struct dentry *);
   extern struct inode *alloc_anon_inode(struct super_block *);
+ +extern int simple_nosetlease(struct file *, long, struct file_lock **, void **);
   extern const struct dentry_operations simple_dentry_operations;
   
   extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags);
diff --combined include/linux/nfs_fs.h

index 28d6490,e6e1c4e..c72d1ad
--- 1/include/linux/nfs_fs.h
--- 2/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@@ -443,15 -443,31 +443,15 @@@ static inline struct rpc_cred *nfs_file
   }
   
   /*
- - * linux/fs/nfs/xattr.c
- - */
- -#ifdef CONFIG_NFS_V3_ACL
- -extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
- -extern ssize_t nfs3_getxattr(struct dentry *, const char *, void *, size_t);
- -extern int nfs3_setxattr(struct dentry *, const char *,
- -                      const void *, size_t, int);
- -extern int nfs3_removexattr (struct dentry *, const char *name);
- -#else
- -# define nfs3_listxattr NULL
- -# define nfs3_getxattr NULL
- -# define nfs3_setxattr NULL
- -# define nfs3_removexattr NULL
- -#endif
- -
- -/*
    * linux/fs/nfs/direct.c
    */
   extern ssize_t nfs_direct_IO(int, struct kiocb *, struct iov_iter *, loff_t);
   extern ssize_t nfs_file_direct_read(struct kiocb *iocb,
                         struct iov_iter *iter,
-                       loff_t pos, bool uio);
+                       loff_t pos);
   extern ssize_t nfs_file_direct_write(struct kiocb *iocb,
                         struct iov_iter *iter,
-                       loff_t pos, bool uio);
+                       loff_t pos);
   
   /*
    * linux/fs/nfs/dir.c
@@@ -513,9 -529,17 +513,9 @@@ extern int  nfs_updatepage(struct file 
   extern int nfs_wb_all(struct inode *inode);
   extern int nfs_wb_page(struct inode *inode, struct page* page);
   extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
- -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
   extern int  nfs_commit_inode(struct inode *, int);
   extern struct nfs_commit_data *nfs_commitdata_alloc(void);
   extern void nfs_commit_free(struct nfs_commit_data *data);
- -#else
- -static inline int
- -nfs_commit_inode(struct inode *inode, int how)
- -{
- -      return 0;
- -}
- -#endif
   
   static inline int
   nfs_have_writebacks(struct inode *inode)
@@@ -533,6 -557,23 +533,6 @@@ extern int  nfs_readpage_async(struct n
                                struct page *);
   
   /*
- - * linux/fs/nfs3proc.c
- - */
- -#ifdef CONFIG_NFS_V3_ACL
- -extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type);
- -extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
- -extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
- -              struct posix_acl *dfacl);
- -extern const struct xattr_handler *nfs3_xattr_handlers[];
- -#else
- -static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
- -              struct posix_acl *dfacl)
- -{
- -      return 0;
- -}
- -#endif /* CONFIG_NFS_V3_ACL */
- -
- -/*
    * inline functions
    */
   
diff --combined mm/backing-dev.c

index 12a992b,7d63d5e..0ae0df5
--- 1/mm/backing-dev.c
--- 2/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@@ -40,7 -40,7 +40,7 @@@ LIST_HEAD(bdi_list)
   /* bdi_wq serves all asynchronous writeback tasks */
   struct workqueue_struct *bdi_wq;
   
- void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
+ static void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
   {
         if (wb1 < wb2) {
                 spin_lock(&wb1->list_lock);
@@@ -376,13 -376,7 +376,7 @@@ static void bdi_wb_shutdown(struct back
         mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
         flush_delayed_work(&bdi->wb.dwork);
         WARN_ON(!list_empty(&bdi->work_list));
- 
-       /*
-        * This shouldn't be necessary unless @bdi for some reason has
-        * unflushed dirty IO after work_list is drained.  Do it anyway
-        * just in case.
-        */
-       cancel_delayed_work_sync(&bdi->wb.dwork);
+       WARN_ON(delayed_work_pending(&bdi->wb.dwork));
   }
   
   /*
@@@ -402,21 -396,15 +396,15 @@@ static void bdi_prune_sb(struct backing
   
   void bdi_unregister(struct backing_dev_info *bdi)
   {
-       struct device *dev = bdi->dev;
- 
-       if (dev) {
+       if (bdi->dev) {
                 bdi_set_min_ratio(bdi, 0);
                 trace_writeback_bdi_unregister(bdi);
                 bdi_prune_sb(bdi);
   
                 bdi_wb_shutdown(bdi);
                 bdi_debug_unregister(bdi);
- 
-               spin_lock_bh(&bdi->wb_lock);
+               device_unregister(bdi->dev);
                 bdi->dev = NULL;
-               spin_unlock_bh(&bdi->wb_lock);
- 
-               device_unregister(dev);
         }
   }
   EXPORT_SYMBOL(bdi_unregister);
@@@ -455,7 -443,7 +443,7 @@@ int bdi_init(struct backing_dev_info *b
         bdi_wb_init(&bdi->wb, bdi);
   
         for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
- -              err = percpu_counter_init(&bdi->bdi_stat[i], 0);
+ +              err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL);
                 if (err)
                         goto err;
         }
@@@ -470,7 -458,7 +458,7 @@@
         bdi->write_bandwidth = INIT_BW;
         bdi->avg_write_bandwidth = INIT_BW;
   
- -      err = fprop_local_init_percpu(&bdi->completions);
+ +      err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
   
         if (err) {
   err:
@@@ -487,8 -475,17 +475,17 @@@ void bdi_destroy(struct backing_dev_inf
         int i;
   
         /*
-        * Splice our entries to the default_backing_dev_info, if this
-        * bdi disappears
+        * Splice our entries to the default_backing_dev_info.  This
+        * condition shouldn't happen.  @wb must be empty at this point and
+        * dirty inodes on it might cause other issues.  This workaround is
+        * added by ce5f8e779519 ("writeback: splice dirty inode entries to
+        * default bdi on bdi_destroy()") without root-causing the issue.
+        *
+        * http://lkml.kernel.org/g/1253038617-30204-11-git-send-email-jens.axboe@oracle.com
+        * http://thread.gmane.org/gmane.linux.file-systems/35341/focus=35350
+        *
+        * We should probably add WARN_ON() to find out whether it still
+        * happens and track it down if so.
          */
         if (bdi_has_dirty_io(bdi)) {
                 struct bdi_writeback *dst = &default_backing_dev_info.wb;
@@@ -503,12 -500,7 +500,7 @@@
   
         bdi_unregister(bdi);
   
-       /*
-        * If bdi_unregister() had already been called earlier, the dwork
-        * could still be pending because bdi_prune_sb() can race with the
-        * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty().
-        */
-       cancel_delayed_work_sync(&bdi->wb.dwork);
+       WARN_ON(delayed_work_pending(&bdi->wb.dwork));
   
         for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
                 percpu_counter_destroy(&bdi->bdi_stat[i]);
@@@ -631,7 -623,7 +623,7 @@@ long wait_iff_congested(struct zone *zo
          * of sleeping on the congestion queue
          */
         if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
- -                      !zone_is_reclaim_congested(zone)) {
+ +          !test_bit(ZONE_CONGESTED, &zone->flags)) {
                 cond_resched();
   
                 /* In case we scheduled, work out time remaining */
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 18 Oct 2014 18:53:51 +0000 (11:53 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 18 Oct 2014 18:53:51 +0000 (11:53 -0700)
		1	2
block/blk-core.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/blk-mq.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/blk-sysfs.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/block/virtio_blk.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/scsi/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/scsi/scsi_error.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/scsi/scsi_lib.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/scsi/sd.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/scsi/st.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/block_dev.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/disk-io.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/nfs/direct.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/nfs/file.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/xfs/xfs_buf.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/blk-mq.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/blkdev.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/fs.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/nfs_fs.h	patch \|	diff1 \|	diff2 \|	blob \| history
mm/backing-dev.c	patch \|	diff1 \|	diff2 \|	blob \| history