Merge branch 'for-4.2/writeback' of git://git.kernel.dk/linux-block
author    Linus Torvalds <torvalds@linux-foundation.org>
          Thu, 25 Jun 2015 23:00:17 +0000 (16:00 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Thu, 25 Jun 2015 23:00:17 +0000 (16:00 -0700)
Pull cgroup writeback support from Jens Axboe:
 "This is the big pull request for adding cgroup writeback support.

  This code has been in development for a long time, and it has been
  simmering in for-next for a good chunk of this cycle too.  This is one
  of those problems that has been talked about for at least half a
  decade; finally, there's a solution and code to go with it.

  Also see last week's writeup on LWN:

        http://lwn.net/Articles/648292/"

* 'for-4.2/writeback' of git://git.kernel.dk/linux-block: (85 commits)
  writeback, blkio: add documentation for cgroup writeback support
  vfs, writeback: replace FS_CGROUP_WRITEBACK with SB_I_CGROUPWB
  writeback: do foreign inode detection iff cgroup writeback is enabled
  v9fs: fix error handling in v9fs_session_init()
  bdi: fix wrong error return value in cgwb_create()
  buffer: remove unused 'ret' variable
  writeback: disassociate inodes from dying bdi_writebacks
  writeback: implement foreign cgroup inode bdi_writeback switching
  writeback: add lockdep annotation to inode_to_wb()
  writeback: use unlocked_inode_to_wb transaction in inode_congested()
  writeback: implement unlocked_inode_to_wb transaction and use it for stat updates
  writeback: implement [locked_]inode_to_wb_and_lock_list()
  writeback: implement foreign cgroup inode detection
  writeback: make writeback_control track the inode being written back
  writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb()
  mm: vmscan: disable memcg direct reclaim stalling if cgroup writeback support is in use
  writeback: implement memcg writeback domain based throttling
  writeback: reset wb_domain->dirty_limit[_tstmp] when memcg domain size changes
  writeback: implement memcg wb_domain
  writeback: update wb_over_bg_thresh() to use wb_domain aware operations
  ...

32 files changed:
block/blk-cgroup.c
block/blk-core.c
block/blk-sysfs.c
block/bounce.c
block/cfq-iosched.c
block/elevator.c
block/genhd.c
drivers/md/dm.c
drivers/md/raid10.c
fs/ext4/extents.c
fs/ext4/mballoc.c
fs/ext4/super.c
fs/f2fs/node.c
fs/f2fs/segment.h
fs/inode.c
fs/nfs/write.c
fs/ocfs2/file.c
fs/xfs/xfs_file.c
include/linux/backing-dev.h
include/linux/blk-cgroup.h
include/linux/blkdev.h
include/linux/fs.h
include/linux/memcontrol.h
include/linux/mm.h
include/trace/events/writeback.h
init/Kconfig
mm/backing-dev.c
mm/filemap.c
mm/memcontrol.c
mm/page-writeback.c
mm/rmap.c
mm/vmscan.c

diff --cc block/blk-cgroup.c

  static DEFINE_MUTEX(blkcg_pol_mutex);
  
 -struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
 -                          .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
 +struct blkcg blkcg_root;
  EXPORT_SYMBOL_GPL(blkcg_root);
  
+ struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
  static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
  
  static bool blkcg_policy_enabled(struct request_queue *q,
@@@ -868,16 -843,10 +872,18 @@@ done
        spin_lock_init(&blkcg->lock);
        INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
        INIT_HLIST_HEAD(&blkcg->blkg_list);
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       INIT_LIST_HEAD(&blkcg->cgwb_list);
+ #endif
        return &blkcg->css;
 +
 +free_pd_blkcg:
 +      for (i--; i >= 0; i--)
 +              kfree(blkcg->pd[i]);
 +
 +free_blkcg:
 +      kfree(blkcg);
 +      return ret;
  }
  
  /**
@@@ -995,57 -1000,20 +1037,26 @@@ int blkcg_activate_policy(struct reques
                          const struct blkcg_policy *pol)
  {
        LIST_HEAD(pds);
-       struct blkcg_gq *blkg, *new_blkg;
 +      LIST_HEAD(cpds);
 -      struct blkg_policy_data *pd, *n;
+       struct blkcg_gq *blkg;
 +      struct blkg_policy_data *pd, *nd;
 +      struct blkcg_policy_data *cpd, *cnd;
        int cnt = 0, ret;
-       bool preloaded;
  
        if (blkcg_policy_enabled(q, pol))
                return 0;
  
-       /* preallocations for root blkg */
-       new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
-       if (!new_blkg)
-               return -ENOMEM;
+       /* count and allocate policy_data for all existing blkgs */
        blk_queue_bypass_start(q);
-       preloaded = !radix_tree_preload(GFP_KERNEL);
-       /*
-        * Make sure the root blkg exists and count the existing blkgs.  As
-        * @q is bypassing at this point, blkg_lookup_create() can't be
-        * used.  Open code it.
-        */
        spin_lock_irq(q->queue_lock);
-       rcu_read_lock();
-       blkg = __blkg_lookup(&blkcg_root, q, false);
-       if (blkg)
-               blkg_free(new_blkg);
-       else
-               blkg = blkg_create(&blkcg_root, q, new_blkg);
-       rcu_read_unlock();
-       if (preloaded)
-               radix_tree_preload_end();
-       if (IS_ERR(blkg)) {
-               ret = PTR_ERR(blkg);
-               goto out_unlock;
-       }
        list_for_each_entry(blkg, &q->blkg_list, q_node)
                cnt++;
        spin_unlock_irq(q->queue_lock);
  
 +      /*
 +       * Allocate per-blkg and per-blkcg policy data
 +       * for all existing blkgs.
 +       */
        while (cnt--) {
                pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
                if (!pd) {
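
The hunk above switches blkcg_activate_policy() to a count-then-preallocate
scheme: count the existing blkgs under the queue lock, allocate policy data
with GFP_KERNEL while the lock is dropped, then install it. A hedged,
user-space sketch of that pattern; "node", "list", and "activate" are
hypothetical stand-ins for the blkgs and the policy:

	#include <pthread.h>
	#include <stdlib.h>

	struct node {
		struct node *next;
		void *pd;		/* per-node policy data, installed last */
	};

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static struct node *list;

	static int activate(size_t pd_size)
	{
		struct node *n;
		size_t cnt = 0, i;
		void **pds;

		pthread_mutex_lock(&lock);
		for (n = list; n; n = n->next)	/* count while locked */
			cnt++;
		pthread_mutex_unlock(&lock);

		pds = calloc(cnt, sizeof(*pds));	/* allocate with the lock dropped */
		if (cnt && !pds)
			return -1;
		for (i = 0; i < cnt; i++) {
			pds[i] = calloc(1, pd_size);
			if (!pds[i]) {
				while (i--)
					free(pds[i]);
				free(pds);
				return -1;
			}
		}

		pthread_mutex_lock(&lock);	/* re-take the lock and install */
		i = 0;
		for (n = list; n && i < cnt; n = n->next)
			n->pd = pds[i++];
		pthread_mutex_unlock(&lock);

		free(pds);	/* the array only; nodes now own the pd allocations */
		return 0;
	}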
diff --cc block/blk-core.c
Simple merge
diff --cc block/blk-sysfs.c
Simple merge
diff --cc block/bounce.c
Simple merge
diff --cc block/cfq-iosched.c
Simple merge
diff --cc block/elevator.c
Simple merge
diff --cc block/genhd.c
Simple merge
diff --cc drivers/md/dm.c
Simple merge
diff --cc drivers/md/raid10.c
Simple merge
diff --cc fs/ext4/extents.c
Simple merge
diff --cc fs/ext4/mballoc.c
Simple merge
diff --cc fs/ext4/super.c
Simple merge
diff --cc fs/f2fs/node.c
Simple merge
diff --cc fs/f2fs/segment.h
Simple merge
diff --cc fs/inode.c
Simple merge
diff --cc fs/nfs/write.c
Simple merge
diff --cc fs/ocfs2/file.c
Simple merge
diff --cc fs/xfs/xfs_file.c
Simple merge
diff --cc include/linux/backing-dev.h
@@@ -116,13 -23,13 +23,12 @@@ __printf(3, 4
  int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                const char *fmt, ...);
  int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 -void bdi_unregister(struct backing_dev_info *bdi);
  int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
- void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-                       enum wb_reason reason);
- void bdi_start_background_writeback(struct backing_dev_info *bdi);
- void bdi_writeback_workfn(struct work_struct *work);
- int bdi_has_dirty_io(struct backing_dev_info *bdi);
- void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
+ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
+                       bool range_cyclic, enum wb_reason reason);
+ void wb_start_background_writeback(struct bdi_writeback *wb);
+ void wb_workfn(struct work_struct *work);
+ void wb_wakeup_delayed(struct bdi_writeback *wb);
  
  extern spinlock_t bdi_lock;
  extern struct list_head bdi_list;
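
The header now exposes per-bdi_writeback entry points in place of the old
bdi-wide ones. A hedged caller-side sketch; example_kick() is hypothetical,
and it uses the bdi's embedded root wb:

	#include <linux/backing-dev.h>

	static void example_kick(struct backing_dev_info *bdi)
	{
		/* was: bdi_start_background_writeback(bdi) */
		wb_start_background_writeback(&bdi->wb);
	}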
diff --cc include/linux/blk-cgroup.h
@@@ -45,7 -50,13 +45,11 @@@ struct blkcg 
        struct blkcg_gq                 *blkg_hint;
        struct hlist_head               blkg_list;
  
 -      /* TODO: per-policy storage in blkcg */
 -      unsigned int                    cfq_weight;     /* belongs to cfq */
 -      unsigned int                    cfq_leaf_weight;
 +      struct blkcg_policy_data        *pd[BLKCG_MAX_POLS];
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       struct list_head                cgwb_list;
+ #endif
  };
  
  struct blkg_stat {
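
With cfq's weights gone from struct blkcg, each policy keeps its per-cgroup
state in the new pd[] array, indexed by policy ID. A sketch of the lookup;
example_cpd() is hypothetical:

	static struct blkcg_policy_data *example_cpd(struct blkcg *blkcg,
						     const struct blkcg_policy *pol)
	{
		/* per-(policy, cgroup) data installed by blkcg_activate_policy() */
		return blkcg->pd[pol->plid];
	}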
diff --cc include/linux/blkdev.h
@@@ -787,25 -788,8 +787,6 @@@ extern int scsi_cmd_ioctl(struct reques
  extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
                         struct scsi_ioctl_command __user *);
  
- /*
-  * A queue has just exitted congestion.  Note this in the global counter of
-  * congested queues, and wake up anyone who was waiting for requests to be
-  * put back.
-  */
- static inline void blk_clear_queue_congested(struct request_queue *q, int sync)
- {
-       clear_bdi_congested(&q->backing_dev_info, sync);
- }
- /*
-  * A queue has just entered congestion.  Flag that in the queue's VM-visible
-  * state flags and increment the global gounter of congested queues.
-  */
- static inline void blk_set_queue_congested(struct request_queue *q, int sync)
- {
-       set_bdi_congested(&q->backing_dev_info, sync);
- }
 -extern void blk_queue_bio(struct request_queue *q, struct bio *bio);
--
  extern void blk_start_queue(struct request_queue *q);
  extern void blk_stop_queue(struct request_queue *q);
  extern void blk_sync_queue(struct request_queue *q);
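
The hunk above drops the queue-level congestion wrappers from the public
header. A hedged equivalent for a caller that still needs the old behaviour,
operating on the queue's embedded bdi directly; the example_* names are
illustrative:

	static inline void example_clear_congested(struct request_queue *q, int sync)
	{
		clear_bdi_congested(&q->backing_dev_info, sync);
	}

	static inline void example_set_congested(struct request_queue *q, int sync)
	{
		set_bdi_congested(&q->backing_dev_info, sync);
	}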
diff --cc include/linux/fs.h
Simple merge
diff --cc include/linux/memcontrol.h
Simple merge
diff --cc include/linux/mm.h
Simple merge
diff --cc include/trace/events/writeback.h
Simple merge
diff --cc init/Kconfig
Simple merge
diff --cc mm/backing-dev.c
@@@ -387,49 -746,91 +746,74 @@@ int bdi_init(struct backing_dev_info *b
        bdi->min_ratio = 0;
        bdi->max_ratio = 100;
        bdi->max_prop_frac = FPROP_FRAC_BASE;
-       spin_lock_init(&bdi->wb_lock);
        INIT_LIST_HEAD(&bdi->bdi_list);
-       INIT_LIST_HEAD(&bdi->work_list);
+       init_waitqueue_head(&bdi->wb_waitq);
  
-       bdi_wb_init(&bdi->wb, bdi);
+       err = wb_init(&bdi->wb, bdi, GFP_KERNEL);
+       if (err)
+               return err;
  
-       for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
-               err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL);
-               if (err)
-                       goto err;
-       }
+       bdi->wb_congested.state = 0;
+       bdi->wb.congested = &bdi->wb_congested;
  
-       bdi->dirty_exceeded = 0;
+       cgwb_bdi_init(bdi);
+       return 0;
+ }
+ EXPORT_SYMBOL(bdi_init);
  
-       bdi->bw_time_stamp = jiffies;
-       bdi->written_stamp = 0;
+ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+               const char *fmt, ...)
+ {
+       va_list args;
+       struct device *dev;
  
-       bdi->balanced_dirty_ratelimit = INIT_BW;
-       bdi->dirty_ratelimit = INIT_BW;
-       bdi->write_bandwidth = INIT_BW;
-       bdi->avg_write_bandwidth = INIT_BW;
+       if (bdi->dev)   /* The driver needs to use separate queues per device */
+               return 0;
  
-       err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
+       va_start(args, fmt);
+       dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
+       va_end(args);
+       if (IS_ERR(dev))
+               return PTR_ERR(dev);
  
-       if (err) {
- err:
-               while (i--)
-                       percpu_counter_destroy(&bdi->bdi_stat[i]);
-       }
+       bdi->dev = dev;
  
-       return err;
+       bdi_debug_register(bdi, dev_name(dev));
+       set_bit(WB_registered, &bdi->wb.state);
+       spin_lock_bh(&bdi_lock);
+       list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+       spin_unlock_bh(&bdi_lock);
+       trace_writeback_bdi_register(bdi);
+       return 0;
  }
- EXPORT_SYMBOL(bdi_init);
+ EXPORT_SYMBOL(bdi_register);
  
@@@ … @@@ void bdi_destroy(struct backing_dev_info *bdi)
int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
  {
-       int i;
+       return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+ }
+ EXPORT_SYMBOL(bdi_register_dev);
+ /*
+  * Remove bdi from bdi_list, and ensure that it is no longer visible
+  */
+ static void bdi_remove_from_list(struct backing_dev_info *bdi)
+ {
+       spin_lock_bh(&bdi_lock);
+       list_del_rcu(&bdi->bdi_list);
+       spin_unlock_bh(&bdi_lock);
  
-       bdi_wb_shutdown(bdi);
-       bdi_set_min_ratio(bdi, 0);
+       synchronize_rcu_expedited();
+ }
  
-       WARN_ON(!list_empty(&bdi->work_list));
-       WARN_ON(delayed_work_pending(&bdi->wb.dwork));
 -/*
 - * Called when the device behind @bdi has been removed or ejected.
 - *
 - * We can't really do much here except for reducing the dirty ratio at
 - * the moment.  In the future we should be able to set a flag so that
 - * the filesystem can handle errors at mark_inode_dirty time instead
 - * of only at writeback time.
 - */
 -void bdi_unregister(struct backing_dev_info *bdi)
 -{
 -      if (WARN_ON_ONCE(!bdi->dev))
 -              return;
 -
 -      bdi_set_min_ratio(bdi, 0);
 -}
 -EXPORT_SYMBOL(bdi_unregister);
 -
+ void bdi_destroy(struct backing_dev_info *bdi)
+ {
+       /* make sure nobody finds us on the bdi_list anymore */
+       bdi_remove_from_list(bdi);
+       wb_shutdown(&bdi->wb);
+       cgwb_bdi_destroy(bdi);
  
        if (bdi->dev) {
                bdi_debug_unregister(bdi);
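
bdi_init() and bdi_register() are reshaped above; a hedged sketch of the
driver-side lifecycle they imply, with error handling kept minimal and
example_setup() hypothetical:

	static int example_setup(struct backing_dev_info *bdi, dev_t devt)
	{
		int err;

		err = bdi_init(bdi);
		if (err)
			return err;

		err = bdi_register_dev(bdi, devt);	/* registers as "MAJOR:MINOR" */
		if (err) {
			bdi_destroy(bdi);
			return err;
		}
		return 0;
	}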
diff --cc mm/filemap.c
@@@ -485,15 -498,11 +500,16 @@@ int replace_page_cache_page(struct pag
                error = radix_tree_insert(&mapping->page_tree, offset, new);
                BUG_ON(error);
                mapping->nrpages++;
 -              __inc_zone_page_state(new, NR_FILE_PAGES);
 +
 +              /*
 +               * hugetlb pages do not participate in page cache accounting.
 +               */
 +              if (!PageHuge(new))
 +                      __inc_zone_page_state(new, NR_FILE_PAGES);
                if (PageSwapBacked(new))
                        __inc_zone_page_state(new, NR_SHMEM);
-               spin_unlock_irq(&mapping->tree_lock);
+               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               mem_cgroup_end_page_stat(memcg);
                mem_cgroup_migrate(old, new, true);
                radix_tree_preload_end();
                if (freepage)
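
The hunk above widens the irq protection on tree_lock and brackets the stat
updates with a memcg page-stat transaction, so the page cannot move between
memcgs mid-update. A hedged sketch of that recipe; example_account() and its
body are illustrative only:

	#include <linux/memcontrol.h>
	#include <linux/pagemap.h>

	static void example_account(struct address_space *mapping, struct page *page)
	{
		struct mem_cgroup *memcg;
		unsigned long flags;

		memcg = mem_cgroup_begin_page_stat(page);	/* pin the page's memcg */
		spin_lock_irqsave(&mapping->tree_lock, flags);
		if (!PageHuge(page))	/* hugetlb skips page cache accounting */
			__inc_zone_page_state(page, NR_FILE_PAGES);
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
		mem_cgroup_end_page_stat(memcg);
	}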
diff --cc mm/memcontrol.c
Simple merge
diff --cc mm/page-writeback.c
@@@ -802,27 -990,27 +990,27 @@@ static void wb_position_ratio(struct di
         * threshold, so that the occasional writes won't be blocked and active
         * writes can rampup the threshold quickly.
         */
-       bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
+       wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
        /*
-        * scale global setpoint to bdi's:
-        *      bdi_setpoint = setpoint * bdi_thresh / thresh
+        * scale global setpoint to wb's:
+        *      wb_setpoint = setpoint * wb_thresh / thresh
         */
-       x = div_u64((u64)bdi_thresh << 16, thresh | 1);
-       bdi_setpoint = setpoint * (u64)x >> 16;
 -      x = div_u64((u64)wb_thresh << 16, dtc->thresh + 1);
++      x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
+       wb_setpoint = setpoint * (u64)x >> 16;
        /*
-        * Use span=(8*write_bw) in single bdi case as indicated by
-        * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case.
+        * Use span=(8*write_bw) in single wb case as indicated by
+        * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
         *
-        *        bdi_thresh                    thresh - bdi_thresh
-        * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
-        *          thresh                            thresh
+        *        wb_thresh                    thresh - wb_thresh
+        * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
+        *         thresh                           thresh
         */
-       span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
-       x_intercept = bdi_setpoint + span;
+       span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
+       x_intercept = wb_setpoint + span;
  
-       if (bdi_dirty < x_intercept - span / 4) {
-               pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty),
-                                     (x_intercept - bdi_setpoint) | 1);
+       if (dtc->wb_dirty < x_intercept - span / 4) {
+               pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
 -                                    x_intercept - wb_setpoint + 1);
++                                    (x_intercept - wb_setpoint) | 1);
        } else
                pos_ratio /= 4;
  
diff --cc mm/rmap.c
Simple merge
diff --cc mm/vmscan.c
Simple merge