From: Linus Torvalds
Date: Thu, 25 Jun 2015 23:00:17 +0000 (-0700)
Subject: Merge branch 'for-4.2/writeback' of git://git.kernel.dk/linux-block
X-Git-Tag: v4.2-rc1~113
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=e4bc13adfd016fc1036838170288b5680d1a98b0;p=platform%2Fkernel%2Flinux-exynos.git

Merge branch 'for-4.2/writeback' of git://git.kernel.dk/linux-block

Pull cgroup writeback support from Jens Axboe:
 "This is the big pull request for adding cgroup writeback support.

  This code has been in development for a long time, and it has been
  simmering in for-next for a good chunk of this cycle too.

  This is one of those problems that has been talked about for at least
  half a decade; finally there's a solution and code to go with it.

  Also see last week's writeup on LWN:

        http://lwn.net/Articles/648292/"

* 'for-4.2/writeback' of git://git.kernel.dk/linux-block: (85 commits)
  writeback, blkio: add documentation for cgroup writeback support
  vfs, writeback: replace FS_CGROUP_WRITEBACK with SB_I_CGROUPWB
  writeback: do foreign inode detection iff cgroup writeback is enabled
  v9fs: fix error handling in v9fs_session_init()
  bdi: fix wrong error return value in cgwb_create()
  buffer: remove unusued 'ret' variable
  writeback: disassociate inodes from dying bdi_writebacks
  writeback: implement foreign cgroup inode bdi_writeback switching
  writeback: add lockdep annotation to inode_to_wb()
  writeback: use unlocked_inode_to_wb transaction in inode_congested()
  writeback: implement unlocked_inode_to_wb transaction and use it for stat updates
  writeback: implement [locked_]inode_to_wb_and_lock_list()
  writeback: implement foreign cgroup inode detection
  writeback: make writeback_control track the inode being written back
  writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb()
  mm: vmscan: disable memcg direct reclaim stalling if cgroup writeback support is in use
  writeback: implement memcg writeback domain based throttling
  writeback: reset wb_domain->dirty_limit[_tstmp] when memcg domain size changes
  writeback: implement memcg wb_domain
  writeback: update wb_over_bg_thresh() to use wb_domain aware operations
  ...
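For illustration only (not part of the commit): after this merge, writeback is kicked per bdi_writeback (one writeback domain per cgroup) rather than per backing_dev_info. The sketch below is kernel-style pseudocode; the wb_start_writeback() and wb_start_background_writeback() prototypes are taken from the include/linux/backing-dev.h hunk in the diff that follows, while the helper name, the use of the bdi's root wb (&bdi->wb) and WB_REASON_SYNC are assumptions made for the example.

#include <linux/backing-dev.h>
#include <linux/writeback.h>

/* Hypothetical helper: kick writeback on a bdi's root writeback domain. */
static void example_kick_root_wb(struct backing_dev_info *bdi, long nr_pages)
{
	/* Pre-4.2 callers did: bdi_start_writeback(bdi, nr_pages, reason). */
	/* Now each bdi_writeback (here the root one) is targeted directly: */
	wb_start_writeback(&bdi->wb, nr_pages, true, WB_REASON_SYNC);

	/* Background writeback is likewise started per-wb. */
	wb_start_background_writeback(&bdi->wb);
}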
--- e4bc13adfd016fc1036838170288b5680d1a98b0
diff --cc block/blk-cgroup.c
index 6e43fa3,31610ae..9f97da5
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@@ -30,9 -27,12 +31,11 @@@
  
  static DEFINE_MUTEX(blkcg_pol_mutex);
  
 -struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
 -			    .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
 +struct blkcg blkcg_root;
  EXPORT_SYMBOL_GPL(blkcg_root);
  
+ struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
+ 
  static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
  
  static bool blkcg_policy_enabled(struct request_queue *q,
@@@ -868,16 -843,10 +872,18 @@@ done
  	spin_lock_init(&blkcg->lock);
  	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
  	INIT_HLIST_HEAD(&blkcg->blkg_list);
- 
+ #ifdef CONFIG_CGROUP_WRITEBACK
+ 	INIT_LIST_HEAD(&blkcg->cgwb_list);
+ #endif
  
  	return &blkcg->css;
 +
 +free_pd_blkcg:
 +	for (i--; i >= 0; i--)
 +		kfree(blkcg->pd[i]);
 +
 +free_blkcg:
 +	kfree(blkcg);
 +	return ret;
  }
  
  /**
@@@ -995,57 -1000,20 +1037,26 @@@ int blkcg_activate_policy(struct reques
  			  const struct blkcg_policy *pol)
  {
  	LIST_HEAD(pds);
 +	LIST_HEAD(cpds);
- 	struct blkcg_gq *blkg, *new_blkg;
+ 	struct blkcg_gq *blkg;
 -	struct blkg_policy_data *pd, *n;
 +	struct blkg_policy_data *pd, *nd;
 +	struct blkcg_policy_data *cpd, *cnd;
  	int cnt = 0, ret;
- 	bool preloaded;
  
  	if (blkcg_policy_enabled(q, pol))
  		return 0;
  
- 	/* preallocations for root blkg */
- 	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
- 	if (!new_blkg)
- 		return -ENOMEM;
- 
+ 	/* count and allocate policy_data for all existing blkgs */
  	blk_queue_bypass_start(q);
- 
- 	preloaded = !radix_tree_preload(GFP_KERNEL);
- 
- 	/*
- 	 * Make sure the root blkg exists and count the existing blkgs.  As
- 	 * @q is bypassing at this point, blkg_lookup_create() can't be
- 	 * used.  Open code it.
- 	 */
  	spin_lock_irq(q->queue_lock);
- 
- 	rcu_read_lock();
- 	blkg = __blkg_lookup(&blkcg_root, q, false);
- 	if (blkg)
- 		blkg_free(new_blkg);
- 	else
- 		blkg = blkg_create(&blkcg_root, q, new_blkg);
- 	rcu_read_unlock();
- 
- 	if (preloaded)
- 		radix_tree_preload_end();
- 
- 	if (IS_ERR(blkg)) {
- 		ret = PTR_ERR(blkg);
- 		goto out_unlock;
- 	}
- 
  	list_for_each_entry(blkg, &q->blkg_list, q_node)
  		cnt++;
- 
  	spin_unlock_irq(q->queue_lock);
  
+ 	/*
+ 	 * Allocate per-blkg and per-blkcg policy data
+ 	 * for all existing blkgs.
+ 	 */
  	while (cnt--) {
  		pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
  		if (!pd) {
diff --cc include/linux/backing-dev.h
index d87d8ec,a13181a..0e6d482
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@@ -116,13 -23,13 +23,12 @@@ __printf(3, 4
  int bdi_register(struct backing_dev_info *bdi, struct device *parent,
  		const char *fmt, ...);
  int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 -void bdi_unregister(struct backing_dev_info *bdi);
  int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
- void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
- 			enum wb_reason reason);
- void bdi_start_background_writeback(struct backing_dev_info *bdi);
- void bdi_writeback_workfn(struct work_struct *work);
- int bdi_has_dirty_io(struct backing_dev_info *bdi);
- void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
+ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
+ 			bool range_cyclic, enum wb_reason reason);
+ void wb_start_background_writeback(struct bdi_writeback *wb);
+ void wb_workfn(struct work_struct *work);
+ void wb_wakeup_delayed(struct bdi_writeback *wb);
  
  extern spinlock_t bdi_lock;
  extern struct list_head bdi_list;
diff --cc include/linux/blk-cgroup.h
index 74296a7,07a32b8..58cfab8
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@@ -45,7 -50,13 +45,11 @@@ struct blkcg
  	struct blkcg_gq			*blkg_hint;
  	struct hlist_head		blkg_list;
  
 -	/* TODO: per-policy storage in blkcg */
 -	unsigned int			cfq_weight;	/* belongs to cfq */
 -	unsigned int			cfq_leaf_weight;
 +	struct blkcg_policy_data	*pd[BLKCG_MAX_POLS];
+ 
+ #ifdef CONFIG_CGROUP_WRITEBACK
+ 	struct list_head		cgwb_list;
+ #endif
  };
  
  struct blkg_stat {
diff --cc include/linux/blkdev.h
index 5ced29c,ab4a278..7f2f54b
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@@ -787,25 -788,8 +787,6 @@@ extern int scsi_cmd_ioctl(struct reques
  extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
  			 struct scsi_ioctl_command __user *);
  
- /*
-  * A queue has just exitted congestion.  Note this in the global counter of
-  * congested queues, and wake up anyone who was waiting for requests to be
-  * put back.
-  */
- static inline void blk_clear_queue_congested(struct request_queue *q, int sync)
- {
- 	clear_bdi_congested(&q->backing_dev_info, sync);
- }
- 
- /*
-  * A queue has just entered congestion.  Flag that in the queue's VM-visible
-  * state flags and increment the global gounter of congested queues.
-  */
- static inline void blk_set_queue_congested(struct request_queue *q, int sync)
- {
- 	set_bdi_congested(&q->backing_dev_info, sync);
- }
 -extern void blk_queue_bio(struct request_queue *q, struct bio *bio);
--
  extern void blk_start_queue(struct request_queue *q);
  extern void blk_stop_queue(struct request_queue *q);
  extern void blk_sync_queue(struct request_queue *q);
diff --cc mm/backing-dev.c
index 000e7b3,436bb53..7756da3
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@@ -387,49 -746,91 +746,74 @@@ int bdi_init(struct backing_dev_info *b
  	bdi->min_ratio = 0;
  	bdi->max_ratio = 100;
  	bdi->max_prop_frac = FPROP_FRAC_BASE;
- 	spin_lock_init(&bdi->wb_lock);
  	INIT_LIST_HEAD(&bdi->bdi_list);
- 	INIT_LIST_HEAD(&bdi->work_list);
+ 	init_waitqueue_head(&bdi->wb_waitq);
  
- 	bdi_wb_init(&bdi->wb, bdi);
+ 	err = wb_init(&bdi->wb, bdi, GFP_KERNEL);
+ 	if (err)
+ 		return err;
  
- 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
- 		err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL);
- 		if (err)
- 			goto err;
- 	}
+ 	bdi->wb_congested.state = 0;
+ 	bdi->wb.congested = &bdi->wb_congested;
  
- 	bdi->dirty_exceeded = 0;
+ 	cgwb_bdi_init(bdi);
+ 	return 0;
+ }
+ EXPORT_SYMBOL(bdi_init);
  
- 	bdi->bw_time_stamp = jiffies;
- 	bdi->written_stamp = 0;
+ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+ 		const char *fmt, ...)
+ {
+ 	va_list args;
+ 	struct device *dev;
  
- 	bdi->balanced_dirty_ratelimit = INIT_BW;
- 	bdi->dirty_ratelimit = INIT_BW;
- 	bdi->write_bandwidth = INIT_BW;
- 	bdi->avg_write_bandwidth = INIT_BW;
+ 	if (bdi->dev)	/* The driver needs to use separate queues per device */
+ 		return 0;
  
- 	err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
+ 	va_start(args, fmt);
+ 	dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
+ 	va_end(args);
+ 	if (IS_ERR(dev))
+ 		return PTR_ERR(dev);
  
- 	if (err) {
- err:
- 		while (i--)
- 			percpu_counter_destroy(&bdi->bdi_stat[i]);
- 	}
+ 	bdi->dev = dev;
  
- 	return err;
+ 	bdi_debug_register(bdi, dev_name(dev));
+ 	set_bit(WB_registered, &bdi->wb.state);
+ 
+ 	spin_lock_bh(&bdi_lock);
+ 	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+ 	spin_unlock_bh(&bdi_lock);
+ 
+ 	trace_writeback_bdi_register(bdi);
+ 	return 0;
  }
- EXPORT_SYMBOL(bdi_init);
+ EXPORT_SYMBOL(bdi_register);
  
- void bdi_destroy(struct backing_dev_info *bdi)
+ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
  {
- 	int i;
+ 	return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+ }
+ EXPORT_SYMBOL(bdi_register_dev);
+ 
+ /*
+  * Remove bdi from bdi_list, and ensure that it is no longer visible
+  */
+ static void bdi_remove_from_list(struct backing_dev_info *bdi)
+ {
+ 	spin_lock_bh(&bdi_lock);
+ 	list_del_rcu(&bdi->bdi_list);
+ 	spin_unlock_bh(&bdi_lock);
  
- 	bdi_wb_shutdown(bdi);
- 	bdi_set_min_ratio(bdi, 0);
+ 	synchronize_rcu_expedited();
+ }
  
- 	WARN_ON(!list_empty(&bdi->work_list));
- 	WARN_ON(delayed_work_pending(&bdi->wb.dwork));
 -/*
 - * Called when the device behind @bdi has been removed or ejected.
 - *
 - * We can't really do much here except for reducing the dirty ratio at
 - * the moment.  In the future we should be able to set a flag so that
 - * the filesystem can handle errors at mark_inode_dirty time instead
 - * of only at writeback time.
 - */
 -void bdi_unregister(struct backing_dev_info *bdi)
 -{
 -	if (WARN_ON_ONCE(!bdi->dev))
 -		return;
 -
 -	bdi_set_min_ratio(bdi, 0);
 -}
 -EXPORT_SYMBOL(bdi_unregister);
 -
+ void bdi_destroy(struct backing_dev_info *bdi)
+ {
+ 	/* make sure nobody finds us on the bdi_list anymore */
+ 	bdi_remove_from_list(bdi);
+ 	wb_shutdown(&bdi->wb);
+ 	cgwb_bdi_destroy(bdi);
  
  	if (bdi->dev) {
  		bdi_debug_unregister(bdi);
diff --cc mm/filemap.c
index 8d17cee,bfc1ab053..11f10ef
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@@ -485,15 -498,11 +500,16 @@@ int replace_page_cache_page(struct pag
  		error = radix_tree_insert(&mapping->page_tree, offset, new);
  		BUG_ON(error);
  		mapping->nrpages++;
 -		__inc_zone_page_state(new, NR_FILE_PAGES);
 +
 +		/*
 +		 * hugetlb pages do not participate in page cache accounting.
 +		 */
 +		if (!PageHuge(new))
 +			__inc_zone_page_state(new, NR_FILE_PAGES);
  		if (PageSwapBacked(new))
  			__inc_zone_page_state(new, NR_SHMEM);
- 		spin_unlock_irq(&mapping->tree_lock);
+ 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ 		mem_cgroup_end_page_stat(memcg);
  		mem_cgroup_migrate(old, new, true);
  		radix_tree_preload_end();
  		if (freepage)
diff --cc mm/page-writeback.c
index eb59f7e,e1514d5..22cddd3
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@@ -802,27 -990,27 +990,27 @@@ static void wb_position_ratio(struct di
  	 * threshold, so that the occasional writes won't be blocked and active
  	 * writes can rampup the threshold quickly.
  	 */
- 	bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
+ 	wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
  	/*
- 	 * scale global setpoint to bdi's:
- 	 *	bdi_setpoint = setpoint * bdi_thresh / thresh
+ 	 * scale global setpoint to wb's:
+ 	 *	wb_setpoint = setpoint * wb_thresh / thresh
  	 */
- 	x = div_u64((u64)bdi_thresh << 16, thresh | 1);
- 	bdi_setpoint = setpoint * (u64)x >> 16;
 -	x = div_u64((u64)wb_thresh << 16, dtc->thresh + 1);
++	x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
+ 	wb_setpoint = setpoint * (u64)x >> 16;
  	/*
- 	 * Use span=(8*write_bw) in single bdi case as indicated by
- 	 * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case.
+ 	 * Use span=(8*write_bw) in single wb case as indicated by
+ 	 * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
  	 *
- 	 *        bdi_thresh                    thresh - bdi_thresh
- 	 * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
- 	 *          thresh                            thresh
+ 	 *        wb_thresh                    thresh - wb_thresh
+ 	 * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
+ 	 *          thresh                           thresh
  	 */
- 	span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
- 	x_intercept = bdi_setpoint + span;
+ 	span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
+ 	x_intercept = wb_setpoint + span;
  
- 	if (bdi_dirty < x_intercept - span / 4) {
- 		pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty),
- 				      (x_intercept - bdi_setpoint) | 1);
+ 	if (dtc->wb_dirty < x_intercept - span / 4) {
+ 		pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
 -				      x_intercept - wb_setpoint + 1);
++				      (x_intercept - wb_setpoint) | 1);
  	} else
  		pos_ratio /= 4;
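For illustration only (not part of the commit): the mm/page-writeback.c hunk above scales the global dirty setpoint into a per-wb setpoint and derives a control span in 16-bit fixed point. The standalone userspace sketch below reproduces that arithmetic; the formulas are copied from the hunk, while the sample numbers are invented for the example.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t thresh = 1000;    /* global dirty threshold, in pages (assumed) */
	uint64_t wb_thresh = 250;  /* this wb's share of the threshold (assumed) */
	uint64_t setpoint = 900;   /* global dirty setpoint (assumed) */
	uint64_t write_bw = 100;   /* wb write bandwidth, pages/sec (assumed) */

	/* x = wb_thresh / thresh in 16-bit fixed point; "| 1" avoids a zero divisor */
	uint64_t x = (wb_thresh << 16) / (thresh | 1);

	/* wb_setpoint = setpoint * wb_thresh / thresh */
	uint64_t wb_setpoint = (setpoint * x) >> 16;

	/*
	 *        wb_thresh                    thresh - wb_thresh
	 * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
	 *          thresh                           thresh
	 */
	uint64_t span = ((thresh - wb_thresh + 8 * write_bw) * x) >> 16;

	printf("wb_setpoint=%llu span=%llu x_intercept=%llu\n",
	       (unsigned long long)wb_setpoint,
	       (unsigned long long)span,
	       (unsigned long long)(wb_setpoint + span));
	return 0;
}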