static DEFINE_MUTEX(blkcg_pol_mutex);
-struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
- .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
+struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);
+struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
+
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
static bool blkcg_policy_enabled(struct request_queue *q,
spin_lock_init(&blkcg->lock);
INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
INIT_HLIST_HEAD(&blkcg->blkg_list);
-
+#ifdef CONFIG_CGROUP_WRITEBACK
+ INIT_LIST_HEAD(&blkcg->cgwb_list);
+#endif
return &blkcg->css;
+
+free_pd_blkcg:
+ for (i--; i >= 0; i--)
+ kfree(blkcg->pd[i]);
+
+free_blkcg:
+ kfree(blkcg);
+ return ret;
}
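
A minimal userspace sketch of the unwind pattern the new labels give blkcg_css_alloc(): when an allocation fails partway through filling pd[], walk the already-filled slots backwards, free them, then free the container itself. All names below are invented for illustration; only the shape matches the kernel code above.

#include <stdlib.h>

#define MAX_POLS 5

struct container {
	void *pd[MAX_POLS];
};

static struct container *container_alloc(void)
{
	struct container *c = calloc(1, sizeof(*c));
	int i;

	if (!c)
		return NULL;

	for (i = 0; i < MAX_POLS; i++) {
		c->pd[i] = malloc(64);
		if (!c->pd[i])
			goto free_pd;	/* unwind whatever is filled so far */
	}
	return c;

free_pd:
	for (i--; i >= 0; i--)	/* same backwards walk as free_pd_blkcg */
		free(c->pd[i]);
	free(c);
	return NULL;
}

int main(void)
{
	struct container *c = container_alloc();
	int i;

	if (c) {
		for (i = 0; i < MAX_POLS; i++)
			free(c->pd[i]);
		free(c);
	}
	return 0;
}
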
/**
const struct blkcg_policy *pol)
{
LIST_HEAD(pds);
- struct blkcg_gq *blkg, *new_blkg;
+ LIST_HEAD(cpds);
- struct blkg_policy_data *pd, *n;
+ struct blkcg_gq *blkg;
+ struct blkg_policy_data *pd, *nd;
+ struct blkcg_policy_data *cpd, *cnd;
int cnt = 0, ret;
- bool preloaded;
if (blkcg_policy_enabled(q, pol))
return 0;
- /* preallocations for root blkg */
- new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
- if (!new_blkg)
- return -ENOMEM;
-
+ /* count and allocate policy_data for all existing blkgs */
blk_queue_bypass_start(q);
-
- preloaded = !radix_tree_preload(GFP_KERNEL);
-
- /*
- * Make sure the root blkg exists and count the existing blkgs. As
- * @q is bypassing at this point, blkg_lookup_create() can't be
- * used. Open code it.
- */
spin_lock_irq(q->queue_lock);
-
- rcu_read_lock();
- blkg = __blkg_lookup(&blkcg_root, q, false);
- if (blkg)
- blkg_free(new_blkg);
- else
- blkg = blkg_create(&blkcg_root, q, new_blkg);
- rcu_read_unlock();
-
- if (preloaded)
- radix_tree_preload_end();
-
- if (IS_ERR(blkg)) {
- ret = PTR_ERR(blkg);
- goto out_unlock;
- }
-
list_for_each_entry(blkg, &q->blkg_list, q_node)
cnt++;
-
spin_unlock_irq(q->queue_lock);
+ /*
+ * Allocate per-blkg and per-blkcg policy data
+ * for all existing blkgs.
+ */
while (cnt--) {
pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
if (!pd) {
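
The hunk above also converts blkcg_activate_policy() to a two-phase scheme: count the existing blkgs under queue_lock, drop the lock, then do the GFP_KERNEL allocations (which may sleep) before retaking the lock to install the results. A pthread-based sketch of that shape, with invented names and the attach step left to the caller:

#include <pthread.h>
#include <stdlib.h>

struct node { struct node *next; };

static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *blkg_list;

int activate(size_t pd_size, void ***pdsp)
{
	struct node *n;
	void **pds;
	int cnt = 0, i;

	/* phase 1: only count while holding the lock */
	pthread_mutex_lock(&q_lock);
	for (n = blkg_list; n; n = n->next)
		cnt++;
	pthread_mutex_unlock(&q_lock);

	/* phase 2: allocate outside the lock, where blocking is fine */
	pds = calloc(cnt + 1, sizeof(*pds));	/* +1: non-NULL even if empty */
	if (!pds)
		return -1;
	for (i = 0; i < cnt; i++) {
		pds[i] = calloc(1, pd_size);
		if (!pds[i]) {
			while (i--)
				free(pds[i]);
			free(pds);
			return -1;
		}
	}
	*pdsp = pds;	/* caller retakes the lock and attaches these */
	return 0;
}
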
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...);
int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
-void bdi_unregister(struct backing_dev_info *bdi);
int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
-void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
- enum wb_reason reason);
-void bdi_start_background_writeback(struct backing_dev_info *bdi);
-void bdi_writeback_workfn(struct work_struct *work);
-int bdi_has_dirty_io(struct backing_dev_info *bdi);
-void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
+void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
+ bool range_cyclic, enum wb_reason reason);
+void wb_start_background_writeback(struct bdi_writeback *wb);
+void wb_workfn(struct work_struct *work);
+void wb_wakeup_delayed(struct bdi_writeback *wb);
extern spinlock_t bdi_lock;
extern struct list_head bdi_list;
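
The renamed declarations above move writeback kicks from the backing_dev_info as a whole to one of its bdi_writeback structures, and wb_start_writeback() grows an explicit range_cyclic flag. A stub sketch of what a call site sees; the struct, the stub body, and the printf are stand-ins, only the signature comes from the hunk:

#include <stdio.h>

struct bdi_writeback { const char *name; };
enum wb_reason { WB_REASON_SYNC };

/* stub with the new signature; the real one queues work on the wb */
static void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
			       int range_cyclic, enum wb_reason reason)
{
	printf("kick %s: %ld pages, cyclic=%d, reason=%d\n",
	       wb->name, nr_pages, range_cyclic, reason);
}

int main(void)
{
	struct bdi_writeback wb = { .name = "8:0" };

	/* pre-patch shape was bdi_start_writeback(bdi, 1024, WB_REASON_SYNC) */
	wb_start_writeback(&wb, 1024, 1, WB_REASON_SYNC);
	return 0;
}
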
struct blkcg_gq *blkg_hint;
struct hlist_head blkg_list;
- /* TODO: per-policy storage in blkcg */
- unsigned int cfq_weight; /* belongs to cfq */
- unsigned int cfq_leaf_weight;
+ struct blkcg_policy_data *pd[BLKCG_MAX_POLS];
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct list_head cgwb_list;
+#endif
};
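
struct blkcg now carries an opaque per-policy slot array instead of cfq-only weight fields; each policy indexes pd[] with its own policy id. A compilable sketch of that lookup shape, with invented type names and an arbitrary array size:

#include <stdio.h>

#define MAX_POLS 5

struct policy_data { unsigned int weight; };

struct blkcg_sketch {
	struct policy_data *pd[MAX_POLS];
};

/* how a policy reaches its own per-cgroup data by policy id (plid) */
static struct policy_data *cpd_lookup(struct blkcg_sketch *blkcg, int plid)
{
	return blkcg ? blkcg->pd[plid] : NULL;
}

int main(void)
{
	struct policy_data cfq = { .weight = 500 };
	struct blkcg_sketch blkcg = { .pd = { [1] = &cfq } };

	printf("weight=%u\n", cpd_lookup(&blkcg, 1)->weight);
	return 0;
}
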
struct blkg_stat {
extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
struct scsi_ioctl_command __user *);
-/*
- * A queue has just exitted congestion. Note this in the global counter of
- * congested queues, and wake up anyone who was waiting for requests to be
- * put back.
- */
-static inline void blk_clear_queue_congested(struct request_queue *q, int sync)
-{
- clear_bdi_congested(&q->backing_dev_info, sync);
-}
-
-/*
- * A queue has just entered congestion. Flag that in the queue's VM-visible
- * state flags and increment the global gounter of congested queues.
- */
-static inline void blk_set_queue_congested(struct request_queue *q, int sync)
-{
- set_bdi_congested(&q->backing_dev_info, sync);
-}
-extern void blk_queue_bio(struct request_queue *q, struct bio *bio);
-
extern void blk_start_queue(struct request_queue *q);
extern void blk_stop_queue(struct request_queue *q);
extern void blk_sync_queue(struct request_queue *q);
bdi->min_ratio = 0;
bdi->max_ratio = 100;
bdi->max_prop_frac = FPROP_FRAC_BASE;
- spin_lock_init(&bdi->wb_lock);
INIT_LIST_HEAD(&bdi->bdi_list);
- INIT_LIST_HEAD(&bdi->work_list);
+ init_waitqueue_head(&bdi->wb_waitq);
- bdi_wb_init(&bdi->wb, bdi);
+ err = wb_init(&bdi->wb, bdi, GFP_KERNEL);
+ if (err)
+ return err;
- for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
- err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL);
- if (err)
- goto err;
- }
+ bdi->wb_congested.state = 0;
+ bdi->wb.congested = &bdi->wb_congested;
- bdi->dirty_exceeded = 0;
+ cgwb_bdi_init(bdi);
+ return 0;
+}
+EXPORT_SYMBOL(bdi_init);
- bdi->bw_time_stamp = jiffies;
- bdi->written_stamp = 0;
+int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+ const char *fmt, ...)
+{
+ va_list args;
+ struct device *dev;
- bdi->balanced_dirty_ratelimit = INIT_BW;
- bdi->dirty_ratelimit = INIT_BW;
- bdi->write_bandwidth = INIT_BW;
- bdi->avg_write_bandwidth = INIT_BW;
+ if (bdi->dev) /* The driver needs to use separate queues per device */
+ return 0;
- err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
+ va_start(args, fmt);
+ dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
+ va_end(args);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
- if (err) {
-err:
- while (i--)
- percpu_counter_destroy(&bdi->bdi_stat[i]);
- }
+ bdi->dev = dev;
- return err;
+ bdi_debug_register(bdi, dev_name(dev));
+ set_bit(WB_registered, &bdi->wb.state);
+
+ spin_lock_bh(&bdi_lock);
+ list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+ spin_unlock_bh(&bdi_lock);
+
+ trace_writeback_bdi_register(bdi);
+ return 0;
}
-EXPORT_SYMBOL(bdi_init);
+EXPORT_SYMBOL(bdi_register);
-void bdi_destroy(struct backing_dev_info *bdi)
+int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
{
- int i;
+ return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+}
+EXPORT_SYMBOL(bdi_register_dev);
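
bdi_register() above forwards its format string and va_list to device_create_vargs(), and bdi_register_dev() is a thin "%u:%u" wrapper over it. The same forwarding shape in plain C, with vsnprintf standing in for the driver-core call and all names invented:

#include <stdarg.h>
#include <stdio.h>

static int name_register(char *buf, size_t len, const char *fmt, ...)
{
	va_list args;
	int ret;

	va_start(args, fmt);
	ret = vsnprintf(buf, len, fmt, args);	/* stand-in for device_create_vargs() */
	va_end(args);
	return ret < 0 ? -1 : 0;
}

/* the bdi_register_dev() analogue: fixed major:minor formatting */
static int name_register_dev(char *buf, size_t len, unsigned int major, unsigned int minor)
{
	return name_register(buf, len, "%u:%u", major, minor);
}

int main(void)
{
	char name[32];

	if (!name_register_dev(name, sizeof(name), 8, 0))
		puts(name);	/* prints "8:0" */
	return 0;
}
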
+
+/*
+ * Remove bdi from bdi_list, and ensure that it is no longer visible
+ */
+static void bdi_remove_from_list(struct backing_dev_info *bdi)
+{
+ spin_lock_bh(&bdi_lock);
+ list_del_rcu(&bdi->bdi_list);
+ spin_unlock_bh(&bdi_lock);
- bdi_wb_shutdown(bdi);
- bdi_set_min_ratio(bdi, 0);
+ synchronize_rcu_expedited();
+}
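
bdi_remove_from_list() pairs list_del_rcu() with synchronize_rcu_expedited(): once it returns, no RCU reader can still hold a reference it found on bdi_list, so teardown may proceed. The same sequence sketched with userspace RCU; liburcu is an assumption here (link with -lurcu), the kernel uses its own RCU implementation:

#include <pthread.h>
#include <urcu.h>
#include <urcu/rculist.h>

struct bdi_sketch {
	struct cds_list_head bdi_node;
};

static pthread_mutex_t bdi_lock_sk = PTHREAD_MUTEX_INITIALIZER;
CDS_LIST_HEAD(bdi_list_sk);

void bdi_remove_from_list_sk(struct bdi_sketch *bdi)
{
	pthread_mutex_lock(&bdi_lock_sk);
	cds_list_del_rcu(&bdi->bdi_node);	/* unlink; readers may still see it */
	pthread_mutex_unlock(&bdi_lock_sk);

	synchronize_rcu();	/* wait out every pre-existing RCU read side */
	/* from here no lookup can return bdi; safe to shut down and free */
}
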
- WARN_ON(!list_empty(&bdi->work_list));
- WARN_ON(delayed_work_pending(&bdi->wb.dwork));
-/*
- * Called when the device behind @bdi has been removed or ejected.
- *
- * We can't really do much here except for reducing the dirty ratio at
- * the moment. In the future we should be able to set a flag so that
- * the filesystem can handle errors at mark_inode_dirty time instead
- * of only at writeback time.
- */
-void bdi_unregister(struct backing_dev_info *bdi)
-{
- if (WARN_ON_ONCE(!bdi->dev))
- return;
-
- bdi_set_min_ratio(bdi, 0);
-}
-EXPORT_SYMBOL(bdi_unregister);
-
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+ /* make sure nobody finds us on the bdi_list anymore */
+ bdi_remove_from_list(bdi);
+ wb_shutdown(&bdi->wb);
+ cgwb_bdi_destroy(bdi);
if (bdi->dev) {
bdi_debug_unregister(bdi);
error = radix_tree_insert(&mapping->page_tree, offset, new);
BUG_ON(error);
mapping->nrpages++;
- __inc_zone_page_state(new, NR_FILE_PAGES);
+
+ /*
+ * hugetlb pages do not participate in page cache accounting.
+ */
+ if (!PageHuge(new))
+ __inc_zone_page_state(new, NR_FILE_PAGES);
if (PageSwapBacked(new))
__inc_zone_page_state(new, NR_SHMEM);
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
mem_cgroup_migrate(old, new, true);
radix_tree_preload_end();
if (freepage)
* threshold, so that the occasional writes won't be blocked and active
* writes can rampup the threshold quickly.
*/
- bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
+ wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
/*
- * scale global setpoint to bdi's:
- * bdi_setpoint = setpoint * bdi_thresh / thresh
+ * scale global setpoint to wb's:
+ * wb_setpoint = setpoint * wb_thresh / thresh
*/
- x = div_u64((u64)bdi_thresh << 16, thresh | 1);
- bdi_setpoint = setpoint * (u64)x >> 16;
+ x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
+ wb_setpoint = setpoint * (u64)x >> 16;
/*
- * Use span=(8*write_bw) in single bdi case as indicated by
- * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case.
+ * Use span=(8*write_bw) in single wb case as indicated by
+ * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
*
- * bdi_thresh thresh - bdi_thresh
- * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
- * thresh thresh
+ * wb_thresh thresh - wb_thresh
+ * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
+ * thresh thresh
*/
- span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
- x_intercept = bdi_setpoint + span;
+ span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
+ x_intercept = wb_setpoint + span;
- if (bdi_dirty < x_intercept - span / 4) {
- pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty),
- (x_intercept - bdi_setpoint) | 1);
+ if (dtc->wb_dirty < x_intercept - span / 4) {
+ pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
+ (x_intercept - wb_setpoint) | 1);
} else
pos_ratio /= 4;
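
To make the wb_setpoint/span arithmetic above concrete: a worked instance with made-up numbers (thresh = 1000, wb_thresh = 250, setpoint = 800, write_bw = 100), using the same 16.16 fixed-point scaling and the same "| 1" divide-by-zero guard as the hunk:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t thresh = 1000, wb_thresh = 250;
	uint64_t setpoint = 800, write_bw = 100;

	/* x = wb_thresh / thresh in 16.16 fixed point */
	uint64_t x = (wb_thresh << 16) / (thresh | 1);
	uint64_t wb_setpoint = (setpoint * x) >> 16;
	uint64_t span = ((thresh - wb_thresh + 8 * write_bw) * x) >> 16;
	uint64_t x_intercept = wb_setpoint + span;

	/*
	 * Prints wb_setpoint=199 span=387 x_intercept=586: a wb that owns a
	 * quarter of the global threshold gets its setpoint scaled to about
	 * a quarter of the global one, and pos_ratio decays to zero as
	 * wb_dirty approaches x_intercept.
	 */
	printf("wb_setpoint=%llu span=%llu x_intercept=%llu\n",
	       (unsigned long long)wb_setpoint,
	       (unsigned long long)span,
	       (unsigned long long)x_intercept);
	return 0;
}
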