From c42843f2f0bbc9d716a32caf667d18fc2bf3bc4c Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 2 Mar 2011 15:54:09 -0600 Subject: [PATCH] writeback: introduce smoothed global dirty limit The start of a heavy weight application (ie. KVM) may instantly knock down determine_dirtyable_memory() if the swap is not enabled or full. global_dirty_limits() and bdi_dirty_limit() will in turn get global/bdi dirty thresholds that are _much_ lower than the global/bdi dirty pages. balance_dirty_pages() will then heavily throttle all dirtiers including the light ones, until the dirty pages drop below the new dirty thresholds. During this _deep_ dirty-exceeded state, the system may appear rather unresponsive to the users. About "deep" dirty-exceeded: task_dirty_limit() assigns 1/8 lower dirty threshold to heavy dirtiers than light ones, and the dirty pages will be throttled around the heavy dirtiers' dirty threshold and reasonably below the light dirtiers' dirty threshold. In this state, only the heavy dirtiers will be throttled and the dirty pages are carefully controlled to not exceed the light dirtiers' dirty threshold. However if the threshold itself suddenly drops below the number of dirty pages, the light dirtiers will get heavily throttled. So introduce global_dirty_limit for tracking the global dirty threshold with policies - follow downwards slowly - follow up in one shot global_dirty_limit can effectively mask out the impact of sudden drop of dirtyable memory. It will be used in the next patch for two new type of dirty limits. Note that the new dirty limits are not going to avoid throttling the light dirtiers, but could limit their sleep time to 200ms. Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 2 +- include/linux/writeback.h | 6 ++++ mm/page-writeback.c | 74 +++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 79 insertions(+), 3 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5826992..227ff12 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -699,7 +699,7 @@ static inline bool over_bground_thresh(void) static void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) { - __bdi_update_bandwidth(wb->bdi, start_time); + __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); } /* diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 66862f2..e9d371b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -84,6 +84,8 @@ static inline void laptop_sync_completion(void) { } #endif void throttle_vm_writeout(gfp_t gfp_mask); +extern unsigned long global_dirty_limit; + /* These are exported to sysctl. */ extern int dirty_background_ratio; extern unsigned long dirty_background_bytes; @@ -119,6 +121,10 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty); void __bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, unsigned long start_time); void page_writeback_init(void); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5f3e1b4..da95995 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -116,6 +116,7 @@ EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ +unsigned long global_dirty_limit; /* * Scale the writeback cache size proportional to the relative writeout speeds. @@ -516,7 +517,67 @@ out: bdi->avg_write_bandwidth = avg; } +/* + * The global dirtyable memory and dirty threshold could be suddenly knocked + * down by a large amount (eg. on the startup of KVM in a swapless system). + * This may throw the system into deep dirty exceeded state and throttle + * heavy/light dirtiers alike. To retain good responsiveness, maintain + * global_dirty_limit for tracking slowly down to the knocked down dirty + * threshold. + */ +static void update_dirty_limit(unsigned long thresh, unsigned long dirty) +{ + unsigned long limit = global_dirty_limit; + + /* + * Follow up in one step. + */ + if (limit < thresh) { + limit = thresh; + goto update; + } + + /* + * Follow down slowly. Use the higher one as the target, because thresh + * may drop below dirty. This is exactly the reason to introduce + * global_dirty_limit which is guaranteed to lie above the dirty pages. + */ + thresh = max(thresh, dirty); + if (limit > thresh) { + limit -= (limit - thresh) >> 5; + goto update; + } + return; +update: + global_dirty_limit = limit; +} + +static void global_update_bandwidth(unsigned long thresh, + unsigned long dirty, + unsigned long now) +{ + static DEFINE_SPINLOCK(dirty_lock); + static unsigned long update_time; + + /* + * check locklessly first to optimize away locking for the most time + */ + if (time_before(now, update_time + BANDWIDTH_INTERVAL)) + return; + + spin_lock(&dirty_lock); + if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) { + update_dirty_limit(thresh, dirty); + update_time = now; + } + spin_unlock(&dirty_lock); +} + void __bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, unsigned long start_time) { unsigned long now = jiffies; @@ -538,6 +599,9 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi, if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) goto snapshot; + if (thresh) + global_update_bandwidth(thresh, dirty, now); + bdi_update_write_bandwidth(bdi, elapsed, written); snapshot: @@ -546,12 +610,17 @@ snapshot: } static void bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, unsigned long start_time) { if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) return; spin_lock(&bdi->wb.list_lock); - __bdi_update_bandwidth(bdi, start_time); + __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty, + start_time); spin_unlock(&bdi->wb.list_lock); } @@ -630,7 +699,8 @@ static void balance_dirty_pages(struct address_space *mapping, if (!bdi->dirty_exceeded) bdi->dirty_exceeded = 1; - bdi_update_bandwidth(bdi, start_time); + bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty, + bdi_thresh, bdi_dirty, start_time); /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. * Unstable writes are a feature of certain networked -- 2.7.4