// SPDX-License-Identifier: GPL-2.0
/*
 * buffered writeback throttling. Loosely based on CoDel. We can't drop
 * packets for IO scheduling, so the logic is something like this:
 *
 * - Monitor latencies in a defined window of time.
 * - If the minimum latency in the above window exceeds some target, increment
 *   the scaling step and scale down the queue depth by a factor of 2x. The
 *   monitoring window is then shrunk to 100 / sqrt(scaling step + 1).
 * - For any window where we don't have solid data on what the latencies
 *   look like, retain the status quo.
 * - If latencies look good, decrement the scaling step.
 * - If we're only doing writes, allow the scaling step to go negative. This
 *   will temporarily boost write performance, snapping back to a stable
 *   scaling step of 0 if reads show up or the heavy writers finish. Unlike
 *   positive scaling steps, where we shrink the monitoring window, a negative
 *   scaling step retains the default step == 0 window size.
 *
 * Copyright (C) 2016 Jens Axboe
 */

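/*
 * Editor's note (illustrative, not from the original source): with the
 * default 100msec window, the shrink described above works out to roughly
 * 100msec at step 0, ~70msec at step 1 (100 / sqrt(2)) and 50msec at
 * step 3 (100 / sqrt(4)), i.e. the harder we have had to throttle, the
 * sooner we re-evaluate the latencies.
 */
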
#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>

#include "blk-wbt.h"
#include "blk-rq-qos.h"
#include "elevator.h"

#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>

static inline void wbt_clear_state(struct request *rq)
{
        rq->wbt_flags = 0;
}

static inline enum wbt_flags wbt_flags(struct request *rq)
{
        return rq->wbt_flags;
}

static inline bool wbt_is_tracked(struct request *rq)
{
        return rq->wbt_flags & WBT_TRACKED;
}

static inline bool wbt_is_read(struct request *rq)
{
        return rq->wbt_flags & WBT_READ;
}

enum {
        /*
         * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
         * from here depending on device stats
         */
        RWB_DEF_DEPTH = 16,

        /*
         * 100msec window
         */
        RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL,

        /*
         * Disregard stats, if we don't meet this minimum
         */
        RWB_MIN_WRITE_SAMPLES = 3,

        /*
         * If we have this number of consecutive windows without enough
         * information to scale up or down, scale up.
         */
        RWB_UNKNOWN_BUMP = 5,
};

static inline bool rwb_enabled(struct rq_wb *rwb)
{
        return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT &&
                      rwb->wb_normal != 0;
}

static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
{
        if (rwb_enabled(rwb)) {
                const unsigned long cur = jiffies;

                if (cur != *var)
                        *var = cur;
        }
}

/*
 * If a task was rate throttled in balance_dirty_pages() within the last
 * second or so, use that to indicate a higher cleaning rate.
 */
static bool wb_recent_wait(struct rq_wb *rwb)
{
        struct bdi_writeback *wb = &rwb->rqos.q->disk->bdi->wb;

        return time_before(jiffies, wb->dirty_sleep + HZ);
}

static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
                                          enum wbt_flags wb_acct)
{
        if (wb_acct & WBT_KSWAPD)
                return &rwb->rq_wait[WBT_RWQ_KSWAPD];
        else if (wb_acct & WBT_DISCARD)
                return &rwb->rq_wait[WBT_RWQ_DISCARD];

        return &rwb->rq_wait[WBT_RWQ_BG];
}

static void rwb_wake_all(struct rq_wb *rwb)
{
        int i;

        for (i = 0; i < WBT_NUM_RWQ; i++) {
                struct rq_wait *rqw = &rwb->rq_wait[i];

                if (wq_has_sleeper(&rqw->wait))
                        wake_up_all(&rqw->wait);
        }
}

static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
                         enum wbt_flags wb_acct)
{
        int inflight, limit;

        inflight = atomic_dec_return(&rqw->inflight);

        /*
         * wbt got disabled with IO in flight. Wake up any potential
         * waiters, we don't have to do more than that.
         */
        if (unlikely(!rwb_enabled(rwb))) {
                rwb_wake_all(rwb);
                return;
        }

        /*
         * For discards, our limit is always the background. For writes, if
         * the device does write back caching, drop further down before we
         * wake people up.
         */
        if (wb_acct & WBT_DISCARD)
                limit = rwb->wb_background;
        else if (rwb->wc && !wb_recent_wait(rwb))
                limit = 0;
        else
                limit = rwb->wb_normal;

        /*
         * Don't wake anyone up if we are above the normal limit.
         */
        if (inflight && inflight >= limit)
                return;

        if (wq_has_sleeper(&rqw->wait)) {
                int diff = limit - inflight;

                if (!inflight || diff >= rwb->wb_background / 2)
                        wake_up_all(&rqw->wait);
        }
}

static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
{
        struct rq_wb *rwb = RQWB(rqos);
        struct rq_wait *rqw;

        if (!(wb_acct & WBT_TRACKED))
                return;

        rqw = get_rq_wait(rwb, wb_acct);
        wbt_rqw_done(rwb, rqw, wb_acct);
}

/*
 * Called on completion of a request. Note that it's also called when
 * a request is merged, i.e. when the request gets freed.
 */
static void wbt_done(struct rq_qos *rqos, struct request *rq)
{
        struct rq_wb *rwb = RQWB(rqos);

        if (!wbt_is_tracked(rq)) {
                if (rwb->sync_cookie == rq) {
                        rwb->sync_issue = 0;
                        rwb->sync_cookie = NULL;
                }

                if (wbt_is_read(rq))
                        wb_timestamp(rwb, &rwb->last_comp);
        } else {
                WARN_ON_ONCE(rq == rwb->sync_cookie);
                __wbt_done(rqos, wbt_flags(rq));
        }
        wbt_clear_state(rq);
}

static inline bool stat_sample_valid(struct blk_rq_stat *stat)
{
        /*
         * We need at least one read sample, and a minimum of
         * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
         * that it's writes impacting us, and not just some sole read on
         * a device that is in a lower power state.
         */
        return (stat[READ].nr_samples >= 1 &&
                stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
}

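/*
 * Editor's note (illustrative): a window with 1 read and 3 writes is
 * considered valid; a window with a single read and no writes is not,
 * since that lone read can't tell us whether writeback is the culprit.
 */
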
static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
{
        u64 now, issue = READ_ONCE(rwb->sync_issue);

        if (!issue || !rwb->sync_cookie)
                return 0;

        now = ktime_to_ns(ktime_get());
        return now - issue;
}

enum {
        LAT_OK = 1,
        LAT_UNKNOWN,
        LAT_UNKNOWN_WRITES,
        LAT_EXCEEDED,
};

static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
{
        struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi;
        struct rq_depth *rqd = &rwb->rq_depth;
        u64 thislat;

        /*
         * If our stored sync issue exceeds the window size, or it
         * exceeds our min target AND we haven't logged any entries,
         * flag the latency as exceeded. wbt works off completion latencies,
         * but for a flooded device, a single sync IO can take a long time
         * to complete after being issued. If this time exceeds our
         * monitoring window AND we didn't see any other completions in that
         * window, then count that sync IO as a violation of the latency.
         */
        thislat = rwb_sync_issue_lat(rwb);
        if (thislat > rwb->cur_win_nsec ||
            (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
                trace_wbt_lat(bdi, thislat);
                return LAT_EXCEEDED;
        }

        /*
         * No read/write mix, if the stats aren't valid.
         */
        if (!stat_sample_valid(stat)) {
                /*
                 * If we had writes in this stat window and the window is
                 * current, we're only doing writes. If a task recently
                 * waited or still has writes in flight, consider us doing
                 * just writes as well.
                 */
                if (stat[WRITE].nr_samples || wb_recent_wait(rwb) ||
                    wbt_inflight(rwb))
                        return LAT_UNKNOWN_WRITES;
                return LAT_UNKNOWN;
        }

        /*
         * If the 'min' latency exceeds our target, step down.
         */
        if (stat[READ].min > rwb->min_lat_nsec) {
                trace_wbt_lat(bdi, stat[READ].min);
                trace_wbt_stat(bdi, stat);
                return LAT_EXCEEDED;
        }

        if (rqd->scale_step)
                trace_wbt_stat(bdi, stat);

        return LAT_OK;
}

static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
{
        struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi;
        struct rq_depth *rqd = &rwb->rq_depth;

        trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec,
                        rwb->wb_background, rwb->wb_normal, rqd->max_depth);
}

static void calc_wb_limits(struct rq_wb *rwb)
{
        if (rwb->min_lat_nsec == 0) {
                rwb->wb_normal = rwb->wb_background = 0;
        } else if (rwb->rq_depth.max_depth <= 2) {
                rwb->wb_normal = rwb->rq_depth.max_depth;
                rwb->wb_background = 1;
        } else {
                rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2;
                rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4;
        }
}

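/*
 * Editor's note (illustrative): with max_depth == 16 the above gives
 * wb_normal == (16 + 1) / 2 == 8 and wb_background == (16 + 3) / 4 == 4,
 * i.e. background writeback gets roughly a quarter of the scaled depth
 * and normal buffered writeback roughly half.
 */
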
static void scale_up(struct rq_wb *rwb)
{
        if (!rq_depth_scale_up(&rwb->rq_depth))
                return;
        calc_wb_limits(rwb);
        rwb->unknown_cnt = 0;
        rwb_wake_all(rwb);
        rwb_trace_step(rwb, tracepoint_string("scale up"));
}

static void scale_down(struct rq_wb *rwb, bool hard_throttle)
{
        if (!rq_depth_scale_down(&rwb->rq_depth, hard_throttle))
                return;
        calc_wb_limits(rwb);
        rwb->unknown_cnt = 0;
        rwb_trace_step(rwb, tracepoint_string("scale down"));
}

static void rwb_arm_timer(struct rq_wb *rwb)
{
        struct rq_depth *rqd = &rwb->rq_depth;

        if (rqd->scale_step > 0) {
                /*
                 * We should speed this up, using some variant of a fast
                 * integer inverse square root calculation. Since we only do
                 * this for every window expiration, it's not a huge deal,
                 * though.
                 */
                rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
                                        int_sqrt((rqd->scale_step + 1) << 8));
        } else {
                /*
                 * For step < 0, we don't want to increase/decrease the
                 * window size.
                 */
                rwb->cur_win_nsec = rwb->win_nsec;
        }

        blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
}

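/*
 * Editor's note on the fixed-point math above: win_nsec << 4 is
 * win_nsec * 16 and int_sqrt((scale_step + 1) << 8) is roughly
 * 16 * sqrt(scale_step + 1), so cur_win_nsec comes out to approximately
 * win_nsec / sqrt(scale_step + 1). At scale_step == 1 with the default
 * 100msec window that is ~70msec.
 */
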
static void wb_timer_fn(struct blk_stat_callback *cb)
{
        struct rq_wb *rwb = cb->data;
        struct rq_depth *rqd = &rwb->rq_depth;
        unsigned int inflight = wbt_inflight(rwb);
        int status;

        if (!rwb->rqos.q->disk)
                return;

        status = latency_exceeded(rwb, cb->stat);

        trace_wbt_timer(rwb->rqos.q->disk->bdi, status, rqd->scale_step,
                        inflight);

        /*
         * If we exceeded the latency target, step down. If we did not,
         * step one level up. If we don't know enough to say either exceeded
         * or ok, then don't do anything.
         */
        switch (status) {
        case LAT_EXCEEDED:
                scale_down(rwb, true);
                break;
        case LAT_OK:
                scale_up(rwb);
                break;
        case LAT_UNKNOWN_WRITES:
                /*
                 * We started at the center step, and don't have a valid
                 * read/write sample, but we do have writes going on.
                 * Allow the step to go negative, to increase write perf.
                 */
                scale_up(rwb);
                break;
        case LAT_UNKNOWN:
                if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
                        break;
                /*
                 * We get here when we previously scaled the queue depth, and
                 * we currently don't have a valid read/write sample. For
                 * that case, slowly return to the center state (step == 0).
                 */
                if (rqd->scale_step > 0)
                        scale_up(rwb);
                else if (rqd->scale_step < 0)
                        scale_down(rwb, false);
                break;
        default:
                break;
        }

        /*
         * Re-arm the timer, if we have IO in flight.
         */
        if (rqd->scale_step || inflight)
                rwb_arm_timer(rwb);
}

static void wbt_update_limits(struct rq_wb *rwb)
{
        struct rq_depth *rqd = &rwb->rq_depth;

        rqd->scale_step = 0;
        rqd->scaled_max = false;

        rq_depth_calc_max_depth(rqd);
        calc_wb_limits(rwb);

        rwb_wake_all(rwb);
}

bool wbt_disabled(struct request_queue *q)
{
        struct rq_qos *rqos = wbt_rq_qos(q);

        return !rqos || RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT ||
               RQWB(rqos)->enable_state == WBT_STATE_OFF_MANUAL;
}

u64 wbt_get_min_lat(struct request_queue *q)
{
        struct rq_qos *rqos = wbt_rq_qos(q);

        if (!rqos)
                return 0;
        return RQWB(rqos)->min_lat_nsec;
}

void wbt_set_min_lat(struct request_queue *q, u64 val)
{
        struct rq_qos *rqos = wbt_rq_qos(q);

        if (!rqos)
                return;

        RQWB(rqos)->min_lat_nsec = val;
        if (val)
                RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
        else
                RQWB(rqos)->enable_state = WBT_STATE_OFF_MANUAL;

        wbt_update_limits(RQWB(rqos));
}

static bool close_io(struct rq_wb *rwb)
{
        const unsigned long now = jiffies;

        return time_before(now, rwb->last_issue + HZ / 10) ||
                time_before(now, rwb->last_comp + HZ / 10);
}

#define REQ_HIPRIO	(REQ_SYNC | REQ_META | REQ_PRIO)

static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf)
{
        unsigned int limit;

        /*
         * If we got disabled, just return UINT_MAX. This ensures that
         * we'll properly inc a new IO, and dec+wakeup at the end.
         */
        if (!rwb_enabled(rwb))
                return UINT_MAX;

        if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD)
                return rwb->wb_background;

        /*
         * At this point we know it's a buffered write. If this is
         * kswapd trying to free memory, or REQ_SYNC is set, then
         * it's WB_SYNC_ALL writeback, and we'll use the max limit for
         * that. If the write is marked as a background write, then use
         * the idle limit, or go to normal if we haven't had competing
         * IO for a bit.
         */
        if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
                limit = rwb->rq_depth.max_depth;
        else if ((opf & REQ_BACKGROUND) || close_io(rwb)) {
                /*
                 * If less than 100ms since we completed unrelated IO,
                 * limit us to half the depth for background writeback.
                 */
                limit = rwb->wb_background;
        } else
                limit = rwb->wb_normal;

        return limit;
}

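/*
 * Editor's note: the limits picked above are ordered
 * rq_depth.max_depth >= wb_normal >= wb_background, so sync/kswapd
 * writeback gets the most room, background writeback the least, and
 * ordinary buffered writeback sits in between.
 */
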
struct wbt_wait_data {
        struct rq_wb *rwb;
        enum wbt_flags wb_acct;
        blk_opf_t opf;
};

static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data)
{
        struct wbt_wait_data *data = private_data;
        return rq_wait_inc_below(rqw, get_limit(data->rwb, data->opf));
}

static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data)
{
        struct wbt_wait_data *data = private_data;
        wbt_rqw_done(data->rwb, rqw, data->wb_acct);
}

/*
 * Block if we will exceed our limit, or if we are currently waiting for
 * the timer to kick off queuing again.
 */
static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
                       blk_opf_t opf)
{
        struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
        struct wbt_wait_data data = {
                .rwb = rwb,
                .wb_acct = wb_acct,
                .opf = opf,
        };

        rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb);
}

static inline bool wbt_should_throttle(struct bio *bio)
{
        switch (bio_op(bio)) {
        case REQ_OP_WRITE:
                /*
                 * Don't throttle WRITE_ODIRECT
                 */
                if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) ==
                    (REQ_SYNC | REQ_IDLE))
                        return false;
                fallthrough;
        case REQ_OP_DISCARD:
                return true;
        default:
                return false;
        }
}

static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio)
{
        enum wbt_flags flags = 0;

        if (!rwb_enabled(rwb))
                return 0;

        if (bio_op(bio) == REQ_OP_READ) {
                flags = WBT_READ;
        } else if (wbt_should_throttle(bio)) {
                if (current_is_kswapd())
                        flags |= WBT_KSWAPD;
                if (bio_op(bio) == REQ_OP_DISCARD)
                        flags |= WBT_DISCARD;
                flags |= WBT_TRACKED;
        }
        return flags;
}

static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
{
        struct rq_wb *rwb = RQWB(rqos);
        enum wbt_flags flags = bio_to_wbt_flags(rwb, bio);

        __wbt_done(rqos, flags);
}

/*
 * May sleep, if we have exceeded the writeback limits.
 */
static void wbt_wait(struct rq_qos *rqos, struct bio *bio)
{
        struct rq_wb *rwb = RQWB(rqos);
        enum wbt_flags flags;

        flags = bio_to_wbt_flags(rwb, bio);
        if (!(flags & WBT_TRACKED)) {
                if (flags & WBT_READ)
                        wb_timestamp(rwb, &rwb->last_issue);
                return;
        }

        __wbt_wait(rwb, flags, bio->bi_opf);

        if (!blk_stat_is_active(rwb->cb))
                rwb_arm_timer(rwb);
}

static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
{
        struct rq_wb *rwb = RQWB(rqos);
        rq->wbt_flags |= bio_to_wbt_flags(rwb, bio);
}

static void wbt_issue(struct rq_qos *rqos, struct request *rq)
{
        struct rq_wb *rwb = RQWB(rqos);

        if (!rwb_enabled(rwb))
                return;

        /*
         * Track sync issue, in case it takes a long time to complete. It
         * allows us to react more quickly if a sync IO does take a long
         * time to complete. Note that this is just a hint. The request can
         * go away when it completes, so it's important we never dereference
         * it. We only use the address to compare with, which is why we
         * store the sync_issue time locally.
         */
        if (wbt_is_read(rq) && !rwb->sync_issue) {
                rwb->sync_cookie = rq;
                rwb->sync_issue = rq->io_start_time_ns;
        }
}

static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
{
        struct rq_wb *rwb = RQWB(rqos);
        if (!rwb_enabled(rwb))
                return;
        if (rq == rwb->sync_cookie) {
                rwb->sync_issue = 0;
                rwb->sync_cookie = NULL;
        }
}

void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
{
        struct rq_qos *rqos = wbt_rq_qos(q);

        if (rqos)
                RQWB(rqos)->wc = write_cache_on;
}

/*
 * Enable wbt if defaults are configured that way.
 */
void wbt_enable_default(struct request_queue *q)
{
        struct rq_qos *rqos;
        bool disable_flag = q->elevator &&
                    test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags);

        /* Throttling already enabled? */
        rqos = wbt_rq_qos(q);
        if (rqos) {
                if (!disable_flag &&
                    RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
                        RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
                return;
        }

        /* Queue not registered? Maybe shutting down... */
        if (!blk_queue_registered(q))
                return;

        if (queue_is_mq(q) && !disable_flag)
                wbt_init(q);
}
EXPORT_SYMBOL_GPL(wbt_enable_default);

u64 wbt_default_latency_nsec(struct request_queue *q)
{
        /*
         * We default to 2msec for non-rotational storage, and 75msec
         * for rotational storage.
         */
        if (blk_queue_nonrot(q))
                return 2000000ULL;
        else
                return 75000000ULL;
}

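/*
 * Editor's note: these defaults are what typically shows up as 2000 and
 * 75000 in /sys/block/<dev>/queue/wbt_lat_usec, since the sysfs knob is
 * expressed in microseconds while the constants above are nanoseconds.
 */
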
static int wbt_data_dir(const struct request *rq)
{
        const enum req_op op = req_op(rq);

        if (op == REQ_OP_READ)
                return READ;
        else if (op_is_write(op))
                return WRITE;

        /* don't account */
        return -1;
}

static void wbt_queue_depth_changed(struct rq_qos *rqos)
{
        RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->q);
        wbt_update_limits(RQWB(rqos));
}

static void wbt_exit(struct rq_qos *rqos)
{
        struct rq_wb *rwb = RQWB(rqos);
        struct request_queue *q = rqos->q;

        blk_stat_remove_callback(q, rwb->cb);
        blk_stat_free_callback(rwb->cb);
        kfree(rwb);
}

/*
 * Disable wbt, if enabled by default.
 */
void wbt_disable_default(struct request_queue *q)
{
        struct rq_qos *rqos = wbt_rq_qos(q);
        struct rq_wb *rwb;

        if (!rqos)
                return;
        rwb = RQWB(rqos);
        if (rwb->enable_state == WBT_STATE_ON_DEFAULT) {
                blk_stat_deactivate(rwb->cb);
                rwb->enable_state = WBT_STATE_OFF_DEFAULT;
        }
}
EXPORT_SYMBOL_GPL(wbt_disable_default);

#ifdef CONFIG_BLK_DEBUG_FS
static int wbt_curr_win_nsec_show(void *data, struct seq_file *m)
{
        struct rq_qos *rqos = data;
        struct rq_wb *rwb = RQWB(rqos);

        seq_printf(m, "%llu\n", rwb->cur_win_nsec);
        return 0;
}

static int wbt_enabled_show(void *data, struct seq_file *m)
{
        struct rq_qos *rqos = data;
        struct rq_wb *rwb = RQWB(rqos);

        seq_printf(m, "%d\n", rwb->enable_state);
        return 0;
}

static int wbt_id_show(void *data, struct seq_file *m)
{
        struct rq_qos *rqos = data;

        seq_printf(m, "%u\n", rqos->id);
        return 0;
}

static int wbt_inflight_show(void *data, struct seq_file *m)
{
        struct rq_qos *rqos = data;
        struct rq_wb *rwb = RQWB(rqos);
        int i;

        for (i = 0; i < WBT_NUM_RWQ; i++)
                seq_printf(m, "%d: inflight %d\n", i,
                           atomic_read(&rwb->rq_wait[i].inflight));
        return 0;
}

static int wbt_min_lat_nsec_show(void *data, struct seq_file *m)
{
        struct rq_qos *rqos = data;
        struct rq_wb *rwb = RQWB(rqos);

        seq_printf(m, "%lu\n", rwb->min_lat_nsec);
        return 0;
}

static int wbt_unknown_cnt_show(void *data, struct seq_file *m)
{
        struct rq_qos *rqos = data;
        struct rq_wb *rwb = RQWB(rqos);

        seq_printf(m, "%u\n", rwb->unknown_cnt);
        return 0;
}

static int wbt_normal_show(void *data, struct seq_file *m)
{
        struct rq_qos *rqos = data;
        struct rq_wb *rwb = RQWB(rqos);

        seq_printf(m, "%u\n", rwb->wb_normal);
        return 0;
}

static int wbt_background_show(void *data, struct seq_file *m)
{
        struct rq_qos *rqos = data;
        struct rq_wb *rwb = RQWB(rqos);

        seq_printf(m, "%u\n", rwb->wb_background);
        return 0;
}

static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = {
        {"curr_win_nsec", 0400, wbt_curr_win_nsec_show},
        {"enabled", 0400, wbt_enabled_show},
        {"id", 0400, wbt_id_show},
        {"inflight", 0400, wbt_inflight_show},
        {"min_lat_nsec", 0400, wbt_min_lat_nsec_show},
        {"unknown_cnt", 0400, wbt_unknown_cnt_show},
        {"wb_normal", 0400, wbt_normal_show},
        {"wb_background", 0400, wbt_background_show},
        {},
};
#endif

static struct rq_qos_ops wbt_rqos_ops = {
        .throttle = wbt_wait,
        .issue = wbt_issue,
        .track = wbt_track,
        .requeue = wbt_requeue,
        .done = wbt_done,
        .cleanup = wbt_cleanup,
        .queue_depth_changed = wbt_queue_depth_changed,
        .exit = wbt_exit,
#ifdef CONFIG_BLK_DEBUG_FS
        .debugfs_attrs = wbt_debugfs_attrs,
#endif
};

int wbt_init(struct request_queue *q)
{
        struct rq_wb *rwb;
        int i;
        int ret;

        rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
        if (!rwb)
                return -ENOMEM;

        rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
        if (!rwb->cb) {
                kfree(rwb);
                return -ENOMEM;
        }

        for (i = 0; i < WBT_NUM_RWQ; i++)
                rq_wait_init(&rwb->rq_wait[i]);

        rwb->rqos.id = RQ_QOS_WBT;
        rwb->rqos.ops = &wbt_rqos_ops;
        rwb->rqos.q = q;
        rwb->last_comp = rwb->last_issue = jiffies;
        rwb->win_nsec = RWB_WINDOW_NSEC;
        rwb->enable_state = WBT_STATE_ON_DEFAULT;
        rwb->wc = test_bit(QUEUE_FLAG_WC, &q->queue_flags);
        rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
        rwb->min_lat_nsec = wbt_default_latency_nsec(q);

        wbt_queue_depth_changed(&rwb->rqos);

        /*
         * Assign rwb and add the stats callback.
         */
        ret = rq_qos_add(q, &rwb->rqos);
        if (ret)
                goto err_free;

        blk_stat_add_callback(q, rwb->cb);

        return 0;

err_free:
        blk_stat_free_callback(rwb->cb);
        kfree(rwb);

        return ret;
}