#include "blk-ioprio.h"
#include "blk-throttle.h"
+static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
+
/*
* blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
* blkcg_pol_register_mutex nests outside of it and synchronizes entire
bool blkcg_debug_stats = false;
+static DEFINE_RAW_SPINLOCK(blkg_stat_lock);
+
#define BLKG_DESTROY_BATCH_SIZE 64
/*
static void __blkg_release(struct rcu_head *rcu)
{
struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
+ struct blkcg *blkcg = blkg->blkcg;
+ int cpu;
#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
WARN_ON(!bio_list_empty(&blkg->async_bios));
#endif
+ /*
+ * Flush all the non-empty percpu lockless lists before releasing
+ * us, given these stat belongs to us.
+ *
+ * blkg_stat_lock is for serializing blkg stat update
+ */
+ for_each_possible_cpu(cpu)
+ __blkcg_rstat_flush(blkcg, cpu);
/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}
-static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
{
- struct blkcg *blkcg = css_to_blkcg(css);
struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
struct llist_node *lnode;
struct blkg_iostat_set *bisc, *next_bisc;
-
- /* Root-level stats are sourced from system-wide IO stats */
- if (!cgroup_parent(css->cgroup))
- return;
+ unsigned long flags;
rcu_read_lock();
goto out;
/*
+ * For covering concurrent parent blkg update from blkg_release().
+ *
+ * When flushing from cgroup, cgroup_rstat_lock is always held, so
+ * this lock won't cause contention most of time.
+ */
+ raw_spin_lock_irqsave(&blkg_stat_lock, flags);
+
+ /*
* Iterate only the iostat_cpu's queued in the lockless list.
*/
llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
if (parent && parent->parent)
blkcg_iostat_update(parent, &blkg->iostat.cur,
&blkg->iostat.last);
- percpu_ref_put(&blkg->refcnt);
}
-
+ raw_spin_unlock_irqrestore(&blkg_stat_lock, flags);
out:
rcu_read_unlock();
}
+static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+ /* Root-level stats are sourced from system-wide IO stats */
+ if (cgroup_parent(css->cgroup))
+ __blkcg_rstat_flush(css_to_blkcg(css), cpu);
+}
+
/*
* We source root cgroup stats from the system-wide stats to avoid
* tracking the same information twice and incurring overhead when no
struct blkg_iostat_set *bis;
unsigned long flags;
+ if (!cgroup_subsys_on_dfl(io_cgrp_subsys))
+ return;
+
/* Root-level stats are sourced from system-wide IO stats */
if (!cgroup_parent(blkcg->css.cgroup))
return;
llist_add(&bis->lnode, lhead);
WRITE_ONCE(bis->lqueued, true);
- percpu_ref_get(&bis->blkg->refcnt);
}
u64_stats_update_end_irqrestore(&bis->sync, flags);
- if (cgroup_subsys_on_dfl(io_cgrp_subsys))
- cgroup_rstat_updated(blkcg->css.cgroup, cpu);
+ cgroup_rstat_updated(blkcg->css.cgroup, cpu);
put_cpu();
}
blk_crypto_free_request(rq);
blk_pm_mark_last_busy(rq);
rq->mq_hctx = NULL;
+
+ if (rq->rq_flags & RQF_MQ_INFLIGHT)
+ __blk_mq_dec_active_requests(hctx);
+
if (rq->tag != BLK_MQ_NO_TAG)
blk_mq_put_tag(hctx->tags, ctx, rq->tag);
if (sched_tag != BLK_MQ_NO_TAG)
void blk_mq_free_request(struct request *rq)
{
struct request_queue *q = rq->q;
- struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
if ((rq->rq_flags & RQF_USE_SCHED) &&
q->elevator->type->ops.finish_request)
q->elevator->type->ops.finish_request(rq);
- if (rq->rq_flags & RQF_MQ_INFLIGHT)
- __blk_mq_dec_active_requests(hctx);
-
if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
laptop_io_completion(q->disk->bdi);
if (!plug->multiple_queues && last && last->q != rq->q)
plug->multiple_queues = true;
- if (!plug->has_elevator && (rq->rq_flags & RQF_USE_SCHED))
+ /*
+ * Any request allocated from sched tags can't be issued to
+ * ->queue_rqs() directly
+ */
+ if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS))
plug->has_elevator = true;
rq->rq_next = NULL;
rq_list_add(&plug->mq_list, rq);
static inline bool rwb_enabled(struct rq_wb *rwb)
{
return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT &&
- rwb->wb_normal != 0;
+ rwb->enable_state != WBT_STATE_OFF_MANUAL;
}
static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
inflight = atomic_dec_return(&rqw->inflight);
/*
- * wbt got disabled with IO in flight. Wake up any potential
- * waiters, we don't have to do more than that.
- */
- if (unlikely(!rwb_enabled(rwb))) {
- rwb_wake_all(rwb);
- return;
- }
-
- /*
* For discards, our limit is always the background. For writes, if
* the device does write back caching, drop further down before we
* wake people up.
{
struct rq_qos *rqos = wbt_rq_qos(q);
- return !rqos || RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT ||
- RQWB(rqos)->enable_state == WBT_STATE_OFF_MANUAL;
+ return !rqos || !rwb_enabled(RQWB(rqos));
}
u64 wbt_get_min_lat(struct request_queue *q)
{
unsigned int limit;
- /*
- * If we got disabled, just return UINT_MAX. This ensures that
- * we'll properly inc a new IO, and dec+wakeup at the end.
- */
- if (!rwb_enabled(rwb))
- return UINT_MAX;
-
if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD)
return rwb->wb_background;
{
struct request_queue *q = disk->queue;
struct rq_qos *rqos;
- bool disable_flag = q->elevator &&
- test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags);
+ bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ);
+
+ if (q->elevator &&
+ test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags))
+ enable = false;
/* Throttling already enabled? */
rqos = wbt_rq_qos(q);
if (rqos) {
- if (!disable_flag &&
- RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
+ if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
return;
}
if (!blk_queue_registered(q))
return;
- if (queue_is_mq(q) && !disable_flag)
+ if (queue_is_mq(q) && enable)
wbt_init(disk);
}
EXPORT_SYMBOL_GPL(wbt_enable_default);