From 0f0601f4ea2f53cfd8bcae060fb03d9bbde070ec Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 11 Aug 2010 23:40:24 +0200 Subject: [PATCH] drbd: new configuration parameter c-min-rate We now track the data rate of locally submitted resync related requests, and can thus detect non-resync activity on the lower level device. If the current sync rate is above c-min-rate, and the lower level device appears to be busy, we throttle the resyncer. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 1 + drivers/block/drbd/drbd_main.c | 7 ++- drivers/block/drbd/drbd_nl.c | 3 +- drivers/block/drbd/drbd_receiver.c | 88 +++++++++++++++++++++++++++++++++++--- drivers/block/drbd/drbd_worker.c | 29 ++++++++----- include/linux/drbd_limits.h | 4 ++ include/linux/drbd_nl.h | 1 + 7 files changed, 116 insertions(+), 17 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 0fce3f3..0fedcc0 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1513,6 +1513,7 @@ extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int); extern void resync_timer_fn(unsigned long data); /* drbd_receiver.c */ +extern int drbd_rs_should_slow_down(struct drbd_conf *mdev); extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, const unsigned rw, const int fault_type); extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 1ff8418a..db93eee 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1098,6 +1098,8 @@ int __drbd_set_state(struct drbd_conf *mdev, mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(mdev->ov_position); mdev->rs_start = now; + mdev->rs_last_events = 0; + mdev->rs_last_sect_ev = 0; mdev->ov_last_oos_size = 0; mdev->ov_last_oos_start = 0; @@ -2706,7 +2708,8 @@ static void drbd_set_defaults(struct drbd_conf *mdev) /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF, /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF, /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF, - /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF + /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF, + /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF }; /* Have to use that way, because the layout differs between @@ -2742,6 +2745,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) atomic_set(&mdev->packet_seq, 0); atomic_set(&mdev->pp_in_use, 0); atomic_set(&mdev->rs_sect_in, 0); + atomic_set(&mdev->rs_sect_ev, 0); mutex_init(&mdev->md_io_mutex); mutex_init(&mdev->data.mutex); @@ -2819,6 +2823,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev) mdev->rs_total = mdev->rs_failed = 0; mdev->rs_last_events = 0; + mdev->rs_last_sect_ev = 0; for (i = 0; i < DRBD_SYNC_MARKS; i++) { mdev->rs_mark_left[i] = 0; mdev->rs_mark_time[i] = 0; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 295b8d5..6b35d41 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1604,7 +1604,8 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF; sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF; sc.c_fill_target = DRBD_C_FILL_TARGET_DEF; - sc.c_max_rate = DRBD_C_MAX_RATE_DEF; + sc.c_max_rate = DRBD_C_MAX_RATE_DEF; + sc.c_min_rate = DRBD_C_MIN_RATE_DEF; } else memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 346aed9..0d9967fef 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1561,6 +1561,7 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si list_add(&e->w.list, &mdev->sync_ee); spin_unlock_irq(&mdev->req_lock); + atomic_add(data_size >> 9, &mdev->rs_sect_ev); if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0) return TRUE; @@ -2017,17 +2018,66 @@ out_interrupted: return FALSE; } +/* We may throttle resync, if the lower device seems to be busy, + * and current sync rate is above c_min_rate. + * + * To decide whether or not the lower device is busy, we use a scheme similar + * to MD RAID is_mddev_idle(): if the partition stats reveal "significant" + * (more than 64 sectors) of activity we cannot account for with our own resync + * activity, it obviously is "busy". + * + * The current sync rate used here uses only the most recent two step marks, + * to have a short time average so we can react faster. + */ +int drbd_rs_should_slow_down(struct drbd_conf *mdev) +{ + struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk; + unsigned long db, dt, dbdt; + int curr_events; + int throttle = 0; + + /* feature disabled? */ + if (mdev->sync_conf.c_min_rate == 0) + return 0; + + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]) - + atomic_read(&mdev->rs_sect_ev); + if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) { + unsigned long rs_left; + int i; + + mdev->rs_last_events = curr_events; + + /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP, + * approx. */ + i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-2) % DRBD_SYNC_MARKS; + rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed; + + dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ; + if (!dt) + dt++; + db = mdev->rs_mark_left[i] - rs_left; + dbdt = Bit2KB(db/dt); + + if (dbdt > mdev->sync_conf.c_min_rate) + throttle = 1; + } + return throttle; +} + + static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) { sector_t sector; const sector_t capacity = drbd_get_capacity(mdev->this_bdev); struct drbd_epoch_entry *e; struct digest_info *di = NULL; + struct p_block_req *p = (struct p_block_req *)h; + const int brps = sizeof(*p)-sizeof(*h); int size, digest_size; unsigned int fault_type; - struct p_block_req *p = - (struct p_block_req *)h; - const int brps = sizeof(*p)-sizeof(*h); + if (drbd_recv(mdev, h->payload, brps) != brps) return FALSE; @@ -2099,8 +2149,9 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) } else if (h->command == P_OV_REPLY) { e->w.cb = w_e_end_ov_reply; dec_rs_pending(mdev); - /* drbd_rs_begin_io done when we sent this request */ - goto submit; + /* drbd_rs_begin_io done when we sent this request, + * but accounting still needs to be done. */ + goto submit_for_resync; } break; @@ -2128,9 +2179,36 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) goto out_free_e; } + /* Throttle, drbd_rs_begin_io and submit should become asynchronous + * wrt the receiver, but it is not as straightforward as it may seem. + * Various places in the resync start and stop logic assume resync + * requests are processed in order, requeuing this on the worker thread + * introduces a bunch of new code for synchronization between threads. + * + * Unlimited throttling before drbd_rs_begin_io may stall the resync + * "forever", throttling after drbd_rs_begin_io will lock that extent + * for application writes for the same time. For now, just throttle + * here, where the rest of the code expects the receiver to sleep for + * a while, anyways. + */ + + /* Throttle before drbd_rs_begin_io, as that locks out application IO; + * this defers syncer requests for some time, before letting at least + * on request through. The resync controller on the receiving side + * will adapt to the incoming rate accordingly. + * + * We cannot throttle here if remote is Primary/SyncTarget: + * we would also throttle its application reads. + * In that case, throttling is done on the SyncTarget only. + */ + if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev)) + msleep(100); if (drbd_rs_begin_io(mdev, e->sector)) goto out_free_e; +submit_for_resync: + atomic_add(size >> 9, &mdev->rs_sect_ev); + submit: inc_unacked(mdev); spin_lock_irq(&mdev->req_lock); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index f5d779b..99c937a 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -215,10 +215,8 @@ void drbd_endio_sec(struct bio *bio, int error) */ void drbd_endio_pri(struct bio *bio, int error) { - unsigned long flags; struct drbd_request *req = bio->bi_private; struct drbd_conf *mdev = req->mdev; - struct bio_and_error m; enum drbd_req_event what; int uptodate = bio_flagged(bio, BIO_UPTODATE); @@ -244,12 +242,7 @@ void drbd_endio_pri(struct bio *bio, int error) bio_put(req->private_bio); req->private_bio = ERR_PTR(error); - spin_lock_irqsave(&mdev->req_lock, flags); - __req_mod(req, what, &m); - spin_unlock_irqrestore(&mdev->req_lock, flags); - - if (m.bio) - complete_master_bio(mdev, &m); + req_mod(req, what); } int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) @@ -376,6 +369,9 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) if (!get_ldev(mdev)) return -EIO; + if (drbd_rs_should_slow_down(mdev)) + goto defer; + /* GFP_TRY, because if there is no memory available right now, this may * be rescheduled for later. It is "only" background resync, after all. */ e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); @@ -387,6 +383,7 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) list_add(&e->w.list, &mdev->read_ee); spin_unlock_irq(&mdev->req_lock); + atomic_add(size >> 9, &mdev->rs_sect_ev); if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0) return 0; @@ -512,8 +509,9 @@ int w_make_resync_request(struct drbd_conf *mdev, sector_t sector; const sector_t capacity = drbd_get_capacity(mdev->this_bdev); int max_segment_size; - int number, i, rollback_i, size, pe, mx; + int number, rollback_i, size, pe, mx; int align, queued, sndbuf; + int i = 0; if (unlikely(cancel)) return 1; @@ -549,7 +547,14 @@ int w_make_resync_request(struct drbd_conf *mdev, mdev->c_sync_rate = mdev->sync_conf.rate; number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); } - pe = atomic_read(&mdev->rs_pending_cnt); + + /* Throttle resync on lower level disk activity, which may also be + * caused by application IO on Primary/SyncTarget. + * Keep this after the call to drbd_rs_controller, as that assumes + * to be called as precisely as possible every SLEEP_TIME, + * and would be confused otherwise. */ + if (drbd_rs_should_slow_down(mdev)) + goto requeue; mutex_lock(&mdev->data.mutex); if (mdev->data.socket) @@ -563,6 +568,7 @@ int w_make_resync_request(struct drbd_conf *mdev, mx = number; /* Limit the number of pending RS requests to no more than the peer's receive buffer */ + pe = atomic_read(&mdev->rs_pending_cnt); if ((pe + number) > mx) { number = mx - pe; } @@ -1492,6 +1498,8 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) mdev->rs_failed = 0; mdev->rs_paused = 0; mdev->rs_same_csum = 0; + mdev->rs_last_events = 0; + mdev->rs_last_sect_ev = 0; mdev->rs_total = tw; mdev->rs_start = now; for (i = 0; i < DRBD_SYNC_MARKS; i++) { @@ -1516,6 +1524,7 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) } atomic_set(&mdev->rs_sect_in, 0); + atomic_set(&mdev->rs_sect_ev, 0); mdev->rs_in_flight = 0; mdev->rs_planed = 0; spin_lock(&mdev->peer_seq_lock); diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 06dbba4..0b24ded 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -150,5 +150,9 @@ #define DRBD_C_MAX_RATE_MAX (4 << 20) #define DRBD_C_MAX_RATE_DEF 102400 +#define DRBD_C_MIN_RATE_MIN 0 /* kByte/sec */ +#define DRBD_C_MIN_RATE_MAX (4 << 20) +#define DRBD_C_MIN_RATE_DEF 4096 + #undef RANGE #endif diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h index e23683c..ade9110 100644 --- a/include/linux/drbd_nl.h +++ b/include/linux/drbd_nl.h @@ -92,6 +92,7 @@ NL_PACKET(syncer_conf, 8, NL_INTEGER( 77, T_MAY_IGNORE, c_delay_target) NL_INTEGER( 78, T_MAY_IGNORE, c_fill_target) NL_INTEGER( 79, T_MAY_IGNORE, c_max_rate) + NL_INTEGER( 80, T_MAY_IGNORE, c_min_rate) ) NL_PACKET(invalidate, 9, ) -- 2.7.4