drbd: new configuration parameter c-min-rate
We now track the data rate of locally submitted resync-related requests, and can thus detect non-resync activity on the lower-level device. If the current sync rate is above c-min-rate, and the lower-level device appears to be busy, we throttle the resyncer.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
This commit is contained in:
Parent: 80a40e439e
Commit: 0f0601f4ea
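For reference, the new option is set next to the other syncer settings in drbd.conf (per resource or in the common section). The snippet below is a minimal sketch, not part of this commit: the hyphenated option spelling follows drbd.conf convention, values are in KiB/s unless a K/M/G suffix is given, and the 4096 KiB/s default comes from DRBD_C_MIN_RATE_DEF further down in this diff.

common {
        syncer {
                rate        40M;   # nominal resync rate (hypothetical value)
                c-min-rate  4M;    # throttle resync only while it is still faster than this
                                   # and the backing device shows I/O we did not cause ourselves;
                                   # 0 disables the throttling heuristic
        }
}

Setting c-min-rate to 0 keeps the old behaviour, matching the early return in drbd_rs_should_slow_down() below.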
@@ -1513,6 +1513,7 @@ extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int);
 extern void resync_timer_fn(unsigned long data);
 
 /* drbd_receiver.c */
+extern int drbd_rs_should_slow_down(struct drbd_conf *mdev);
 extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
                           const unsigned rw, const int fault_type);
 extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
@@ -1098,6 +1098,8 @@ int __drbd_set_state(struct drbd_conf *mdev,
                 mdev->ov_left = mdev->rs_total
                               - BM_SECT_TO_BIT(mdev->ov_position);
                 mdev->rs_start = now;
+                mdev->rs_last_events = 0;
+                mdev->rs_last_sect_ev = 0;
                 mdev->ov_last_oos_size = 0;
                 mdev->ov_last_oos_start = 0;
 
@@ -2706,7 +2708,8 @@ static void drbd_set_defaults(struct drbd_conf *mdev)
                 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
                 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
                 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
-                /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF
+                /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
+                /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
         };
 
         /* Have to use that way, because the layout differs between
@@ -2742,6 +2745,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
         atomic_set(&mdev->packet_seq, 0);
         atomic_set(&mdev->pp_in_use, 0);
         atomic_set(&mdev->rs_sect_in, 0);
+        atomic_set(&mdev->rs_sect_ev, 0);
 
         mutex_init(&mdev->md_io_mutex);
         mutex_init(&mdev->data.mutex);
@@ -2819,6 +2823,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
         mdev->rs_total =
         mdev->rs_failed = 0;
         mdev->rs_last_events = 0;
+        mdev->rs_last_sect_ev = 0;
         for (i = 0; i < DRBD_SYNC_MARKS; i++) {
                 mdev->rs_mark_left[i] = 0;
                 mdev->rs_mark_time[i] = 0;
@@ -1604,7 +1604,8 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
                 sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF;
                 sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF;
                 sc.c_fill_target = DRBD_C_FILL_TARGET_DEF;
                 sc.c_max_rate = DRBD_C_MAX_RATE_DEF;
+                sc.c_min_rate = DRBD_C_MIN_RATE_DEF;
         } else
                 memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
 
@@ -1561,6 +1561,7 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si
         list_add(&e->w.list, &mdev->sync_ee);
         spin_unlock_irq(&mdev->req_lock);
 
+        atomic_add(data_size >> 9, &mdev->rs_sect_ev);
         if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
                 return TRUE;
 
@@ -2017,17 +2018,66 @@ out_interrupted:
         return FALSE;
 }
 
+/* We may throttle resync, if the lower device seems to be busy,
+ * and current sync rate is above c_min_rate.
+ *
+ * To decide whether or not the lower device is busy, we use a scheme similar
+ * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
+ * (more than 64 sectors) of activity we cannot account for with our own resync
+ * activity, it obviously is "busy".
+ *
+ * The current sync rate used here uses only the most recent two step marks,
+ * to have a short time average so we can react faster.
+ */
+int drbd_rs_should_slow_down(struct drbd_conf *mdev)
+{
+        struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
+        unsigned long db, dt, dbdt;
+        int curr_events;
+        int throttle = 0;
+
+        /* feature disabled? */
+        if (mdev->sync_conf.c_min_rate == 0)
+                return 0;
+
+        curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
+                      (int)part_stat_read(&disk->part0, sectors[1]) -
+                      atomic_read(&mdev->rs_sect_ev);
+        if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
+                unsigned long rs_left;
+                int i;
+
+                mdev->rs_last_events = curr_events;
+
+                /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
+                 * approx. */
+                i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-2) % DRBD_SYNC_MARKS;
+                rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
+
+                dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
+                if (!dt)
+                        dt++;
+                db = mdev->rs_mark_left[i] - rs_left;
+                dbdt = Bit2KB(db/dt);
+
+                if (dbdt > mdev->sync_conf.c_min_rate)
+                        throttle = 1;
+        }
+        return throttle;
+}
+
 static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
 {
         sector_t sector;
         const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
         struct drbd_epoch_entry *e;
         struct digest_info *di = NULL;
+        struct p_block_req *p = (struct p_block_req *)h;
+        const int brps = sizeof(*p)-sizeof(*h);
         int size, digest_size;
         unsigned int fault_type;
-        struct p_block_req *p =
-                (struct p_block_req *)h;
-        const int brps = sizeof(*p)-sizeof(*h);
 
         if (drbd_recv(mdev, h->payload, brps) != brps)
                 return FALSE;
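To make the decision in drbd_rs_should_slow_down() above easier to follow, here is a small userspace sketch of the same two checks with hypothetical sample numbers. The function name, parameters and the BIT2KB macro are invented for illustration; only the constants (64 unaccounted sectors, 4 KiB per bitmap bit, the 4096 KiB/s default) come from this patch, and the bootstrap case (rs_last_events still zero) is omitted.

/* Illustrative model only -- not kernel code.  One bitmap bit covers
 * BM_BLOCK_SIZE (4 KiB), hence Bit2KB(x) == x * 4 in the patch above. */
#include <stdio.h>

#define BIT2KB(bits)  ((bits) * 4UL)

static int should_slow_down(unsigned long part_sectors,  /* read + write sectors of the backing device */
                            unsigned long rs_sect_ev,    /* sectors we submitted for resync ourselves */
                            unsigned long last_events,   /* snapshot taken at the previous call */
                            unsigned long bits_done,     /* bitmap bits cleared over the window */
                            unsigned long window_sec,    /* ~2 * DRBD_SYNC_MARK_STEP, in seconds */
                            unsigned long c_min_rate)    /* KiB/s; 0 disables the feature */
{
        if (c_min_rate == 0)
                return 0;

        /* "busy": more than 64 sectors of activity we cannot account for ourselves */
        long unaccounted = (long)(part_sectors - rs_sect_ev) - (long)last_events;
        if (unaccounted <= 64)
                return 0;

        /* short-time resync rate over the last two sync marks, in KiB/s */
        unsigned long dbdt = BIT2KB(bits_done / (window_sec ? window_sec : 1));
        return dbdt > c_min_rate;
}

int main(void)
{
        /* ~500 foreign sectors and ~6600 KiB/s resync rate -> throttle (prints 1) */
        printf("%d\n", should_slow_down(150500, 100000, 50000, 10000, 6, 4096));
        return 0;
}

With these sample numbers the node would throttle: about 500 sectors of activity were not caused by the resyncer itself, and the short-time resync rate is still above the c-min-rate default.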
@@ -2099,8 +2149,9 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
                 } else if (h->command == P_OV_REPLY) {
                         e->w.cb = w_e_end_ov_reply;
                         dec_rs_pending(mdev);
-                        /* drbd_rs_begin_io done when we sent this request */
-                        goto submit;
+                        /* drbd_rs_begin_io done when we sent this request,
+                         * but accounting still needs to be done. */
+                        goto submit_for_resync;
                 }
                 break;
 
@@ -2128,9 +2179,36 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
                 goto out_free_e;
         }
 
+        /* Throttle, drbd_rs_begin_io and submit should become asynchronous
+         * wrt the receiver, but it is not as straightforward as it may seem.
+         * Various places in the resync start and stop logic assume resync
+         * requests are processed in order, requeuing this on the worker thread
+         * introduces a bunch of new code for synchronization between threads.
+         *
+         * Unlimited throttling before drbd_rs_begin_io may stall the resync
+         * "forever", throttling after drbd_rs_begin_io will lock that extent
+         * for application writes for the same time. For now, just throttle
+         * here, where the rest of the code expects the receiver to sleep for
+         * a while, anyways.
+         */
+
+        /* Throttle before drbd_rs_begin_io, as that locks out application IO;
+         * this defers syncer requests for some time, before letting at least
+         * one request through. The resync controller on the receiving side
+         * will adapt to the incoming rate accordingly.
+         *
+         * We cannot throttle here if remote is Primary/SyncTarget:
+         * we would also throttle its application reads.
+         * In that case, throttling is done on the SyncTarget only.
+         */
+        if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev))
+                msleep(100);
         if (drbd_rs_begin_io(mdev, e->sector))
                 goto out_free_e;
 
+submit_for_resync:
+        atomic_add(size >> 9, &mdev->rs_sect_ev);
+
 submit:
         inc_unacked(mdev);
         spin_lock_irq(&mdev->req_lock);
@@ -215,10 +215,8 @@ void drbd_endio_sec(struct bio *bio, int error)
  */
 void drbd_endio_pri(struct bio *bio, int error)
 {
-        unsigned long flags;
         struct drbd_request *req = bio->bi_private;
         struct drbd_conf *mdev = req->mdev;
-        struct bio_and_error m;
         enum drbd_req_event what;
         int uptodate = bio_flagged(bio, BIO_UPTODATE);
 
@@ -244,12 +242,7 @@ void drbd_endio_pri(struct bio *bio, int error)
         bio_put(req->private_bio);
         req->private_bio = ERR_PTR(error);
 
-        spin_lock_irqsave(&mdev->req_lock, flags);
-        __req_mod(req, what, &m);
-        spin_unlock_irqrestore(&mdev->req_lock, flags);
-
-        if (m.bio)
-                complete_master_bio(mdev, &m);
+        req_mod(req, what);
 }
 
 int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
@@ -376,6 +369,9 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
         if (!get_ldev(mdev))
                 return -EIO;
 
+        if (drbd_rs_should_slow_down(mdev))
+                goto defer;
+
         /* GFP_TRY, because if there is no memory available right now, this may
          * be rescheduled for later. It is "only" background resync, after all. */
         e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
@@ -387,6 +383,7 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
         list_add(&e->w.list, &mdev->read_ee);
         spin_unlock_irq(&mdev->req_lock);
 
+        atomic_add(size >> 9, &mdev->rs_sect_ev);
         if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
                 return 0;
 
@@ -512,8 +509,9 @@ int w_make_resync_request(struct drbd_conf *mdev,
         sector_t sector;
         const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
         int max_segment_size;
-        int number, i, rollback_i, size, pe, mx;
+        int number, rollback_i, size, pe, mx;
         int align, queued, sndbuf;
+        int i = 0;
 
         if (unlikely(cancel))
                 return 1;
@@ -549,7 +547,14 @@ int w_make_resync_request(struct drbd_conf *mdev,
                 mdev->c_sync_rate = mdev->sync_conf.rate;
                 number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
         }
-        pe = atomic_read(&mdev->rs_pending_cnt);
+
+        /* Throttle resync on lower level disk activity, which may also be
+         * caused by application IO on Primary/SyncTarget.
+         * Keep this after the call to drbd_rs_controller, as that assumes
+         * to be called as precisely as possible every SLEEP_TIME,
+         * and would be confused otherwise. */
+        if (drbd_rs_should_slow_down(mdev))
+                goto requeue;
 
         mutex_lock(&mdev->data.mutex);
         if (mdev->data.socket)
@@ -563,6 +568,7 @@ int w_make_resync_request(struct drbd_conf *mdev,
                 mx = number;
 
         /* Limit the number of pending RS requests to no more than the peer's receive buffer */
+        pe = atomic_read(&mdev->rs_pending_cnt);
         if ((pe + number) > mx) {
                 number = mx - pe;
         }
@@ -1492,6 +1498,8 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
         mdev->rs_failed = 0;
         mdev->rs_paused = 0;
         mdev->rs_same_csum = 0;
+        mdev->rs_last_events = 0;
+        mdev->rs_last_sect_ev = 0;
         mdev->rs_total = tw;
         mdev->rs_start = now;
         for (i = 0; i < DRBD_SYNC_MARKS; i++) {
@@ -1516,6 +1524,7 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
         }
 
         atomic_set(&mdev->rs_sect_in, 0);
+        atomic_set(&mdev->rs_sect_ev, 0);
         mdev->rs_in_flight = 0;
         mdev->rs_planed = 0;
         spin_lock(&mdev->peer_seq_lock);
@@ -150,5 +150,9 @@
 #define DRBD_C_MAX_RATE_MAX     (4 << 20)
 #define DRBD_C_MAX_RATE_DEF     102400
 
+#define DRBD_C_MIN_RATE_MIN     0 /* kByte/sec */
+#define DRBD_C_MIN_RATE_MAX     (4 << 20)
+#define DRBD_C_MIN_RATE_DEF     4096
+
 #undef RANGE
 #endif
@@ -92,6 +92,7 @@ NL_PACKET(syncer_conf, 8,
         NL_INTEGER(     77,     T_MAY_IGNORE,   c_delay_target)
         NL_INTEGER(     78,     T_MAY_IGNORE,   c_fill_target)
         NL_INTEGER(     79,     T_MAY_IGNORE,   c_max_rate)
+        NL_INTEGER(     80,     T_MAY_IGNORE,   c_min_rate)
 )
 
 NL_PACKET(invalidate, 9, )