From 0cc15d03bcccdf95e2bd82e094e6064e61b54207 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 2 Jul 2012 17:27:04 -0700 Subject: [PATCH 01/17] floppy: Run floppy initialization asynchronous floppy_init is quite slow, 3s on my test system to determine that there is no floppy. Run it asynchronous to the other init calls to improve boot time. [jkosina@suse.cz: fix modular build] Signed-off-by: Andi Kleen Signed-off-by: Jiri Kosina --- drivers/block/floppy.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index cce7df367b79..1347ba8b8377 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -191,6 +191,7 @@ static int print_unex = 1; #include #include #include +#include /* * PS/2 floppies have much slower step rates than regular floppies. @@ -4122,7 +4123,7 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data) return get_disk(disks[drive]); } -static int __init floppy_init(void) +static int __init do_floppy_init(void) { int i, unit, drive; int err, dr; @@ -4337,6 +4338,24 @@ out_put_disk: return err; } +#ifndef MODULE +static __init void floppy_async_init(void *data, async_cookie_t cookie) +{ + do_floppy_init(); +} +#endif + +static int __init floppy_init(void) +{ +#ifdef MODULE + return do_floppy_init(); +#else + /* Don't hold up the bootup by the floppy initialization */ + async_schedule(floppy_async_init, NULL); + return 0; +#endif +} + static const struct io_region { int offset; int size; From d264580145a0aee2f5113c37b178a55b6e1b0b32 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 18 Jun 2012 14:18:56 +0200 Subject: [PATCH 02/17] drbd: cleanup, remove two unused global flags Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 02f013a073a7..356a6e5b4415 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -813,7 +813,6 @@ enum { SIGNAL_ASENDER, /* whether asender wants to be interrupted */ SEND_PING, /* whether asender should send a ping asap */ - UNPLUG_QUEUED, /* only relevant with kernel 2.4 */ UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ MD_DIRTY, /* current uuids and flags not yet on disk */ DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */ @@ -824,7 +823,6 @@ enum { CRASHED_PRIMARY, /* This node was a crashed primary. * Gets cleared when the state.conn * goes into C_CONNECTED state. */ - NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */ CONSIDER_RESYNC, MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ From 383606e0dea6a380097dbcb0c319b09ca372f36b Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 14 Jun 2012 14:21:32 +0200 Subject: [PATCH 03/17] drbd: differentiate between normal and forced detach Aborting local requests (not waiting for completion from the lower level disk) is dangerous: if the master bio has been completed to upper layers, data pages may be re-used for other things already. If local IO is still pending and later completes, this may cause crashes or corrupt unrelated data. Only abort local IO if explicitly requested. Intended use case is a lower level device that turned into a tarpit, not completing io requests, not even doing error completion. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_actlog.c | 2 +- drivers/block/drbd/drbd_bitmap.c | 4 ++-- drivers/block/drbd/drbd_int.h | 17 ++++++++++++++--- drivers/block/drbd/drbd_main.c | 20 ++++++++++++++++---- drivers/block/drbd/drbd_nl.c | 4 ++++ drivers/block/drbd/drbd_req.c | 6 +++--- drivers/block/drbd/drbd_worker.c | 4 ++-- 7 files changed, 42 insertions(+), 15 deletions(-) diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index e54e31b02b88..6ace11e9e5a1 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -411,7 +411,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) + mdev->ldev->md.al_offset + mdev->al_tr_pos; if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) - drbd_chk_io_error(mdev, 1, true); + drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); if (++mdev->al_tr_pos > div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index fcb956bb4b4c..ba91b408abad 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1096,7 +1096,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w if (ctx->error) { dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); - drbd_chk_io_error(mdev, 1, true); + drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); err = -EIO; /* ctx->error ? */ } @@ -1212,7 +1212,7 @@ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(loc wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); if (ctx->error) - drbd_chk_io_error(mdev, 1, true); + drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); /* that should force detach, so the in memory bitmap will be * gone in a moment as well. */ diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 356a6e5b4415..79c69ebb0653 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -832,6 +832,7 @@ enum { BITMAP_IO_QUEUED, /* Started bitmap IO */ GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */ WAS_IO_ERROR, /* Local disk failed returned IO error */ + FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */ RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ NET_CONGESTED, /* The data socket is congested */ @@ -1838,12 +1839,20 @@ static inline int drbd_request_state(struct drbd_conf *mdev, return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED); } +enum drbd_force_detach_flags { + DRBD_IO_ERROR, + DRBD_META_IO_ERROR, + DRBD_FORCE_DETACH, +}; + #define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__) -static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where) +static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, + enum drbd_force_detach_flags forcedetach, + const char *where) { switch (mdev->ldev->dc.on_io_error) { case EP_PASS_ON: - if (!forcedetach) { + if (forcedetach == DRBD_IO_ERROR) { if (__ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "Local IO failed in %s.\n", where); if (mdev->state.disk > D_INCONSISTENT) @@ -1854,6 +1863,8 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, case EP_DETACH: case EP_CALL_HELPER: set_bit(WAS_IO_ERROR, &mdev->flags); + if (forcedetach == DRBD_FORCE_DETACH) + set_bit(FORCE_DETACH, &mdev->flags); if (mdev->state.disk > D_FAILED) { _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); dev_err(DEV, @@ -1873,7 +1884,7 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, */ #define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__) static inline void drbd_chk_io_error_(struct drbd_conf *mdev, - int error, int forcedetach, const char *where) + int error, enum drbd_force_detach_flags forcedetach, const char *where) { if (error) { unsigned long flags; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 920ede2829d6..5bebe8d8ace3 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1630,9 +1630,21 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, eh = mdev->ldev->dc.on_io_error; was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); - /* Immediately allow completion of all application IO, that waits - for completion from the local disk. */ - tl_abort_disk_io(mdev); + /* Immediately allow completion of all application IO, + * that waits for completion from the local disk, + * if this was a force-detach due to disk_timeout + * or administrator request (drbdsetup detach --force). + * Do NOT abort otherwise. + * Aborting local requests may cause serious problems, + * if requests are completed to upper layers already, + * and then later the already submitted local bio completes. + * This can cause DMA into former bio pages that meanwhile + * have been re-used for other things. + * So aborting local requests may cause crashes, + * or even worse, silent data corruption. + */ + if (test_and_clear_bit(FORCE_DETACH, &mdev->flags)) + tl_abort_disk_io(mdev); /* current state still has to be D_FAILED, * there is only one way out: to D_DISKLESS, @@ -3870,7 +3882,7 @@ void drbd_md_sync(struct drbd_conf *mdev) if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { /* this was a try anyways ... */ dev_err(DEV, "meta data update failed!\n"); - drbd_chk_io_error(mdev, 1, true); + drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); } /* Update mdev->ldev->md.la_size_sect, diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 6d4de6a72e80..40a1c4f07190 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -950,6 +950,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp * to realize a "hot spare" feature (not that I'd recommend that) */ wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); + /* make sure there is no leftover from previous force-detach attempts */ + clear_bit(FORCE_DETACH, &mdev->flags); + /* allocation not in the IO path, cqueue thread context */ nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); if (!nbc) { @@ -1345,6 +1348,7 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, } if (dt.detach_force) { + set_bit(FORCE_DETACH, &mdev->flags); drbd_force_state(mdev, NS(disk, D_FAILED)); reply->ret_code = SS_SUCCESS; goto out; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 8e93a6ac9bb6..1f4b2dbb7d4a 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -455,7 +455,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; - __drbd_chk_io_error(mdev, false); + __drbd_chk_io_error(mdev, DRBD_IO_ERROR); _req_may_be_done_not_susp(req, m); break; @@ -477,7 +477,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, break; } - __drbd_chk_io_error(mdev, false); + __drbd_chk_io_error(mdev, DRBD_IO_ERROR); goto_queue_for_net_read: @@ -1275,7 +1275,7 @@ void request_timer_fn(unsigned long data) time_after(now, req->start_time + dt) && !time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) { dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n"); - __drbd_chk_io_error(mdev, 1); + __drbd_chk_io_error(mdev, DRBD_FORCE_DETACH); } nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; spin_unlock_irq(&mdev->req_lock); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 620c70ff2231..a35393f2fd1b 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -111,7 +111,7 @@ void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local) if (list_empty(&mdev->read_ee)) wake_up(&mdev->ee_wait); if (test_bit(__EE_WAS_ERROR, &e->flags)) - __drbd_chk_io_error(mdev, false); + __drbd_chk_io_error(mdev, DRBD_IO_ERROR); spin_unlock_irqrestore(&mdev->req_lock, flags); drbd_queue_work(&mdev->data.work, &e->w); @@ -154,7 +154,7 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo : list_empty(&mdev->active_ee); if (test_bit(__EE_WAS_ERROR, &e->flags)) - __drbd_chk_io_error(mdev, false); + __drbd_chk_io_error(mdev, DRBD_IO_ERROR); spin_unlock_irqrestore(&mdev->req_lock, flags); if (is_syncer_req) From c2ba686f353972cc89a006ffb6bab7ba1822271e Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 14 Jun 2012 15:14:06 +0200 Subject: [PATCH 04/17] drbd: report congestion if we are waiting for some userland callback If the drbd worker thread is synchronously waiting for some userland callback, we don't want some casual pageout to block on us. Have drbd_congested() report congestion in that case. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 7 +++++++ drivers/block/drbd/drbd_main.c | 16 ++++++++++++++++ drivers/block/drbd/drbd_nl.c | 6 ++++++ 3 files changed, 29 insertions(+) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 79c69ebb0653..5136510ec8be 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -850,6 +850,13 @@ enum { AL_SUSPENDED, /* Activity logging is currently suspended. */ AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ STATE_SENT, /* Do not change state/UUIDs while this is set */ + + CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC) + * pending, from drbd worker context. + * If set, bdi_write_congested() returns true, + * so shrink_page_list() would not recurse into, + * and potentially deadlock on, this drbd worker. + */ }; struct drbd_bitmap; /* opaque for drbd_conf */ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 5bebe8d8ace3..41ccb580d5ac 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3553,6 +3553,22 @@ static int drbd_congested(void *congested_data, int bdi_bits) goto out; } + if (test_bit(CALLBACK_PENDING, &mdev->flags)) { + r |= (1 << BDI_async_congested); + /* Without good local data, we would need to read from remote, + * and that would need the worker thread as well, which is + * currently blocked waiting for that usermode helper to + * finish. + */ + if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) + r |= (1 << BDI_sync_congested); + else + put_ldev(mdev); + r &= bdi_bits; + reason = 'c'; + goto out; + } + if (get_ldev(mdev)) { q = bdev_get_queue(mdev->ldev->backing_bdev); r = bdi_congested(&q->backing_dev_info, bdi_bits); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 40a1c4f07190..03fc853be2b9 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -147,6 +147,9 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd) char *argv[] = {usermode_helper, cmd, mb, NULL }; int ret; + if (current == mdev->worker.task) + set_bit(CALLBACK_PENDING, &mdev->flags); + snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev)); if (get_net_conf(mdev)) { @@ -189,6 +192,9 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd) usermode_helper, cmd, mb, (ret >> 8) & 0xff, ret); + if (current == mdev->worker.task) + clear_bit(CALLBACK_PENDING, &mdev->flags); + if (ret < 0) /* Ignore any ERRNOs we got. */ ret = 0; From 88437879fbd0ac5cde3f683f8eee455bbf83aaac Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 14 Jun 2012 15:19:26 +0200 Subject: [PATCH 05/17] drbd: reset congestion information before reporting it in /proc/drbd We cache the congestion status in mdev->congestion_reason whenever drbd_congested() was called. Reset this cached info before reporting it when reading /proc/drbd. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_proc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 869bada2ed06..5496104f90b9 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -245,6 +245,9 @@ static int drbd_seq_show(struct seq_file *seq, void *v) mdev->state.role == R_SECONDARY) { seq_printf(seq, "%2d: cs:Unconfigured\n", i); } else { + /* reset mdev->congestion_reason */ + bdi_rw_congested(&mdev->rq_queue->backing_dev_info); + seq_printf(seq, "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n" " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " From 0029d62434d9045bc3e8b2eb48ae696e30336e92 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 14 Jun 2012 18:02:52 +0200 Subject: [PATCH 06/17] drbd: do not reset rs_pending_cnt too early Fix asserts like block drbd0: in got_BlockAck:4634: rs_pending_cnt = -35 < 0 ! We reset the resync lru cache and related information (rs_pending_cnt), once we successfully finished a resync or online verify, or if the replication connection is lost. We also need to reset it if a resync or online verify is aborted because a lower level disk failed. In that case the replication link is still established, and we may still have packets queued in the network buffers which want to touch rs_pending_cnt. We do not have any synchronization mechanism to know for sure when all such pending resync related packets have been drained. To avoid this counter to go negative (and violate the ASSERT that it will always be >= 0), just do not reset it when we lose a disk. It is good enough to make sure it is re-initialized before the next resync can start: reset it when we re-attach a disk. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 11 +++++++---- drivers/block/drbd/drbd_nl.c | 5 +++++ drivers/block/drbd/drbd_worker.c | 8 -------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 41ccb580d5ac..91a4853d9eeb 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1514,6 +1514,13 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Do not change the order of the if above and the two below... */ if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ + /* we probably will start a resync soon. + * make sure those things are properly reset. */ + mdev->rs_total = 0; + mdev->rs_failed = 0; + atomic_set(&mdev->rs_pending_cnt, 0); + drbd_rs_cancel_all(mdev); + drbd_send_uuids(mdev); drbd_send_state(mdev, ns); } @@ -1681,10 +1688,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, "ASSERT FAILED: disk is %s while going diskless\n", drbd_disk_str(mdev->state.disk)); - mdev->rs_total = 0; - mdev->rs_failed = 0; - atomic_set(&mdev->rs_pending_cnt, 0); - if (ns.conn >= C_CONNECTED) drbd_send_state(mdev, ns); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 03fc853be2b9..a68d9bfb731c 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -959,6 +959,11 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp /* make sure there is no leftover from previous force-detach attempts */ clear_bit(FORCE_DETACH, &mdev->flags); + /* and no leftover from previously aborted resync or verify, either */ + mdev->rs_total = 0; + mdev->rs_failed = 0; + atomic_set(&mdev->rs_pending_cnt, 0); + /* allocation not in the IO path, cqueue thread context */ nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); if (!nbc) { diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index a35393f2fd1b..6bce2cc179d4 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1501,14 +1501,6 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) return; } - if (mdev->state.conn < C_AHEAD) { - /* In case a previous resync run was aborted by an IO error/detach on the peer. */ - drbd_rs_cancel_all(mdev); - /* This should be done when we abort the resync. We definitely do not - want to have this for connections going back and forth between - Ahead/Behind and SyncSource/SyncTarget */ - } - if (side == C_SYNC_TARGET) { /* Since application IO was locked out during C_WF_BITMAP_T and C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET From 63a6d0bb3dd69afedb2b2952eb1d1e8340c11d0d Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 18 Jun 2012 11:46:01 +0200 Subject: [PATCH 07/17] drbd: call local-io-error handler early In case we want to hard-reset from the local-io-error handler, we need to call it before notifying the peer or aborting local IO. Otherwise the peer will advance its data generation UUIDs even if secondary. This way, local io error looks like a "regular" node crash, which reduces the number of different failure cases. This may be useful in a bigger picture where crashed or otherwise "misbehaving" nodes are automatically re-deployed. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 91a4853d9eeb..29a276425079 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1637,6 +1637,9 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, eh = mdev->ldev->dc.on_io_error; was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); + if (was_io_error && eh == EP_CALL_HELPER) + drbd_khelper(mdev, "local-io-error"); + /* Immediately allow completion of all application IO, * that waits for completion from the local disk, * if this was a force-detach due to disk_timeout @@ -1672,9 +1675,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, drbd_md_sync(mdev); } put_ldev(mdev); - - if (was_io_error && eh == EP_CALL_HELPER) - drbd_khelper(mdev, "local-io-error"); } /* second half of local IO error, failure to attach, From c12e9c8964215aaf2b5dcd06048444c2b672f0b9 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Jun 2012 09:40:00 +0200 Subject: [PATCH 08/17] drbd: fix potential access after free Occasionally, if we disconnect, we triggered this assert: block drbd7: ASSERT FAILED tl_hash[27] == c30b0f04, expected NULL hlist_del() happens only on master bio completion. We used to wait for pending IO to complete before freeing tl_hash on disconnect. We no longer do so, since we learned to "freeze" IO on disconnect. If the local disk is too slow, we may reach C_STANDALONE early, and there are still some requests pending locally when we call drbd_free_tl_hash(). If we now free the tl_hash, and later the local IO completion completes the master bio, which then does hlist_del() and clobbers freed memory. Do hlist_del_init() and hlist_add_fake() before kfree(tl_hash), so the hlist_del() on master bio completion is harmless. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index ea4836e0ae98..83d99133f94b 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3801,11 +3801,18 @@ void drbd_free_tl_hash(struct drbd_conf *mdev) mdev->ee_hash = NULL; mdev->ee_hash_s = 0; - /* paranoia code */ - for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) - if (h->first) - dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n", - (int)(h - mdev->tl_hash), h->first); + /* We may not have had the chance to wait for all locally pending + * application requests. The hlist_add_fake() prevents access after + * free on master bio completion. */ + for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) { + struct drbd_request *req; + struct hlist_node *pos, *n; + hlist_for_each_entry_safe(req, pos, n, h, collision) { + hlist_del_init(&req->collision); + hlist_add_fake(&req->collision); + } + } + kfree(mdev->tl_hash); mdev->tl_hash = NULL; mdev->tl_hash_s = 0; From 7ee1fb93f390f7a7231abec4e34e6ab20abeed45 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Jun 2012 10:27:58 +0200 Subject: [PATCH 09/17] drbd: flush drbd work queue before invalidate/invalidate remote If you do back to back wait-sync/invalidate on a Primary in a tight loop, during application IO load, you could trigger a race: kernel: block drbd6: FIXME going to queue 'set_n_write from StartingSync' but 'write from resync_finished' still pending? Fix this by changing the order of the drbd_queue_work() and the wake_up() in dec_ap_pending(), and adding the additional drbd_flush_workqueue() before requesting the full sync. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 10 ++++++---- drivers/block/drbd/drbd_nl.c | 8 ++++++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 5136510ec8be..2704af2ccf61 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -2421,15 +2421,17 @@ static inline void dec_ap_bio(struct drbd_conf *mdev) int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt); D_ASSERT(ap_bio >= 0); + + if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { + if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) + drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); + } + /* this currently does wake_up for every dec_ap_bio! * maybe rather introduce some type of hysteresis? * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */ if (ap_bio < mxb) wake_up(&mdev->misc_wait); - if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { - if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) - drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); - } } static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index a68d9bfb731c..c47df7cf7f80 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1977,9 +1977,11 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl int retcode; /* If there is still bitmap IO pending, probably because of a previous - * resync just being finished, wait for it before requesting a new resync. */ + * resync just being finished, wait for it before requesting a new resync. + * Also wait for it's after_state_ch(). */ drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); + drbd_flush_workqueue(mdev); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); @@ -2018,9 +2020,11 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re int retcode; /* If there is still bitmap IO pending, probably because of a previous - * resync just being finished, wait for it before requesting a new resync. */ + * resync just being finished, wait for it before requesting a new resync. + * Also wait for it's after_state_ch(). */ drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); + drbd_flush_workqueue(mdev); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); From db141b2f42b485b700465fe2401fbe65c65b190c Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 25 Jun 2012 19:15:58 +0200 Subject: [PATCH 10/17] drbd: fix max_bio_size to be unsigned We capped our max_bio_size respectively max_hw_sectors with min_t(int, lower level limit, our limit); unfortunately, some drivers, e.g. the kvm virtio block driver, initialize their limits to "-1U", and that is of course a smaller "int" value than our limit. Impact: we started to request 16 MB resync requests, which lead to protocol error and a reconnect loop. Fix all relevant constants and parameters to be unsigned int. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 8 ++++---- drivers/block/drbd/drbd_main.c | 11 ++++++----- drivers/block/drbd/drbd_nl.c | 13 +++++++------ 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 2704af2ccf61..b2ca143d0053 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1136,8 +1136,8 @@ struct drbd_conf { int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ int rs_planed; /* resync sectors already planned */ atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ - int peer_max_bio_size; - int local_max_bio_size; + unsigned int peer_max_bio_size; + unsigned int local_max_bio_size; }; static inline struct drbd_conf *minor_to_mdev(unsigned int minor) @@ -1441,9 +1441,9 @@ struct bm_extent { * hash table. */ #define HT_SHIFT 8 #define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT)) -#define DRBD_MAX_BIO_SIZE_SAFE (1 << 12) /* Works always = 4k */ +#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */ -#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */ +#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* The old header only allows packets up to 32Kib data */ /* Number of elements in the app_reads_hash */ #define APP_R_HSIZE 15 diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 29a276425079..1ee1404769c2 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2209,7 +2209,8 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl { struct p_sizes p; sector_t d_size, u_size; - int q_order_type, max_bio_size; + int q_order_type; + unsigned int max_bio_size; int ok; if (get_ldev_if_state(mdev, D_NEGOTIATING)) { @@ -2218,7 +2219,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl u_size = mdev->ldev->dc.disk_size; q_order_type = drbd_queue_order_type(mdev); max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9; - max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE); + max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE); put_ldev(mdev); } else { d_size = 0; @@ -2229,7 +2230,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */ if (mdev->agreed_pro_version <= 94) - max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET); p.d_size = cpu_to_be64(d_size); p.u_size = cpu_to_be64(u_size); @@ -3981,9 +3982,9 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) spin_lock_irq(&mdev->req_lock); if (mdev->state.conn < C_CONNECTED) { - int peer; + unsigned int peer; peer = be32_to_cpu(buffer->la_peer_max_bio_size); - peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE); + peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE); mdev->peer_max_bio_size = peer; } spin_unlock_irq(&mdev->req_lock); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index c47df7cf7f80..fb9dce8daa24 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -801,8 +801,8 @@ static int drbd_check_al_size(struct drbd_conf *mdev) static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size) { struct request_queue * const q = mdev->rq_queue; - int max_hw_sectors = max_bio_size >> 9; - int max_segments = 0; + unsigned int max_hw_sectors = max_bio_size >> 9; + unsigned int max_segments = 0; if (get_ldev_if_state(mdev, D_ATTACHING)) { struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; @@ -835,7 +835,7 @@ static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) { - int now, new, local, peer; + unsigned int now, new, local, peer; now = queue_max_hw_sectors(mdev->rq_queue) << 9; local = mdev->local_max_bio_size; /* Eventually last known value, from volatile memory */ @@ -846,13 +846,14 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) mdev->local_max_bio_size = local; put_ldev(mdev); } + local = min(local, DRBD_MAX_BIO_SIZE); /* We may ignore peer limits if the peer is modern enough. Because new from 8.3.8 onwards the peer can use multiple BIOs for a single peer_request */ if (mdev->state.conn >= C_CONNECTED) { if (mdev->agreed_pro_version < 94) { - peer = min_t(int, mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ } else if (mdev->agreed_pro_version == 94) peer = DRBD_MAX_SIZE_H80_PACKET; @@ -860,10 +861,10 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) peer = DRBD_MAX_BIO_SIZE; } - new = min_t(int, local, peer); + new = min(local, peer); if (mdev->state.role == R_PRIMARY && new < now) - dev_err(DEV, "ASSERT FAILED new < now; (%d < %d)\n", new, now); + dev_err(DEV, "ASSERT FAILED new < now; (%u < %u)\n", new, now); if (new != now) dev_info(DEV, "max BIO size = %u\n", new); From a73ff3231df59a4b92ccd0dd4e73897c5822489b Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 25 Jun 2012 19:15:38 +0200 Subject: [PATCH 11/17] drbd: announce FLUSH/FUA capability to upper layers Unconditionally announce FLUSH/FUA to upper layers. If the lower layers on either node do not actually support this, generic_make_request() will deal with it. If this causes performance regressions on your setup, make sure there are no volatile caches involved, and mount -o nobarrier or equivalent. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_actlog.c | 6 +++++- drivers/block/drbd/drbd_main.c | 1 + drivers/block/drbd/drbd_receiver.c | 21 ++++++++++++++++----- drivers/block/drbd/drbd_req.c | 3 +-- 4 files changed, 23 insertions(+), 8 deletions(-) diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 6ace11e9e5a1..3fbef018ce55 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -876,7 +876,11 @@ int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, unsigned int enr, count = 0; struct lc_element *e; - if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { + /* this should be an empty REQ_FLUSH */ + if (size == 0) + return 0; + + if (size < 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { dev_err(DEV, "sector: %llus, size: %d\n", (unsigned long long)sector, size); return 0; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 1ee1404769c2..2e0e7fc1dbba 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3636,6 +3636,7 @@ struct drbd_conf *drbd_new_device(unsigned int minor) q->backing_dev_info.congested_data = mdev; blk_queue_make_request(q, drbd_make_request); + blk_queue_flush(q, REQ_FLUSH | REQ_FUA); /* Setting the max_hw_sectors to an odd value of 8kibyte here This triggers a max_bio_size message upon first attach or connect */ blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 83d99133f94b..c74ca2df7431 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -277,6 +277,9 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net) atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use; int i; + if (page == NULL) + return; + if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE)*minor_count) i = page_chain_free(page); else { @@ -316,7 +319,7 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, gfp_t gfp_mask) __must_hold(local) { struct drbd_epoch_entry *e; - struct page *page; + struct page *page = NULL; unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE)) @@ -329,9 +332,11 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, return NULL; } - page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT)); - if (!page) - goto fail; + if (data_size) { + page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT)); + if (!page) + goto fail; + } INIT_HLIST_NODE(&e->collision); e->epoch = NULL; @@ -1270,7 +1275,6 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __ data_size -= dgs; - ERR_IF(data_size == 0) return NULL; ERR_IF(data_size & 0x1ff) return NULL; ERR_IF(data_size > DRBD_MAX_BIO_SIZE) return NULL; @@ -1291,6 +1295,9 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __ if (!e) return NULL; + if (!data_size) + return e; + ds = data_size; page = e->pages; page_chain_for_each(page) { @@ -1715,6 +1722,10 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned dp_flags = be32_to_cpu(p->dp_flags); rw |= wire_flags_to_bio(mdev, dp_flags); + if (e->pages == NULL) { + D_ASSERT(e->size == 0); + D_ASSERT(dp_flags & DP_FLUSH); + } if (dp_flags & DP_MAY_SET_IN_SYNC) e->flags |= EE_MAY_SET_IN_SYNC; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 1f4b2dbb7d4a..910335c30927 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1111,13 +1111,12 @@ void drbd_make_request(struct request_queue *q, struct bio *bio) /* * what we "blindly" assume: */ - D_ASSERT(bio->bi_size > 0); D_ASSERT((bio->bi_size & 0x1ff) == 0); /* to make some things easier, force alignment of requests within the * granularity of our hash tables */ s_enr = bio->bi_sector >> HT_SHIFT; - e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT; + e_enr = bio->bi_size ? (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT : s_enr; if (likely(s_enr == e_enr)) { do { From 01ff5dbc0925d11c8ad76eed3bdd02d0c7e1e0f5 Mon Sep 17 00:00:00 2001 From: Chetan Loke Date: Tue, 31 Jul 2012 08:47:13 +0200 Subject: [PATCH 12/17] block/nbd: micro-optimization in nbd request completion Add in-flight cmds to the tail. That way while searching (during request completion),we will always get a hit on the first element. Signed-off-by: Chetan Loke Acked-by: Paul.Clements@steeleye.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 061427a75d37..8957b9f0cfad 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -481,7 +481,7 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req) nbd_end_request(req); } else { spin_lock(&nbd->queue_lock); - list_add(&req->queuelist, &nbd->queue_head); + list_add_tail(&req->queuelist, &nbd->queue_head); spin_unlock(&nbd->queue_lock); } From 0021b7bc045e4b0b85d8c53614342aaf84ca96a5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Jul 2012 09:08:14 +0200 Subject: [PATCH 13/17] md: remove plug_cnt feature of plugging. This seemed like a good idea at the time, but after further thought I cannot see it making a difference other than very occasionally and testing to try to exercise the case it is most likely to help did not show any performance difference by removing it. So remove the counting of active plugs and allow 'pending writes' to be activated at any time, not just when no plugs are active. This is only relevant when there is a write-intent bitmap, and the updating of the bitmap will likely introduce enough delay that the single-threading of bitmap updates will be enough to collect large numbers of updates together. Removing this will make it easier to centralise the unplug code, and will clear the other for other unplug enhancements which have a measurable effect. Signed-off-by: NeilBrown Signed-off-by: Jens Axboe --- drivers/md/md.c | 5 +---- drivers/md/md.h | 3 --- drivers/md/raid1.c | 3 +-- drivers/md/raid10.c | 3 +-- drivers/md/raid5.c | 5 ++--- 5 files changed, 5 insertions(+), 14 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index d5ab4493c8be..34381172a947 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -514,8 +514,7 @@ struct md_plug_cb { static void plugger_unplug(struct blk_plug_cb *cb) { struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb); - if (atomic_dec_and_test(&mdcb->mddev->plug_cnt)) - md_wakeup_thread(mdcb->mddev->thread); + md_wakeup_thread(mdcb->mddev->thread); kfree(mdcb); } @@ -548,7 +547,6 @@ int mddev_check_plugged(struct mddev *mddev) mdcb->mddev = mddev; mdcb->cb.callback = plugger_unplug; - atomic_inc(&mddev->plug_cnt); list_add(&mdcb->cb.list, &plug->cb_list); return 1; } @@ -602,7 +600,6 @@ void mddev_init(struct mddev *mddev) atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); atomic_set(&mddev->active_io, 0); - atomic_set(&mddev->plug_cnt, 0); spin_lock_init(&mddev->write_lock); atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); diff --git a/drivers/md/md.h b/drivers/md/md.h index 7b4a3c318cae..91786c46b85c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -266,9 +266,6 @@ struct mddev { int new_chunk_sectors; int reshape_backwards; - atomic_t plug_cnt; /* If device is expecting - * more bios soon. - */ struct md_thread *thread; /* management thread */ struct md_thread *sync_thread; /* doing resync or reconstruct */ sector_t curr_resync; /* last block scheduled */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index cacd008d6864..36a8fc059ac3 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2173,8 +2173,7 @@ static void raid1d(struct mddev *mddev) blk_start_plug(&plug); for (;;) { - if (atomic_read(&mddev->plug_cnt) == 0) - flush_pending_writes(conf); + flush_pending_writes(conf); spin_lock_irqsave(&conf->device_lock, flags); if (list_empty(head)) { diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 8da6282254c3..5d33603a497d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2660,8 +2660,7 @@ static void raid10d(struct mddev *mddev) blk_start_plug(&plug); for (;;) { - if (atomic_read(&mddev->plug_cnt) == 0) - flush_pending_writes(conf); + flush_pending_writes(conf); spin_lock_irqsave(&conf->device_lock, flags); if (list_empty(head)) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 04348d76bb30..bde9da2baa39 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4543,7 +4543,7 @@ static void raid5d(struct mddev *mddev) while (1) { struct bio *bio; - if (atomic_read(&mddev->plug_cnt) == 0 && + if ( !list_empty(&conf->bitmap_list)) { /* Now is a good time to flush some bitmap updates */ conf->seq_flush++; @@ -4553,8 +4553,7 @@ static void raid5d(struct mddev *mddev) conf->seq_write = conf->seq_flush; activate_bit_delay(conf); } - if (atomic_read(&mddev->plug_cnt) == 0) - raid5_activate_delayed(conf); + raid5_activate_delayed(conf); while ((bio = remove_bio_from_retry(conf))) { int ok; From 9cbb17508808f8a6bdd83354b61e126ac4fa6fed Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Jul 2012 09:08:14 +0200 Subject: [PATCH 14/17] blk: centralize non-request unplug handling. Both md and umem has similar code for getting notified on an blk_finish_plug event. Centralize this code in block/ and allow each driver to provide its distinctive difference. Signed-off-by: NeilBrown Signed-off-by: Jens Axboe --- block/blk-core.c | 25 +++++++++++++++++++ drivers/block/umem.c | 35 +++++--------------------- drivers/md/md.c | 56 ++++-------------------------------------- drivers/md/md.h | 8 +++++- include/linux/blkdev.h | 8 ++++-- 5 files changed, 49 insertions(+), 83 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index dd134d834d58..177ddcf356e6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2927,6 +2927,31 @@ static void flush_plug_callbacks(struct blk_plug *plug) } } +struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, + int size) +{ + struct blk_plug *plug = current->plug; + struct blk_plug_cb *cb; + + if (!plug) + return NULL; + + list_for_each_entry(cb, &plug->cb_list, list) + if (cb->callback == unplug && cb->data == data) + return cb; + + /* Not currently on the callback list */ + BUG_ON(size < sizeof(*cb)); + cb = kzalloc(size, GFP_ATOMIC); + if (cb) { + cb->data = data; + cb->callback = unplug; + list_add(&cb->list, &plug->cb_list); + } + return cb; +} +EXPORT_SYMBOL(blk_check_plugged); + void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request_queue *q; diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 9a72277a31df..6ef3489568e3 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -513,42 +513,19 @@ static void process_page(unsigned long data) } } -struct mm_plug_cb { - struct blk_plug_cb cb; - struct cardinfo *card; -}; - static void mm_unplug(struct blk_plug_cb *cb) { - struct mm_plug_cb *mmcb = container_of(cb, struct mm_plug_cb, cb); + struct cardinfo *card = cb->data; - spin_lock_irq(&mmcb->card->lock); - activate(mmcb->card); - spin_unlock_irq(&mmcb->card->lock); - kfree(mmcb); + spin_lock_irq(&card->lock); + activate(card); + spin_unlock_irq(&card->lock); + kfree(cb); } static int mm_check_plugged(struct cardinfo *card) { - struct blk_plug *plug = current->plug; - struct mm_plug_cb *mmcb; - - if (!plug) - return 0; - - list_for_each_entry(mmcb, &plug->cb_list, cb.list) { - if (mmcb->cb.callback == mm_unplug && mmcb->card == card) - return 1; - } - /* Not currently on the callback list */ - mmcb = kmalloc(sizeof(*mmcb), GFP_ATOMIC); - if (!mmcb) - return 0; - - mmcb->card = card; - mmcb->cb.callback = mm_unplug; - list_add(&mmcb->cb.list, &plug->cb_list); - return 1; + return !!blk_check_plugged(mm_unplug, card, sizeof(struct blk_plug_cb)); } static void mm_make_request(struct request_queue *q, struct bio *bio) diff --git a/drivers/md/md.c b/drivers/md/md.c index 34381172a947..b493fa417387 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -498,59 +498,13 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) } EXPORT_SYMBOL(md_flush_request); -/* Support for plugging. - * This mirrors the plugging support in request_queue, but does not - * require having a whole queue or request structures. - * We allocate an md_plug_cb for each md device and each thread it gets - * plugged on. This links tot the private plug_handle structure in the - * personality data where we keep a count of the number of outstanding - * plugs so other code can see if a plug is active. - */ -struct md_plug_cb { - struct blk_plug_cb cb; - struct mddev *mddev; -}; - -static void plugger_unplug(struct blk_plug_cb *cb) +void md_unplug(struct blk_plug_cb *cb) { - struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb); - md_wakeup_thread(mdcb->mddev->thread); - kfree(mdcb); + struct mddev *mddev = cb->data; + md_wakeup_thread(mddev->thread); + kfree(cb); } - -/* Check that an unplug wakeup will come shortly. - * If not, wakeup the md thread immediately - */ -int mddev_check_plugged(struct mddev *mddev) -{ - struct blk_plug *plug = current->plug; - struct md_plug_cb *mdcb; - - if (!plug) - return 0; - - list_for_each_entry(mdcb, &plug->cb_list, cb.list) { - if (mdcb->cb.callback == plugger_unplug && - mdcb->mddev == mddev) { - /* Already on the list, move to top */ - if (mdcb != list_first_entry(&plug->cb_list, - struct md_plug_cb, - cb.list)) - list_move(&mdcb->cb.list, &plug->cb_list); - return 1; - } - } - /* Not currently on the callback list */ - mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC); - if (!mdcb) - return 0; - - mdcb->mddev = mddev; - mdcb->cb.callback = plugger_unplug; - list_add(&mdcb->cb.list, &plug->cb_list); - return 1; -} -EXPORT_SYMBOL_GPL(mddev_check_plugged); +EXPORT_SYMBOL(md_unplug); static inline struct mddev *mddev_get(struct mddev *mddev) { diff --git a/drivers/md/md.h b/drivers/md/md.h index 91786c46b85c..8f998e08fb87 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -627,6 +627,12 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, struct mddev *mddev); extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); -extern int mddev_check_plugged(struct mddev *mddev); extern void md_trim_bio(struct bio *bio, int offset, int size); + +extern void md_unplug(struct blk_plug_cb *cb); +static inline int mddev_check_plugged(struct mddev *mddev) +{ + return !!blk_check_plugged(md_unplug, mddev, + sizeof(struct blk_plug_cb)); +} #endif /* _MD_MD_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3816ce8a08fc..607ca228f47e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -922,11 +922,15 @@ struct blk_plug { }; #define BLK_MAX_REQUEST_COUNT 16 +struct blk_plug_cb; +typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *); struct blk_plug_cb { struct list_head list; - void (*callback)(struct blk_plug_cb *); + blk_plug_cb_fn callback; + void *data; }; - +extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, + void *data, int size); extern void blk_start_plug(struct blk_plug *); extern void blk_finish_plug(struct blk_plug *); extern void blk_flush_plug_list(struct blk_plug *, bool); From 2a7d5559b346574057dce4672d1ed9aaa9d1e685 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 31 Jul 2012 09:08:15 +0200 Subject: [PATCH 15/17] block: stack unplug MD raid1 prepares to dispatch request in unplug callback. If make_request in low level queue also uses unplug callback to dispatch request, the low level queue's unplug callback will not be called. Recheck the callback list helps this case. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown Signed-off-by: Jens Axboe --- block/blk-core.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 177ddcf356e6..35bf4fe8234c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2913,17 +2913,16 @@ static void flush_plug_callbacks(struct blk_plug *plug) { LIST_HEAD(callbacks); - if (list_empty(&plug->cb_list)) - return; + while (!list_empty(&plug->cb_list)) { + list_splice_init(&plug->cb_list, &callbacks); - list_splice_init(&plug->cb_list, &callbacks); - - while (!list_empty(&callbacks)) { - struct blk_plug_cb *cb = list_first_entry(&callbacks, + while (!list_empty(&callbacks)) { + struct blk_plug_cb *cb = list_first_entry(&callbacks, struct blk_plug_cb, list); - list_del(&cb->list); - cb->callback(cb); + list_del(&cb->list); + cb->callback(cb); + } } } From 74018dc3063a2c729fc73041c0a9f03aac995920 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Jul 2012 09:08:15 +0200 Subject: [PATCH 16/17] blk: pass from_schedule to non-request unplug functions. This will allow md/raid to know why the unplug was called, and will be able to act according - if !from_schedule it is safe to perform tasks which could themselves schedule. Signed-off-by: NeilBrown Signed-off-by: Jens Axboe --- block/blk-core.c | 6 +++--- drivers/block/umem.c | 2 +- drivers/md/md.c | 2 +- drivers/md/md.h | 2 +- include/linux/blkdev.h | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 35bf4fe8234c..4b4dbdfbca89 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2909,7 +2909,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth, } -static void flush_plug_callbacks(struct blk_plug *plug) +static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) { LIST_HEAD(callbacks); @@ -2921,7 +2921,7 @@ static void flush_plug_callbacks(struct blk_plug *plug) struct blk_plug_cb, list); list_del(&cb->list); - cb->callback(cb); + cb->callback(cb, from_schedule); } } } @@ -2961,7 +2961,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) BUG_ON(plug->magic != PLUG_MAGIC); - flush_plug_callbacks(plug); + flush_plug_callbacks(plug, from_schedule); if (list_empty(&plug->list)) return; diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 6ef3489568e3..eb0d8216f557 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -513,7 +513,7 @@ static void process_page(unsigned long data) } } -static void mm_unplug(struct blk_plug_cb *cb) +static void mm_unplug(struct blk_plug_cb *cb, bool from_schedule) { struct cardinfo *card = cb->data; diff --git a/drivers/md/md.c b/drivers/md/md.c index b493fa417387..db02d2efb76f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -498,7 +498,7 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) } EXPORT_SYMBOL(md_flush_request); -void md_unplug(struct blk_plug_cb *cb) +void md_unplug(struct blk_plug_cb *cb, bool from_schedule) { struct mddev *mddev = cb->data; md_wakeup_thread(mddev->thread); diff --git a/drivers/md/md.h b/drivers/md/md.h index 8f998e08fb87..f385b038589d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -629,7 +629,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); extern void md_trim_bio(struct bio *bio, int offset, int size); -extern void md_unplug(struct blk_plug_cb *cb); +extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); static inline int mddev_check_plugged(struct mddev *mddev) { return !!blk_check_plugged(md_unplug, mddev, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 607ca228f47e..4e72a9d48232 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -923,7 +923,7 @@ struct blk_plug { #define BLK_MAX_REQUEST_COUNT 16 struct blk_plug_cb; -typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *); +typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool); struct blk_plug_cb { struct list_head list; blk_plug_cb_fn callback; From 2fb2ca6f5b953192d5a94fcc778d9abba06462df Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Sat, 28 Jul 2012 19:45:59 +0800 Subject: [PATCH 17/17] floppy: remove duplicated flag FD_RAW_NEED_DISK Fix coccinelle warning (without behavior change): drivers/block/floppy.c:2518:32-48: duplicated argument to & or | Signed-off-by: Fengguang Wu Signed-off-by: Jiri Kosina --- drivers/block/floppy.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 1347ba8b8377..9d6ef68e2f1b 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2516,8 +2516,7 @@ static int make_raw_rw_request(void) set_fdc((long)current_req->rq_disk->private_data); raw_cmd = &default_raw_cmd; - raw_cmd->flags = FD_RAW_SPIN | FD_RAW_NEED_DISK | FD_RAW_NEED_DISK | - FD_RAW_NEED_SEEK; + raw_cmd->flags = FD_RAW_SPIN | FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK; raw_cmd->cmd_count = NR_RW; if (rq_data_dir(current_req) == READ) { raw_cmd->flags |= FD_RAW_READ;