diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6f3dd12dd3a4..544e1600f208 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -605,7 +605,11 @@ static void ops_complete_compute5(void *stripe_head_ref) set_bit(R5_UPTODATE, &tgt->flags); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); clear_bit(R5_Wantcompute, &tgt->flags); - set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); + clear_bit(STRIPE_COMPUTE_RUN, &sh->state); + if (sh->check_state == check_state_compute_run) + sh->check_state = check_state_compute_result; + else + set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } @@ -838,7 +842,7 @@ static void ops_complete_check(void *stripe_head_ref) pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - set_bit(STRIPE_OP_CHECK, &sh->ops.complete); + sh->check_state = check_state_check_result; set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } @@ -870,7 +874,8 @@ static void ops_run_check(struct stripe_head *sh) ops_complete_check, sh); } -static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) +static void raid5_run_ops(struct stripe_head *sh, unsigned long pending, + unsigned long ops_request) { int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; @@ -880,7 +885,8 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) overlap_clear++; } - if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending)) + if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending) || + test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) tx = ops_run_compute5(sh, pending); if (test_bit(STRIPE_OP_PREXOR, &pending)) @@ -894,7 +900,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) if (test_bit(STRIPE_OP_POSTXOR, &pending)) ops_run_postxor(sh, tx, pending); - if (test_bit(STRIPE_OP_CHECK, &pending)) + if (test_bit(STRIPE_OP_CHECK, &ops_request)) ops_run_check(sh); if (overlap_clear) @@ -1961,8 +1967,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, /* don't schedule compute operations or reads on the parity block while * a check is in flight */ - if ((disk_idx == sh->pd_idx) && - test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) + if (disk_idx == sh->pd_idx && sh->check_state) return ~0; /* is the data in this block needed, and can we get it? */ @@ -1983,9 +1988,8 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, * 3/ We hold off parity block re-reads until check operations * have quiesced. */ - if ((s->uptodate == disks - 1) && - (s->failed && disk_idx == s->failed_num) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { + if ((s->uptodate == disks - 1) && !sh->check_state && + (s->failed && disk_idx == s->failed_num)) { set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); set_bit(R5_Wantcompute, &dev->flags); sh->ops.target = disk_idx; @@ -2021,12 +2025,8 @@ static void handle_issuing_new_read_requests5(struct stripe_head *sh, { int i; - /* Clear completed compute operations. Parity recovery - * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled - * later on in this routine - */ - if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && - !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + /* Clear completed compute operations */ + if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete)) { clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); @@ -2350,90 +2350,85 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { - int canceled_check = 0; + struct r5dev *dev = NULL; set_bit(STRIPE_HANDLE, &sh->state); - /* complete a check operation */ - if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { - clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); - clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); + switch (sh->check_state) { + case check_state_idle: + /* start a new check operation if there are no failures */ if (s->failed == 0) { - if (sh->ops.zero_sum_result == 0) - /* parity is correct (on disc, - * not in buffer any more) - */ - set_bit(STRIPE_INSYNC, &sh->state); - else { - conf->mddev->resync_mismatches += - STRIPE_SECTORS; - if (test_bit( - MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! */ - set_bit(STRIPE_INSYNC, &sh->state); - else { - set_bit(STRIPE_OP_COMPUTE_BLK, - &sh->ops.pending); - set_bit(STRIPE_OP_MOD_REPAIR_PD, - &sh->ops.pending); - set_bit(R5_Wantcompute, - &sh->dev[sh->pd_idx].flags); - sh->ops.target = sh->pd_idx; - sh->ops.count++; - s->uptodate++; - } - } - } else - canceled_check = 1; /* STRIPE_INSYNC is not set */ - } - - /* start a new check operation if there are no failures, the stripe is - * not insync, and a repair is not in flight - */ - if (s->failed == 0 && - !test_bit(STRIPE_INSYNC, &sh->state) && - !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { BUG_ON(s->uptodate != disks); + sh->check_state = check_state_run; + set_bit(STRIPE_OP_CHECK, &s->ops_request); clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); - sh->ops.count++; s->uptodate--; + break; } - } - - /* check if we can clear a parity disk reconstruct */ - if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && - test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - - clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); - } - - - /* Wait for check parity and compute block operations to complete - * before write-back. If a failure occurred while the check operation - * was in flight we need to cycle this stripe through handle_stripe - * since the parity block may not be uptodate - */ - if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { - struct r5dev *dev; - /* either failed parity check, or recovery is happening */ - if (s->failed == 0) - s->failed_num = sh->pd_idx; dev = &sh->dev[s->failed_num]; + /* fall through */ + case check_state_compute_result: + sh->check_state = check_state_idle; + if (!dev) + dev = &sh->dev[sh->pd_idx]; + + /* check that a write has not made the stripe insync */ + if (test_bit(STRIPE_INSYNC, &sh->state)) + break; + + /* either failed parity check, or recovery is happening */ BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); BUG_ON(s->uptodate != disks); set_bit(R5_LOCKED, &dev->flags); + s->locked++; set_bit(R5_Wantwrite, &dev->flags); clear_bit(STRIPE_DEGRADED, &sh->state); - s->locked++; set_bit(STRIPE_INSYNC, &sh->state); + break; + case check_state_run: + break; /* we will be called again upon completion */ + case check_state_check_result: + sh->check_state = check_state_idle; + + /* if a failure occurred during the check operation, leave + * STRIPE_INSYNC not set and let the stripe be handled again + */ + if (s->failed) + break; + + /* handle a successful check operation, if parity is correct + * we are done. Otherwise update the mismatch count and repair + * parity if !MD_RECOVERY_CHECK + */ + if (sh->ops.zero_sum_result == 0) + /* parity is correct (on disc, + * not in buffer any more) + */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + sh->check_state = check_state_compute_run; + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, + &sh->dev[sh->pd_idx].flags); + sh->ops.target = sh->pd_idx; + s->uptodate++; + } + } + break; + case check_state_compute_run: + break; + default: + printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", + __func__, sh->check_state, + (unsigned long long) sh->sector); + BUG(); } } @@ -2807,7 +2802,7 @@ static void handle_stripe5(struct stripe_head *sh) * block. */ if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) + !sh->check_state) handle_issuing_new_write_requests5(conf, sh, &s, disks); /* maybe we need to check and possibly fix the parity for this stripe @@ -2815,11 +2810,10 @@ static void handle_stripe5(struct stripe_head *sh) * data is available. The parity check is held off while parity * dependent operations are in flight. */ - if ((s.syncing && s.locked == 0 && + if (sh->check_state || + (s.syncing && s.locked == 0 && !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && - !test_bit(STRIPE_INSYNC, &sh->state)) || - test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || - test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) + !test_bit(STRIPE_INSYNC, &sh->state))) handle_parity_checks5(conf, sh, &s, disks); if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { @@ -2897,8 +2891,8 @@ static void handle_stripe5(struct stripe_head *sh) if (unlikely(blocked_rdev)) md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); - if (pending) - raid5_run_ops(sh, pending); + if (pending || s.ops_request) + raid5_run_ops(sh, pending, s.ops_request); ops_run_io(sh, &s); diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 1301195abf4b..2c96d5fd54bf 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -158,6 +158,41 @@ * the compute block completes. */ +/* + * Operations state - intermediate states that are visible outside of sh->lock + * In general _idle indicates nothing is running, _run indicates a data + * processing operation is active, and _result means the data processing result + * is stable and can be acted upon. For simple operations like biofill and + * compute that only have an _idle and _run state they are indicated with + * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN) + */ +/** + * enum check_states - handles syncing / repairing a stripe + * @check_state_idle - check operations are quiesced + * @check_state_run - check operation is running + * @check_state_result - set outside lock when check result is valid + * @check_state_compute_run - check failed and we are repairing + * @check_state_compute_result - set outside lock when compute result is valid + */ +enum check_states { + check_state_idle = 0, + check_state_run, /* parity check */ + check_state_check_result, + check_state_compute_run, /* parity repair */ + check_state_compute_result, +}; + +/** + * enum reconstruct_states - handles writing or expanding a stripe + */ +enum reconstruct_states { + reconstruct_state_idle = 0, + reconstruct_state_drain_run, /* write */ + reconstruct_state_run, /* expand */ + reconstruct_state_drain_result, + reconstruct_state_result, +}; + struct stripe_head { struct hlist_node hash; struct list_head lru; /* inactive_list or handle_list */ @@ -169,6 +204,7 @@ struct stripe_head { spinlock_t lock; int bm_seq; /* sequence number for bitmap flushes */ int disks; /* disks in stripe */ + enum check_states check_state; /* stripe_operations * @pending - pending ops flags (set for request->issue->complete) * @ack - submitted ops flags (set for issue->complete) @@ -202,6 +238,7 @@ struct stripe_head_state { int locked, uptodate, to_read, to_write, failed, written; int to_fill, compute, req_compute, non_overwrite; int failed_num; + unsigned long ops_request; }; /* r6_state - extra state data only relevant to r6 */ @@ -254,8 +291,10 @@ struct r6_state { #define STRIPE_EXPAND_READY 11 #define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ #define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ +#define STRIPE_BIOFILL_RUN 14 +#define STRIPE_COMPUTE_RUN 15 /* - * Operations flags (in issue order) + * Operation request flags */ #define STRIPE_OP_BIOFILL 0 #define STRIPE_OP_COMPUTE_BLK 1 @@ -264,11 +303,6 @@ struct r6_state { #define STRIPE_OP_POSTXOR 4 #define STRIPE_OP_CHECK 5 -/* modifiers to the base operations - * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back - */ -#define STRIPE_OP_MOD_REPAIR_PD 7 - /* * Plugging: *