Btrfs, scrub: repair the common data on RAID5/6 if it is corrupted

This patch implements the RAID5/6 common data repair function. The
implementation is similar to scrub on the other RAID profiles such as
RAID1; the difference is that we don't read the data from a mirror,
but use the data repair function of RAID5/6 instead.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
This commit is contained in:
Miao Xie 2014-10-23 14:42:50 +08:00
Родитель b89e1b012c
Коммит af8e2d1df9
5 изменённых файлов: 235 добавлений и 33 удалений

Просмотреть файл

@ -58,6 +58,15 @@
*/ */
#define RBIO_CACHE_READY_BIT 3 #define RBIO_CACHE_READY_BIT 3
/*
* bbio and raid_map are managed by the caller, so we shouldn't free
* them here. Besides that, rbios with this flag must not be cached:
* we need raid_map to check whether the rbios' stripe is the same or
* not, but it is very likely that the caller has already freed
* raid_map, so don't cache those rbios.
*/
#define RBIO_HOLD_BBIO_MAP_BIT 4
#define RBIO_CACHE_SIZE 1024 #define RBIO_CACHE_SIZE 1024
struct btrfs_raid_bio { struct btrfs_raid_bio {
@ -799,6 +808,21 @@ done_nolock:
remove_rbio_from_cache(rbio); remove_rbio_from_cache(rbio);
} }
/*
 * Free @raid_map and @bbio, but only when @need is non-zero.
 * With @need == 0 neither pointer is touched.
 */
static inline void
__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
{
	if (!need)
		return;

	kfree(raid_map);
	kfree(bbio);
}
/*
 * Free the bbio and raid_map attached to @rbio, unless
 * RBIO_HOLD_BBIO_MAP_BIT is set in rbio->flags, in which case both are
 * left alone.
 */
static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
{
	int held = test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);

	__free_bbio_and_raid_map(rbio->bbio, rbio->raid_map, !held);
}
static void __free_raid_bio(struct btrfs_raid_bio *rbio) static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{ {
int i; int i;
@ -817,8 +841,9 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
rbio->stripe_pages[i] = NULL; rbio->stripe_pages[i] = NULL;
} }
} }
kfree(rbio->raid_map);
kfree(rbio->bbio); free_bbio_and_raid_map(rbio);
kfree(rbio); kfree(rbio);
} }
@ -933,11 +958,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
GFP_NOFS); GFP_NOFS);
if (!rbio) { if (!rbio)
kfree(raid_map);
kfree(bbio);
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
}
bio_list_init(&rbio->bio_list); bio_list_init(&rbio->bio_list);
INIT_LIST_HEAD(&rbio->plug_list); INIT_LIST_HEAD(&rbio->plug_list);
@ -1692,8 +1714,10 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
struct blk_plug_cb *cb; struct blk_plug_cb *cb;
rbio = alloc_rbio(root, bbio, raid_map, stripe_len); rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
if (IS_ERR(rbio)) if (IS_ERR(rbio)) {
__free_bbio_and_raid_map(bbio, raid_map, 1);
return PTR_ERR(rbio); return PTR_ERR(rbio);
}
bio_list_add(&rbio->bio_list, bio); bio_list_add(&rbio->bio_list, bio);
rbio->bio_list_bytes = bio->bi_iter.bi_size; rbio->bio_list_bytes = bio->bi_iter.bi_size;
@ -1888,7 +1912,8 @@ cleanup:
cleanup_io: cleanup_io:
if (rbio->read_rebuild) { if (rbio->read_rebuild) {
if (err == 0) if (err == 0 &&
!test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
cache_rbio_pages(rbio); cache_rbio_pages(rbio);
else else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@ -2038,15 +2063,19 @@ cleanup:
*/ */
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
struct btrfs_bio *bbio, u64 *raid_map, struct btrfs_bio *bbio, u64 *raid_map,
u64 stripe_len, int mirror_num) u64 stripe_len, int mirror_num, int hold_bbio)
{ {
struct btrfs_raid_bio *rbio; struct btrfs_raid_bio *rbio;
int ret; int ret;
rbio = alloc_rbio(root, bbio, raid_map, stripe_len); rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
if (IS_ERR(rbio)) if (IS_ERR(rbio)) {
__free_bbio_and_raid_map(bbio, raid_map, !hold_bbio);
return PTR_ERR(rbio); return PTR_ERR(rbio);
}
if (hold_bbio)
set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
rbio->read_rebuild = 1; rbio->read_rebuild = 1;
bio_list_add(&rbio->bio_list, bio); bio_list_add(&rbio->bio_list, bio);
rbio->bio_list_bytes = bio->bi_iter.bi_size; rbio->bio_list_bytes = bio->bi_iter.bi_size;
@ -2054,8 +2083,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
rbio->faila = find_logical_bio_stripe(rbio, bio); rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) { if (rbio->faila == -1) {
BUG(); BUG();
kfree(raid_map); __free_bbio_and_raid_map(bbio, raid_map, !hold_bbio);
kfree(bbio);
kfree(rbio); kfree(rbio);
return -EIO; return -EIO;
} }

Просмотреть файл

@ -41,7 +41,7 @@ static inline int nr_data_stripes(struct map_lookup *map)
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
struct btrfs_bio *bbio, u64 *raid_map, struct btrfs_bio *bbio, u64 *raid_map,
u64 stripe_len, int mirror_num); u64 stripe_len, int mirror_num, int hold_bbio);
int raid56_parity_write(struct btrfs_root *root, struct bio *bio, int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
struct btrfs_bio *bbio, u64 *raid_map, struct btrfs_bio *bbio, u64 *raid_map,
u64 stripe_len); u64 stripe_len);

Просмотреть файл

@ -63,6 +63,13 @@ struct scrub_ctx;
*/ */
#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
/*
* Reference-counted mapping information attached to a scrub_page
* (via scrub_page::recover) for use during recheck.
*/
struct scrub_recover {
/* reference count; see scrub_get_recover() / scrub_put_recover() */
atomic_t refs;
/* mapping result; kfree()d by scrub_put_recover() on the last put */
struct btrfs_bio *bbio;
/* raid map, may be NULL; kfree()d by scrub_put_recover() on the last put */
u64 *raid_map;
/* mapped length recorded when the block was mapped */
u64 map_length;
};
struct scrub_page { struct scrub_page {
struct scrub_block *sblock; struct scrub_block *sblock;
struct page *page; struct page *page;
@ -79,6 +86,8 @@ struct scrub_page {
unsigned int io_error:1; unsigned int io_error:1;
}; };
u8 csum[BTRFS_CSUM_SIZE]; u8 csum[BTRFS_CSUM_SIZE];
struct scrub_recover *recover;
}; };
struct scrub_bio { struct scrub_bio {
@ -196,7 +205,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
static void scrub_recheck_block(struct btrfs_fs_info *fs_info, static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock, int is_metadata, struct scrub_block *sblock, int is_metadata,
int have_csum, u8 *csum, u64 generation, int have_csum, u8 *csum, u64 generation,
u16 csum_size); u16 csum_size, int retry_failed_mirror);
static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock, struct scrub_block *sblock,
int is_metadata, int have_csum, int is_metadata, int have_csum,
@ -790,6 +799,20 @@ out:
scrub_pending_trans_workers_dec(sctx); scrub_pending_trans_workers_dec(sctx);
} }
static inline void scrub_get_recover(struct scrub_recover *recover)
{
atomic_inc(&recover->refs);
}
/*
 * Drop one reference on @recover. When the count reaches zero the
 * bbio, the raid_map and the scrub_recover itself are freed.
 */
static inline void scrub_put_recover(struct scrub_recover *recover)
{
	if (!atomic_dec_and_test(&recover->refs))
		return;

	kfree(recover->bbio);
	kfree(recover->raid_map);
	kfree(recover);
}
/* /*
* scrub_handle_errored_block gets called when either verification of the * scrub_handle_errored_block gets called when either verification of the
* pages failed or the bio failed to read, e.g. with EIO. In the latter * pages failed or the bio failed to read, e.g. with EIO. In the latter
@ -906,7 +929,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
/* build and submit the bios for the failed mirror, check checksums */ /* build and submit the bios for the failed mirror, check checksums */
scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
csum, generation, sctx->csum_size); csum, generation, sctx->csum_size, 1);
if (!sblock_bad->header_error && !sblock_bad->checksum_error && if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen) { sblock_bad->no_io_error_seen) {
@ -1019,7 +1042,7 @@ nodatasum_case:
/* build and submit the bios, check checksums */ /* build and submit the bios, check checksums */
scrub_recheck_block(fs_info, sblock_other, is_metadata, scrub_recheck_block(fs_info, sblock_other, is_metadata,
have_csum, csum, generation, have_csum, csum, generation,
sctx->csum_size); sctx->csum_size, 0);
if (!sblock_other->header_error && if (!sblock_other->header_error &&
!sblock_other->checksum_error && !sblock_other->checksum_error &&
@ -1169,7 +1192,7 @@ nodatasum_case:
*/ */
scrub_recheck_block(fs_info, sblock_bad, scrub_recheck_block(fs_info, sblock_bad,
is_metadata, have_csum, csum, is_metadata, have_csum, csum,
generation, sctx->csum_size); generation, sctx->csum_size, 1);
if (!sblock_bad->header_error && if (!sblock_bad->header_error &&
!sblock_bad->checksum_error && !sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen) sblock_bad->no_io_error_seen)
@ -1201,11 +1224,18 @@ out:
mirror_index++) { mirror_index++) {
struct scrub_block *sblock = sblocks_for_recheck + struct scrub_block *sblock = sblocks_for_recheck +
mirror_index; mirror_index;
struct scrub_recover *recover;
int page_index; int page_index;
for (page_index = 0; page_index < sblock->page_count; for (page_index = 0; page_index < sblock->page_count;
page_index++) { page_index++) {
sblock->pagev[page_index]->sblock = NULL; sblock->pagev[page_index]->sblock = NULL;
recover = sblock->pagev[page_index]->recover;
if (recover) {
scrub_put_recover(recover);
sblock->pagev[page_index]->recover =
NULL;
}
scrub_page_put(sblock->pagev[page_index]); scrub_page_put(sblock->pagev[page_index]);
} }
} }
@ -1215,14 +1245,63 @@ out:
return 0; return 0;
} }
/*
 * Return the number of mirrors to set up for recheck.
 *
 * Without a raid_map this is bbio->num_stripes. With a raid_map it is
 * 3 when the last raid_map entry is RAID6_Q_STRIPE and 2 otherwise.
 */
static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
{
	if (!raid_map)
		return (int)bbio->num_stripes;

	return raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE ? 3 : 2;
}
/*
 * Work out which stripe of the mapping holds @logical and the offset of
 * @logical inside that stripe.
 *
 * With a raid_map (RAID5/6), the first entry that is neither
 * RAID5_P_STRIPE nor RAID6_Q_STRIPE and whose range
 * [raid_map[i], raid_map[i] + mapped_length) contains @logical is
 * selected. Without a raid_map, the stripe index is @mirror and the
 * offset is 0.
 *
 * NOTE(review): if no entry matches, the index ends up equal to
 * @nstripes and raid_map[nstripes] is read. Presumably callers only
 * pass a @logical that the mapping covers - confirm.
 */
static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
						 u64 mapped_length,
						 int nstripes, int mirror,
						 int *stripe_index,
						 u64 *stripe_offset)
{
	int idx = 0;

	if (!raid_map) {
		/* The other RAID type */
		*stripe_index = mirror;
		*stripe_offset = 0;
		return;
	}

	/* RAID5/6 */
	while (idx < nstripes) {
		u64 start = raid_map[idx];

		/* parity stripes never hold the logical address */
		if (start != RAID6_Q_STRIPE && start != RAID5_P_STRIPE &&
		    logical >= start && logical < start + mapped_length)
			break;
		idx++;
	}

	*stripe_index = idx;
	*stripe_offset = logical - raid_map[idx];
}
static int scrub_setup_recheck_block(struct scrub_ctx *sctx, static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
struct btrfs_fs_info *fs_info, struct btrfs_fs_info *fs_info,
struct scrub_block *original_sblock, struct scrub_block *original_sblock,
u64 length, u64 logical, u64 length, u64 logical,
struct scrub_block *sblocks_for_recheck) struct scrub_block *sblocks_for_recheck)
{ {
struct scrub_recover *recover;
struct btrfs_bio *bbio;
u64 *raid_map;
u64 sublen;
u64 mapped_length;
u64 stripe_offset;
int stripe_index;
int page_index; int page_index;
int mirror_index; int mirror_index;
int nmirrors;
int ret; int ret;
/* /*
@ -1233,23 +1312,39 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
page_index = 0; page_index = 0;
while (length > 0) { while (length > 0) {
u64 sublen = min_t(u64, length, PAGE_SIZE); sublen = min_t(u64, length, PAGE_SIZE);
u64 mapped_length = sublen; mapped_length = sublen;
struct btrfs_bio *bbio = NULL; bbio = NULL;
raid_map = NULL;
/* /*
* with a length of PAGE_SIZE, each returned stripe * with a length of PAGE_SIZE, each returned stripe
* represents one mirror * represents one mirror
*/ */
ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
&mapped_length, &bbio, 0); &mapped_length, &bbio, 0, &raid_map);
if (ret || !bbio || mapped_length < sublen) { if (ret || !bbio || mapped_length < sublen) {
kfree(bbio); kfree(bbio);
kfree(raid_map);
return -EIO; return -EIO;
} }
recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
if (!recover) {
kfree(bbio);
kfree(raid_map);
return -ENOMEM;
}
atomic_set(&recover->refs, 1);
recover->bbio = bbio;
recover->raid_map = raid_map;
recover->map_length = mapped_length;
BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) { mirror_index++) {
struct scrub_block *sblock; struct scrub_block *sblock;
struct scrub_page *page; struct scrub_page *page;
@ -1265,26 +1360,38 @@ leave_nomem:
spin_lock(&sctx->stat_lock); spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++; sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock); spin_unlock(&sctx->stat_lock);
kfree(bbio); scrub_put_recover(recover);
return -ENOMEM; return -ENOMEM;
} }
scrub_page_get(page); scrub_page_get(page);
sblock->pagev[page_index] = page; sblock->pagev[page_index] = page;
page->logical = logical; page->logical = logical;
page->physical = bbio->stripes[mirror_index].physical;
scrub_stripe_index_and_offset(logical, raid_map,
mapped_length,
bbio->num_stripes,
mirror_index,
&stripe_index,
&stripe_offset);
page->physical = bbio->stripes[stripe_index].physical +
stripe_offset;
page->dev = bbio->stripes[stripe_index].dev;
BUG_ON(page_index >= original_sblock->page_count); BUG_ON(page_index >= original_sblock->page_count);
page->physical_for_dev_replace = page->physical_for_dev_replace =
original_sblock->pagev[page_index]-> original_sblock->pagev[page_index]->
physical_for_dev_replace; physical_for_dev_replace;
/* for missing devices, dev->bdev is NULL */ /* for missing devices, dev->bdev is NULL */
page->dev = bbio->stripes[mirror_index].dev;
page->mirror_num = mirror_index + 1; page->mirror_num = mirror_index + 1;
sblock->page_count++; sblock->page_count++;
page->page = alloc_page(GFP_NOFS); page->page = alloc_page(GFP_NOFS);
if (!page->page) if (!page->page)
goto leave_nomem; goto leave_nomem;
scrub_get_recover(recover);
page->recover = recover;
} }
kfree(bbio); scrub_put_recover(recover);
length -= sublen; length -= sublen;
logical += sublen; logical += sublen;
page_index++; page_index++;
@ -1293,6 +1400,51 @@ leave_nomem:
return 0; return 0;
} }
/*
* Result of a bio that is waited for synchronously: installed as
* bio->bi_private and filled in by scrub_bio_wait_endio().
*/
struct scrub_bio_ret {
/* completed by scrub_bio_wait_endio() */
struct completion event;
/* error value passed to scrub_bio_wait_endio() */
int error;
};
static void scrub_bio_wait_endio(struct bio *bio, int error)
{
struct scrub_bio_ret *ret = bio->bi_private;
ret->error = error;
complete(&ret->event);
}
static inline int scrub_is_page_on_raid56(struct scrub_page *page)
{
return page->recover && page->recover->raid_map;
}
static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
struct bio *bio,
struct scrub_page *page)
{
struct scrub_bio_ret done;
int ret;
init_completion(&done.event);
done.error = 0;
bio->bi_iter.bi_sector = page->logical >> 9;
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
page->recover->raid_map,
page->recover->map_length,
page->mirror_num, 1);
if (ret)
return ret;
wait_for_completion(&done.event);
if (done.error)
return -EIO;
return 0;
}
/* /*
* this function will check the on disk data for checksum errors, header * this function will check the on disk data for checksum errors, header
* errors and read I/O errors. If any I/O errors happen, the exact pages * errors and read I/O errors. If any I/O errors happen, the exact pages
@ -1303,7 +1455,7 @@ leave_nomem:
static void scrub_recheck_block(struct btrfs_fs_info *fs_info, static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock, int is_metadata, struct scrub_block *sblock, int is_metadata,
int have_csum, u8 *csum, u64 generation, int have_csum, u8 *csum, u64 generation,
u16 csum_size) u16 csum_size, int retry_failed_mirror)
{ {
int page_num; int page_num;
@ -1329,11 +1481,17 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
continue; continue;
} }
bio->bi_bdev = page->dev->bdev; bio->bi_bdev = page->dev->bdev;
bio->bi_iter.bi_sector = page->physical >> 9;
bio_add_page(bio, page->page, PAGE_SIZE, 0); bio_add_page(bio, page->page, PAGE_SIZE, 0);
if (btrfsic_submit_bio_wait(READ, bio)) if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
sblock->no_io_error_seen = 0; if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
sblock->no_io_error_seen = 0;
} else {
bio->bi_iter.bi_sector = page->physical >> 9;
if (btrfsic_submit_bio_wait(READ, bio))
sblock->no_io_error_seen = 0;
}
bio_put(bio); bio_put(bio);
} }

Просмотреть файл

@ -5161,7 +5161,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
BTRFS_BLOCK_GROUP_RAID6)) { BTRFS_BLOCK_GROUP_RAID6)) {
u64 tmp; u64 tmp;
if (raid_map_ret && ((rw & REQ_WRITE) || mirror_num > 1)) { if (raid_map_ret &&
((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
mirror_num > 1)) {
int i, rot; int i, rot;
/* push stripe_nr back to the start of the full stripe */ /* push stripe_nr back to the start of the full stripe */
@ -5440,6 +5442,16 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
mirror_num, NULL); mirror_num, NULL);
} }
/*
* For Scrub/replace.
*
* Forwards every argument unchanged to __btrfs_map_block(), including
* raid_map_ret, and returns its result.
*/
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num,
u64 **raid_map_ret)
{
return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
mirror_num, raid_map_ret);
}
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 chunk_start, u64 physical, u64 devid, u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len) u64 **logical, int *naddrs, int *stripe_len)
@ -5809,7 +5821,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
} else { } else {
ret = raid56_parity_recover(root, bio, bbio, ret = raid56_parity_recover(root, bio, bbio,
raid_map, map_length, raid_map, map_length,
mirror_num); mirror_num, 0);
} }
/* /*
* FIXME, replace dosen't support raid56 yet, please fix * FIXME, replace dosen't support raid56 yet, please fix

Просмотреть файл

@ -393,6 +393,10 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length, u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num); struct btrfs_bio **bbio_ret, int mirror_num);
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num,
u64 **raid_map_ret);
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 chunk_start, u64 physical, u64 devid, u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len); u64 **logical, int *naddrs, int *stripe_len);