drbd: cleanup ondisk meta data layout calculations and defines
Add a comment about our meta data layout variants, and rename a few defines (e.g. MD_RESERVED_SECT -> MD_128MB_SECT) to make it clear that they are short hand for fixed constants, and not arbitrarily to be redefined as one may see fit. Properly pad struct meta_data_on_disk to 4kB, and initialize to zero not only the first 512 Byte, but all of it in drbd_md_sync(). Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com> Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
Родитель
9114d79569
Коммит
ae8bf312e9
|
@ -209,7 +209,8 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
|
|||
current->comm, current->pid, __func__,
|
||||
(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
|
||||
|
||||
err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE);
|
||||
/* we do all our meta data IO in aligned 4k blocks. */
|
||||
err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, 4096);
|
||||
if (err) {
|
||||
dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
|
||||
(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
|
||||
|
@ -350,6 +351,24 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
|
|||
(BM_EXT_SHIFT - BM_BLOCK_SHIFT));
|
||||
}
|
||||
|
||||
static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev)
|
||||
{
|
||||
const unsigned int stripes = 1;
|
||||
const unsigned int stripe_size_4kB = MD_32kB_SECT/MD_4kB_SECT;
|
||||
|
||||
/* transaction number, modulo on-disk ring buffer wrap around */
|
||||
unsigned int t = mdev->al_tr_number % (stripe_size_4kB * stripes);
|
||||
|
||||
/* ... to aligned 4k on disk block */
|
||||
t = ((t % stripes) * stripe_size_4kB) + t/stripes;
|
||||
|
||||
/* ... to 512 byte sector in activity log */
|
||||
t *= 8;
|
||||
|
||||
/* ... plus offset to the on disk position */
|
||||
return mdev->ldev->md.md_offset + mdev->ldev->md.al_offset + t;
|
||||
}
|
||||
|
||||
static int
|
||||
_al_write_transaction(struct drbd_conf *mdev)
|
||||
{
|
||||
|
@ -432,13 +451,12 @@ _al_write_transaction(struct drbd_conf *mdev)
|
|||
if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
|
||||
mdev->al_tr_cycle = 0;
|
||||
|
||||
sector = mdev->ldev->md.md_offset
|
||||
+ mdev->ldev->md.al_offset
|
||||
+ mdev->al_tr_pos * (MD_BLOCK_SIZE>>9);
|
||||
sector = al_tr_number_to_on_disk_sector(mdev);
|
||||
|
||||
crc = crc32c(0, buffer, 4096);
|
||||
buffer->crc32c = cpu_to_be32(crc);
|
||||
|
||||
/* normal execution path goes through all three branches */
|
||||
if (drbd_bm_write_hinted(mdev))
|
||||
err = -EIO;
|
||||
/* drbd_chk_io_error done already */
|
||||
|
@ -446,8 +464,6 @@ _al_write_transaction(struct drbd_conf *mdev)
|
|||
err = -EIO;
|
||||
drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
|
||||
} else {
|
||||
/* advance ringbuffer position and transaction counter */
|
||||
mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE);
|
||||
mdev->al_tr_number++;
|
||||
}
|
||||
|
||||
|
|
|
@ -612,6 +612,17 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
|
|||
}
|
||||
}
|
||||
|
||||
/* For the layout, see comment above drbd_md_set_sector_offsets(). */
|
||||
static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev)
|
||||
{
|
||||
u64 bitmap_sectors;
|
||||
if (ldev->md.al_offset == 8)
|
||||
bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset;
|
||||
else
|
||||
bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset;
|
||||
return bitmap_sectors << (9 + 3);
|
||||
}
|
||||
|
||||
/*
|
||||
* make sure the bitmap has enough room for the attached storage,
|
||||
* if necessary, resize.
|
||||
|
@ -668,7 +679,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
|
|||
words = ALIGN(bits, 64) >> LN2_BPL;
|
||||
|
||||
if (get_ldev(mdev)) {
|
||||
u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12;
|
||||
u64 bits_on_disk = drbd_md_on_disk_bits(mdev->ldev);
|
||||
put_ldev(mdev);
|
||||
if (bits > bits_on_disk) {
|
||||
dev_info(DEV, "bits = %lu\n", bits);
|
||||
|
|
|
@ -753,13 +753,8 @@ struct drbd_md {
|
|||
u32 flags;
|
||||
u32 md_size_sect;
|
||||
|
||||
s32 al_offset; /* signed relative sector offset to al area */
|
||||
s32 al_offset; /* signed relative sector offset to activity log */
|
||||
s32 bm_offset; /* signed relative sector offset to bitmap */
|
||||
|
||||
/* u32 al_nr_extents; important for restoring the AL
|
||||
* is stored into ldev->dc.al_extents, which in turn
|
||||
* gets applied to act_log->nr_elements
|
||||
*/
|
||||
};
|
||||
|
||||
struct drbd_backing_dev {
|
||||
|
@ -1009,7 +1004,6 @@ struct drbd_conf {
|
|||
struct lru_cache *act_log; /* activity log */
|
||||
unsigned int al_tr_number;
|
||||
int al_tr_cycle;
|
||||
int al_tr_pos; /* position of the next transaction in the journal */
|
||||
wait_queue_head_t seq_wait;
|
||||
atomic_t packet_seq;
|
||||
unsigned int peer_seq;
|
||||
|
@ -1151,21 +1145,41 @@ extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
|
|||
extern void drbd_ldev_destroy(struct drbd_conf *mdev);
|
||||
|
||||
/* Meta data layout
|
||||
We reserve a 128MB Block (4k aligned)
|
||||
* either at the end of the backing device
|
||||
* or on a separate meta data device. */
|
||||
*
|
||||
* We currently have two possible layouts.
|
||||
* Offsets in (512 byte) sectors.
|
||||
* external:
|
||||
* |----------- md_size_sect ------------------|
|
||||
* [ 4k superblock ][ activity log ][ Bitmap ]
|
||||
* | al_offset == 8 |
|
||||
* | bm_offset = al_offset + X |
|
||||
* ==> bitmap sectors = md_size_sect - bm_offset
|
||||
*
|
||||
* Variants:
|
||||
* old, indexed fixed size meta data:
|
||||
*
|
||||
* internal:
|
||||
* |----------- md_size_sect ------------------|
|
||||
* [data.....][ Bitmap ][ activity log ][ 4k superblock ][padding*]
|
||||
* | al_offset < 0 |
|
||||
* | bm_offset = al_offset - Y |
|
||||
* ==> bitmap sectors = Y = al_offset - bm_offset
|
||||
*
|
||||
* [padding*] are zero or up to 7 unused 512 Byte sectors to the
|
||||
* end of the device, so that the [4k superblock] will be 4k aligned.
|
||||
*
|
||||
* The activity log consists of 4k transaction blocks,
|
||||
* which are written in a ring-buffer, or striped ring-buffer like fashion,
|
||||
* whose size used to be fixed 32kB,
|
||||
* but is about to become configurable.
|
||||
*/
|
||||
|
||||
/* The following numbers are sectors */
|
||||
/* Allows up to about 3.8TB, so if you want more,
|
||||
/* Our old fixed size meta data layout
|
||||
* allows up to about 3.8TB, so if you want more,
|
||||
* you need to use the "flexible" meta data format. */
|
||||
#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */
|
||||
#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */
|
||||
#define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */
|
||||
#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS)
|
||||
|
||||
/* we do all meta data IO in 4k blocks */
|
||||
#define MD_BLOCK_SHIFT 12
|
||||
#define MD_BLOCK_SIZE (1<<MD_BLOCK_SHIFT)
|
||||
#define MD_128MB_SECT (128LLU << 11) /* 128 MB, unit sectors */
|
||||
#define MD_4kB_SECT 8
|
||||
#define MD_32kB_SECT 64
|
||||
|
||||
/* One activity log extent represents 4M of storage */
|
||||
#define AL_EXTENT_SHIFT 22
|
||||
|
@ -1255,7 +1269,6 @@ struct bm_extent {
|
|||
|
||||
/* in one sector of the bitmap, we have this many activity_log extents. */
|
||||
#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
|
||||
#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
|
||||
|
||||
#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
|
||||
#define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
|
||||
|
@ -1275,16 +1288,18 @@ struct bm_extent {
|
|||
*/
|
||||
|
||||
#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
|
||||
#define DRBD_MAX_SECTORS_BM \
|
||||
((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9)))
|
||||
#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32
|
||||
#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
|
||||
#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM
|
||||
#elif !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32
|
||||
/* we have a certain meta data variant that has a fixed on-disk size of 128
|
||||
* MiB, of which 4k are our "superblock", and 32k are the fixed size activity
|
||||
* log, leaving this many sectors for the bitmap.
|
||||
*/
|
||||
|
||||
#define DRBD_MAX_SECTORS_FIXED_BM \
|
||||
((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9)))
|
||||
#if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32
|
||||
#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32
|
||||
#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
|
||||
#else
|
||||
#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
|
||||
#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM
|
||||
/* 16 TB in units of sectors */
|
||||
#if BITS_PER_LONG == 32
|
||||
/* adjust by one page worth of bitmap,
|
||||
|
@ -1792,10 +1807,10 @@ static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
|
|||
switch (meta_dev_idx) {
|
||||
case DRBD_MD_INDEX_INTERNAL:
|
||||
case DRBD_MD_INDEX_FLEX_INT:
|
||||
return bdev->md.md_offset + MD_AL_OFFSET - 1;
|
||||
return bdev->md.md_offset + MD_4kB_SECT -1;
|
||||
case DRBD_MD_INDEX_FLEX_EXT:
|
||||
default:
|
||||
return bdev->md.md_offset + bdev->md.md_size_sect;
|
||||
return bdev->md.md_offset + bdev->md.md_size_sect -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1861,13 +1876,11 @@ static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
|
|||
rcu_read_unlock();
|
||||
|
||||
switch (meta_dev_idx) {
|
||||
default: /* external, some index */
|
||||
return MD_RESERVED_SECT * meta_dev_idx;
|
||||
default: /* external, some index; this is the old fixed size layout */
|
||||
return MD_128MB_SECT * meta_dev_idx;
|
||||
case DRBD_MD_INDEX_INTERNAL:
|
||||
/* with drbd08, internal meta data is always "flexible" */
|
||||
case DRBD_MD_INDEX_FLEX_INT:
|
||||
/* sizeof(struct md_on_disk_07) == 4k
|
||||
* position: last 4k aligned block of 4k size */
|
||||
if (!bdev->backing_bdev) {
|
||||
if (__ratelimit(&drbd_ratelimit_state)) {
|
||||
dev_err(DEV, "bdev->backing_bdev==NULL\n");
|
||||
|
@ -1875,8 +1888,9 @@ static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL)
|
||||
- MD_AL_OFFSET;
|
||||
/* sizeof(struct md_on_disk_07) == 4k
|
||||
* position: last 4k aligned block of 4k size */
|
||||
return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8;
|
||||
case DRBD_MD_INDEX_FLEX_EXT:
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -2834,6 +2834,7 @@ void conn_md_sync(struct drbd_tconn *tconn)
|
|||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/* aligned 4kByte */
|
||||
struct meta_data_on_disk {
|
||||
u64 la_size; /* last agreed size. */
|
||||
u64 uuid[UI_SIZE]; /* UUIDs. */
|
||||
|
@ -2843,13 +2844,13 @@ struct meta_data_on_disk {
|
|||
u32 magic;
|
||||
u32 md_size_sect;
|
||||
u32 al_offset; /* offset to this block */
|
||||
u32 al_nr_extents; /* important for restoring the AL */
|
||||
u32 al_nr_extents; /* important for restoring the AL (userspace) */
|
||||
/* `-- act_log->nr_elements <-- ldev->dc.al_extents */
|
||||
u32 bm_offset; /* offset to the bitmap, from here */
|
||||
u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
|
||||
u32 la_peer_max_bio_size; /* last peer max_bio_size */
|
||||
u32 reserved_u32[3];
|
||||
|
||||
u8 reserved_u8[4096 - (7*8 + 8*4)];
|
||||
} __packed;
|
||||
|
||||
/**
|
||||
|
@ -2862,6 +2863,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
|
|||
sector_t sector;
|
||||
int i;
|
||||
|
||||
/* Don't accidentally change the DRBD meta data layout. */
|
||||
BUILD_BUG_ON(UI_SIZE != 4);
|
||||
BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
|
||||
|
||||
del_timer(&mdev->md_sync_timer);
|
||||
/* timer may be rearmed by drbd_md_mark_dirty() now. */
|
||||
if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
|
||||
|
@ -2876,7 +2881,7 @@ void drbd_md_sync(struct drbd_conf *mdev)
|
|||
if (!buffer)
|
||||
goto out;
|
||||
|
||||
memset(buffer, 0, 512);
|
||||
memset(buffer, 0, sizeof(*buffer));
|
||||
|
||||
buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
|
||||
for (i = UI_CURRENT; i < UI_SIZE; i++)
|
||||
|
|
|
@ -696,12 +696,32 @@ out:
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* initializes the md.*_offset members, so we are able to find
|
||||
* the on disk meta data */
|
||||
/* Initializes the md.*_offset members, so we are able to find
|
||||
* the on disk meta data.
|
||||
*
|
||||
* We currently have two possible layouts:
|
||||
* external:
|
||||
* |----------- md_size_sect ------------------|
|
||||
* [ 4k superblock ][ activity log ][ Bitmap ]
|
||||
* | al_offset == 8 |
|
||||
* | bm_offset = al_offset + X |
|
||||
* ==> bitmap sectors = md_size_sect - bm_offset
|
||||
*
|
||||
* internal:
|
||||
* |----------- md_size_sect ------------------|
|
||||
* [data.....][ Bitmap ][ activity log ][ 4k superblock ]
|
||||
* | al_offset < 0 |
|
||||
* | bm_offset = al_offset - Y |
|
||||
* ==> bitmap sectors = Y = al_offset - bm_offset
|
||||
*
|
||||
* Activity log size used to be fixed 32kB,
|
||||
* but is about to become configurable.
|
||||
*/
|
||||
static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
|
||||
struct drbd_backing_dev *bdev)
|
||||
{
|
||||
sector_t md_size_sect = 0;
|
||||
unsigned int al_size_sect = MD_32kB_SECT;
|
||||
int meta_dev_idx;
|
||||
|
||||
rcu_read_lock();
|
||||
|
@ -710,23 +730,23 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
|
|||
switch (meta_dev_idx) {
|
||||
default:
|
||||
/* v07 style fixed size indexed meta data */
|
||||
bdev->md.md_size_sect = MD_RESERVED_SECT;
|
||||
bdev->md.md_size_sect = MD_128MB_SECT;
|
||||
bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
|
||||
bdev->md.al_offset = MD_AL_OFFSET;
|
||||
bdev->md.bm_offset = MD_BM_OFFSET;
|
||||
bdev->md.al_offset = MD_4kB_SECT;
|
||||
bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
|
||||
break;
|
||||
case DRBD_MD_INDEX_FLEX_EXT:
|
||||
/* just occupy the full device; unit: sectors */
|
||||
bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
|
||||
bdev->md.md_offset = 0;
|
||||
bdev->md.al_offset = MD_AL_OFFSET;
|
||||
bdev->md.bm_offset = MD_BM_OFFSET;
|
||||
bdev->md.al_offset = MD_4kB_SECT;
|
||||
bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
|
||||
break;
|
||||
case DRBD_MD_INDEX_INTERNAL:
|
||||
case DRBD_MD_INDEX_FLEX_INT:
|
||||
bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
|
||||
/* al size is still fixed */
|
||||
bdev->md.al_offset = -MD_AL_SECTORS;
|
||||
bdev->md.al_offset = -al_size_sect;
|
||||
/* we need (slightly less than) ~ this many bitmap sectors: */
|
||||
md_size_sect = drbd_get_capacity(bdev->backing_bdev);
|
||||
md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
|
||||
|
@ -735,11 +755,11 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
|
|||
|
||||
/* plus the "drbd meta data super block",
|
||||
* and the activity log; */
|
||||
md_size_sect += MD_BM_OFFSET;
|
||||
md_size_sect += MD_4kB_SECT + al_size_sect;
|
||||
|
||||
bdev->md.md_size_sect = md_size_sect;
|
||||
/* bitmap offset is adjusted by 'super' block size */
|
||||
bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET;
|
||||
bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT;
|
||||
break;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
@ -1416,7 +1436,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
|
|||
min_md_device_sectors = (2<<10);
|
||||
} else {
|
||||
max_possible_sectors = DRBD_MAX_SECTORS;
|
||||
min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1);
|
||||
min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1);
|
||||
}
|
||||
|
||||
if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
|
||||
|
|
Загрузка…
Ссылка в новой задаче