ore: Support for raid 6
This simple patch adds support for raid6 to the ORE. Most operations and calculations were already written for the general case. The only things left: * Call async_gen_syndrome() in the case of raid6 (NOTE that the raid6 math is the one supported by the Linux kernel, see: crypto/async_tx/async_pq.c). * Call _ore_add_parity_unit() twice, with only the last call generating the redundancy pages. * Fix a couple of bugs in the old code: a. In reads, when parity==2, it can happen that per_dev->length=0 but per_dev->offset was already set and adjusted by _ore_add_sg_seg(). Don't let it be overwritten. b. The whole 'cur_comp > starting_dev' thing, used to determine whether "per_dev->offset is in the current stripe number or the next one", was a complete raid5/4 accident. When parity==2 this is usually not true at all. All we need to do is increment si->obj_offset once we pass by the first parity device. (This also greatly simplifies the code, amen.) c. Calculation of the si->dev rotation can overflow when parity==2. * Finally, enable raid6 in ore_verify_layout(). I want to deeply thank Daniel Gryniewicz, who first found all the bugs in the old raid code and inspired these patches. Inspired-by: Daniel Gryniewicz <dang@linuxbox.com> Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
This commit is contained in:
Родитель
455682ce54
Коммит
ce5d36aac2
|
@ -9,4 +9,6 @@ config ORE
|
||||||
tristate
|
tristate
|
||||||
depends on EXOFS_FS || PNFS_OBJLAYOUT
|
depends on EXOFS_FS || PNFS_OBJLAYOUT
|
||||||
select ASYNC_XOR
|
select ASYNC_XOR
|
||||||
|
select RAID6_PQ
|
||||||
|
select ASYNC_PQ
|
||||||
default SCSI_OSD_ULD
|
default SCSI_OSD_ULD
|
||||||
|
|
|
@ -58,9 +58,12 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
|
||||||
layout->parity = 1;
|
layout->parity = 1;
|
||||||
break;
|
break;
|
||||||
case PNFS_OSD_RAID_PQ:
|
case PNFS_OSD_RAID_PQ:
|
||||||
|
layout->parity = 2;
|
||||||
|
break;
|
||||||
case PNFS_OSD_RAID_4:
|
case PNFS_OSD_RAID_4:
|
||||||
default:
|
default:
|
||||||
ORE_ERR("Only RAID_0/5 for now\n");
|
ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n",
|
||||||
|
layout->raid_algorithm);
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
|
if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
|
||||||
|
@ -112,6 +115,8 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
|
||||||
layout->max_io_length /= stripe_length;
|
layout->max_io_length /= stripe_length;
|
||||||
layout->max_io_length *= stripe_length;
|
layout->max_io_length *= stripe_length;
|
||||||
}
|
}
|
||||||
|
ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL(ore_verify_layout);
|
EXPORT_SYMBOL(ore_verify_layout);
|
||||||
|
@ -561,7 +566,8 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
|
||||||
|
|
||||||
si->par_dev = (group_width + group_width - parity - RxP) %
|
si->par_dev = (group_width + group_width - parity - RxP) %
|
||||||
group_width + first_dev;
|
group_width + first_dev;
|
||||||
si->dev = (group_width + C - RxP) % group_width + first_dev;
|
si->dev = (group_width + group_width + C - RxP) %
|
||||||
|
group_width + first_dev;
|
||||||
si->bytes_in_stripe = U;
|
si->bytes_in_stripe = U;
|
||||||
si->first_stripe_start = M * S + G * T + N * U;
|
si->first_stripe_start = M * S + G * T + N * U;
|
||||||
} else {
|
} else {
|
||||||
|
@ -651,6 +657,43 @@ out: /* we fail the complete unit on an error eg don't advance
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int _add_parity_units(struct ore_io_state *ios,
|
||||||
|
struct ore_striping_info *si,
|
||||||
|
unsigned dev, unsigned first_dev,
|
||||||
|
unsigned mirrors_p1, unsigned devs_in_group,
|
||||||
|
unsigned cur_len)
|
||||||
|
{
|
||||||
|
unsigned do_parity;
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
|
for (do_parity = ios->layout->parity; do_parity; --do_parity) {
|
||||||
|
struct ore_per_dev_state *per_dev;
|
||||||
|
|
||||||
|
per_dev = &ios->per_dev[dev - first_dev];
|
||||||
|
if (!per_dev->length && !per_dev->offset) {
|
||||||
|
/* Only/always the parity unit of the first
|
||||||
|
* stripe will be empty. So this is a chance to
|
||||||
|
* initialize the per_dev info.
|
||||||
|
*/
|
||||||
|
per_dev->dev = dev;
|
||||||
|
per_dev->offset = si->obj_offset - si->unit_off;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = _ore_add_parity_unit(ios, si, per_dev, cur_len,
|
||||||
|
do_parity == 1);
|
||||||
|
if (unlikely(ret))
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (do_parity != 1) {
|
||||||
|
dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
|
||||||
|
si->cur_comp = (si->cur_comp + 1) %
|
||||||
|
ios->layout->group_width;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
static int _prepare_for_striping(struct ore_io_state *ios)
|
static int _prepare_for_striping(struct ore_io_state *ios)
|
||||||
{
|
{
|
||||||
struct ore_striping_info *si = &ios->si;
|
struct ore_striping_info *si = &ios->si;
|
||||||
|
@ -660,7 +703,6 @@ static int _prepare_for_striping(struct ore_io_state *ios)
|
||||||
unsigned devs_in_group = group_width * mirrors_p1;
|
unsigned devs_in_group = group_width * mirrors_p1;
|
||||||
unsigned dev = si->dev;
|
unsigned dev = si->dev;
|
||||||
unsigned first_dev = dev - (dev % devs_in_group);
|
unsigned first_dev = dev - (dev % devs_in_group);
|
||||||
unsigned dev_order;
|
|
||||||
unsigned cur_pg = ios->pages_consumed;
|
unsigned cur_pg = ios->pages_consumed;
|
||||||
u64 length = ios->length;
|
u64 length = ios->length;
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
|
@ -672,14 +714,13 @@ static int _prepare_for_striping(struct ore_io_state *ios)
|
||||||
|
|
||||||
BUG_ON(length > si->length);
|
BUG_ON(length > si->length);
|
||||||
|
|
||||||
dev_order = si->cur_comp;
|
|
||||||
|
|
||||||
while (length) {
|
while (length) {
|
||||||
struct ore_per_dev_state *per_dev =
|
struct ore_per_dev_state *per_dev =
|
||||||
&ios->per_dev[dev - first_dev];
|
&ios->per_dev[dev - first_dev];
|
||||||
unsigned cur_len, page_off = 0;
|
unsigned cur_len, page_off = 0;
|
||||||
|
|
||||||
if (!per_dev->length) {
|
if (!per_dev->length && !per_dev->offset) {
|
||||||
|
/* First time initialize the per_dev info. */
|
||||||
per_dev->dev = dev;
|
per_dev->dev = dev;
|
||||||
if (dev == si->dev) {
|
if (dev == si->dev) {
|
||||||
WARN_ON(dev == si->par_dev);
|
WARN_ON(dev == si->par_dev);
|
||||||
|
@ -688,13 +729,7 @@ static int _prepare_for_striping(struct ore_io_state *ios)
|
||||||
page_off = si->unit_off & ~PAGE_MASK;
|
page_off = si->unit_off & ~PAGE_MASK;
|
||||||
BUG_ON(page_off && (page_off != ios->pgbase));
|
BUG_ON(page_off && (page_off != ios->pgbase));
|
||||||
} else {
|
} else {
|
||||||
if (si->cur_comp > dev_order)
|
per_dev->offset = si->obj_offset - si->unit_off;
|
||||||
per_dev->offset =
|
|
||||||
si->obj_offset - si->unit_off;
|
|
||||||
else /* si->cur_comp < dev_order */
|
|
||||||
per_dev->offset =
|
|
||||||
si->obj_offset + stripe_unit -
|
|
||||||
si->unit_off;
|
|
||||||
cur_len = stripe_unit;
|
cur_len = stripe_unit;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -721,20 +756,12 @@ static int _prepare_for_striping(struct ore_io_state *ios)
|
||||||
/* If last stripe operate on parity comp */
|
/* If last stripe operate on parity comp */
|
||||||
si->cur_comp = group_width - ios->layout->parity;
|
si->cur_comp = group_width - ios->layout->parity;
|
||||||
}
|
}
|
||||||
per_dev = &ios->per_dev[dev - first_dev];
|
|
||||||
if (!per_dev->length) {
|
|
||||||
/* Only/always the parity unit of the first
|
|
||||||
* stripe will be empty. So this is a chance to
|
|
||||||
* initialize the per_dev info.
|
|
||||||
*/
|
|
||||||
per_dev->dev = dev;
|
|
||||||
per_dev->offset = si->obj_offset - si->unit_off;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* In writes cur_len just means if it's the
|
/* In writes cur_len just means if it's the
|
||||||
* last one. See _ore_add_parity_unit.
|
* last one. See _ore_add_parity_unit.
|
||||||
*/
|
*/
|
||||||
ret = _ore_add_parity_unit(ios, si, per_dev,
|
ret = _add_parity_units(ios, si, dev, first_dev,
|
||||||
|
mirrors_p1, devs_in_group,
|
||||||
ios->sp2d ? length : cur_len);
|
ios->sp2d ? length : cur_len);
|
||||||
if (unlikely(ret))
|
if (unlikely(ret))
|
||||||
goto out;
|
goto out;
|
||||||
|
@ -746,6 +773,8 @@ static int _prepare_for_striping(struct ore_io_state *ios)
|
||||||
/* Next stripe, start fresh */
|
/* Next stripe, start fresh */
|
||||||
si->cur_comp = 0;
|
si->cur_comp = 0;
|
||||||
si->cur_pg = 0;
|
si->cur_pg = 0;
|
||||||
|
si->obj_offset += cur_len;
|
||||||
|
si->unit_off = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
out:
|
out:
|
||||||
|
|
|
@ -218,20 +218,28 @@ static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
|
||||||
static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
|
static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
|
||||||
{
|
{
|
||||||
unsigned p;
|
unsigned p;
|
||||||
|
unsigned tx_flags = ASYNC_TX_ACK;
|
||||||
|
|
||||||
|
if (sp2d->parity == 1)
|
||||||
|
tx_flags |= ASYNC_TX_XOR_ZERO_DST;
|
||||||
|
|
||||||
for (p = 0; p < sp2d->pages_in_unit; p++) {
|
for (p = 0; p < sp2d->pages_in_unit; p++) {
|
||||||
struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
|
struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
|
||||||
|
|
||||||
if (!_1ps->write_count)
|
if (!_1ps->write_count)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
init_async_submit(&_1ps->submit,
|
init_async_submit(&_1ps->submit, tx_flags,
|
||||||
ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK,
|
|
||||||
NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble);
|
NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble);
|
||||||
|
|
||||||
/* TODO: raid6 */
|
if (sp2d->parity == 1)
|
||||||
_1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages,
|
_1ps->tx = async_xor(_1ps->pages[sp2d->data_devs],
|
||||||
0, sp2d->data_devs, PAGE_SIZE,
|
_1ps->pages, 0, sp2d->data_devs,
|
||||||
&_1ps->submit);
|
PAGE_SIZE, &_1ps->submit);
|
||||||
|
else /* parity == 2 */
|
||||||
|
_1ps->tx = async_gen_syndrome(_1ps->pages, 0,
|
||||||
|
sp2d->data_devs + sp2d->parity,
|
||||||
|
PAGE_SIZE, &_1ps->submit);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (p = 0; p < sp2d->pages_in_unit; p++) {
|
for (p = 0; p < sp2d->pages_in_unit; p++) {
|
||||||
|
@ -616,7 +624,7 @@ static int _read_4_write_execute(struct ore_io_state *ios)
|
||||||
int _ore_add_parity_unit(struct ore_io_state *ios,
|
int _ore_add_parity_unit(struct ore_io_state *ios,
|
||||||
struct ore_striping_info *si,
|
struct ore_striping_info *si,
|
||||||
struct ore_per_dev_state *per_dev,
|
struct ore_per_dev_state *per_dev,
|
||||||
unsigned cur_len)
|
unsigned cur_len, bool do_xor)
|
||||||
{
|
{
|
||||||
if (ios->reading) {
|
if (ios->reading) {
|
||||||
if (per_dev->cur_sg >= ios->sgs_per_dev) {
|
if (per_dev->cur_sg >= ios->sgs_per_dev) {
|
||||||
|
@ -641,9 +649,11 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
|
||||||
/* If first stripe, Read in all read4write pages
|
/* If first stripe, Read in all read4write pages
|
||||||
* (if needed) before we calculate the first parity.
|
* (if needed) before we calculate the first parity.
|
||||||
*/
|
*/
|
||||||
_read_4_write_first_stripe(ios);
|
if (do_xor)
|
||||||
|
_read_4_write_first_stripe(ios);
|
||||||
}
|
}
|
||||||
if (!cur_len) /* If last stripe r4w pages of last stripe */
|
if (!cur_len && do_xor)
|
||||||
|
/* If last stripe r4w pages of last stripe */
|
||||||
_read_4_write_last_stripe(ios);
|
_read_4_write_last_stripe(ios);
|
||||||
_read_4_write_execute(ios);
|
_read_4_write_execute(ios);
|
||||||
|
|
||||||
|
@ -655,7 +665,7 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
|
||||||
++(ios->cur_par_page);
|
++(ios->cur_par_page);
|
||||||
}
|
}
|
||||||
|
|
||||||
BUG_ON(si->cur_comp != sp2d->data_devs);
|
BUG_ON(si->cur_comp < sp2d->data_devs);
|
||||||
BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
|
BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
|
||||||
|
|
||||||
ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
|
ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
|
||||||
|
@ -663,9 +673,10 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
|
||||||
if (unlikely(ret))
|
if (unlikely(ret))
|
||||||
return ret;
|
return ret;
|
||||||
|
|
||||||
/* TODO: raid6 if (last_parity_dev) */
|
if (do_xor) {
|
||||||
_gen_xor_unit(sp2d);
|
_gen_xor_unit(sp2d);
|
||||||
_sp2d_reset(sp2d, ios->r4w, ios->private);
|
_sp2d_reset(sp2d, ios->r4w, ios->private);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -38,7 +38,8 @@ void _ore_free_raid_stuff(struct ore_io_state *ios);
|
||||||
void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
|
void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
|
||||||
bool not_last);
|
bool not_last);
|
||||||
int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
|
int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
|
||||||
struct ore_per_dev_state *per_dev, unsigned cur_len);
|
struct ore_per_dev_state *per_dev, unsigned cur_len,
|
||||||
|
bool do_xor);
|
||||||
void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
|
void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
|
||||||
struct ore_striping_info *si, struct page *page);
|
struct ore_striping_info *si, struct page *page);
|
||||||
static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
|
static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
|
||||||
|
|
Загрузка…
Ссылка в новой задаче