/*
 * Copyright (C) 2011
 * Boaz Harrosh <bharrosh@panasas.com>
 *
 * Public Declarations of the ORE API
 *
 * This file is part of the ORE (Object Raid Engine) library.
 *
 * ORE is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation. (GPL v2)
 *
 * ORE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with the ORE; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
|
|
|
|
#ifndef __ORE_H__
#define __ORE_H__

#include <scsi/osd_initiator.h>
#include <scsi/osd_attributes.h>
#include <scsi/osd_sec.h>
#include <linux/pnfs_osd_xdr.h>

#include <linux/bug.h>
|
|
|
struct ore_comp {
|
|
|
|
struct osd_obj_id obj;
|
|
|
|
u8 cred[OSD_CAP_LEN];
|
|
|
|
};
|
|
|
|
|
|
|
|
struct ore_layout {
|
|
|
|
/* Our way of looking at the data_map */
|
2011-08-11 01:15:02 +04:00
|
|
|
enum pnfs_osd_raid_algorithm4
|
|
|
|
raid_algorithm;
|
2011-08-07 06:26:31 +04:00
|
|
|
unsigned stripe_unit;
|
|
|
|
unsigned mirrors_p1;
|
|
|
|
|
|
|
|
unsigned group_width;
|
2011-10-12 20:42:22 +04:00
|
|
|
unsigned parity;
|
2011-08-07 06:26:31 +04:00
|
|
|
u64 group_depth;
|
|
|
|
unsigned group_count;
|
2011-09-28 14:18:45 +04:00
|
|
|
|
|
|
|
/* Cached often needed calculations filled in by
|
|
|
|
* ore_verify_layout
|
|
|
|
*/
|
|
|
|
unsigned long max_io_length; /* Max length that should be passed to
|
|
|
|
* ore_get_rw_state
|
|
|
|
*/
|
2011-08-07 06:26:31 +04:00
|
|
|
};
|
|
|
|
|
2011-09-28 15:43:09 +04:00
|
|
|
/* Base device descriptor. Users typically embed this in a larger
 * per-device structure (see the comment at ore_components::ods).
 */
struct ore_dev {
	struct osd_dev *od;
};
|
|
|
|
|
2011-08-07 06:26:31 +04:00
|
|
|
/* The set of device components an IO operates on. */
struct ore_components {
	unsigned	first_dev;		/* First logical device no    */
	unsigned	numdevs;		/* Num of devices in array    */
	/* If @single_comp == EC_SINGLE_COMP, @comps points to a single
	 * component. else there are @numdevs components
	 */
	enum EC_COMP_USAGE {
		EC_SINGLE_COMP = 0, EC_MULTPLE_COMPS = 0xffffffff
	} single_comp;
	struct ore_comp	*comps;

	/* Array of pointers to ore_dev-* . User will usually have these pointed
	 * too a bigger struct which contain an "ore_dev ored" member and use
	 * container_of(oc->ods[i], struct foo_dev, ored) to access the bigger
	 * structure.
	 */
	struct ore_dev	**ods;
};
|
|
|
|
|
2011-09-28 15:43:09 +04:00
|
|
|
/* ore_comp_dev Recievies a logical device index */
|
|
|
|
static inline struct osd_dev *ore_comp_dev(
|
|
|
|
const struct ore_components *oc, unsigned i)
|
|
|
|
{
|
2011-09-28 13:04:23 +04:00
|
|
|
BUG_ON((i < oc->first_dev) || (oc->first_dev + oc->numdevs <= i));
|
|
|
|
return oc->ods[i - oc->first_dev]->od;
|
2011-09-28 15:43:09 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void ore_comp_set_dev(
|
|
|
|
struct ore_components *oc, unsigned i, struct osd_dev *od)
|
|
|
|
{
|
2011-09-28 13:04:23 +04:00
|
|
|
oc->ods[i - oc->first_dev]->od = od;
|
2011-09-28 15:43:09 +04:00
|
|
|
}
|
|
|
|
|
2011-08-11 01:17:28 +04:00
|
|
|
struct ore_striping_info {
|
2011-10-12 20:42:22 +04:00
|
|
|
u64 offset;
|
2011-08-11 01:17:28 +04:00
|
|
|
u64 obj_offset;
|
2011-10-12 20:42:22 +04:00
|
|
|
u64 length;
|
|
|
|
u64 first_stripe_start; /* only used in raid writes */
|
2011-08-11 01:17:28 +04:00
|
|
|
u64 M; /* for truncate */
|
2011-10-12 20:42:22 +04:00
|
|
|
unsigned bytes_in_stripe;
|
2011-08-11 01:17:28 +04:00
|
|
|
unsigned dev;
|
2011-10-12 20:42:22 +04:00
|
|
|
unsigned par_dev;
|
2011-08-11 01:17:28 +04:00
|
|
|
unsigned unit_off;
|
ore: RAID5 Write
This is finally the RAID5 Write support.
The bigger part of this patch is not the XOR engine itself, But the
read4write logic, which is a complete mini prepare_for_striping
reading engine that can read scattered pages of a stripe into cache
so it can be used for XOR calculation. That is, if the write was not
stripe aligned.
The main algorithm behind the XOR engine is the 2 dimensional array:
struct __stripe_pages_2d.
A drawing might save 1000 words
---
__stripe_pages_2d
|
n = pages_in_stripe_unit;
w = group_width - parity;
| pages array presented to the XOR lib
| |
V |
__1_page_stripe[0].pages --> [c0][c1]..[cw][c_par] <---|
| |
__1_page_stripe[1].pages --> [c0][c1]..[cw][c_par] <---
|
... | ...
|
__1_page_stripe[n].pages --> [c0][c1]..[cw][c_par]
^
|
data added columns first then row
---
The pages are put on this array columns first. .i.e:
p0-of-c0, p1-of-c0, ... pn-of-c0, p0-of-c1, ...
So we are doing a corner turn of the pages.
Note that pages will zigzag down and left. but are put sequentially
in growing order. So when the time comes to XOR the stripe, only the
beginning and end of the array need be checked. We scan the array
and any NULL spot will be field by pages-to-be-read.
The FS that wants to support RAID5 needs to supply an
operations-vector that searches a given page in cache, and specifies
if the page is uptodate or need reading. All these pages to be read
are put on a slave ore_io_state and synchronously read. All the pages
of a stripe are read in one IO, using the scatter gather mechanism.
In write we constrain our IO to only be incomplete on a single
stripe. Meaning either the complete IO is within a single stripe so
we might have pages to read from both beginning or end of the
strip. Or we have some reading to do at beginning but end at strip
boundary. The left over pages are pushed to the next IO by the API
already established by previous work, where an IO offset/length
combination presented to the ORE might get the length truncated and
the user must re-submit the leftover pages. (Both exofs and NFS
support this)
But any ORE user should make it's best effort to align it's IO
before hand and avoid complications. A cached ore_layout->stripe_size
member can be used for that calculation. (NOTE: that ORE demands
that stripe_size may not be bigger then 32bit)
What else? Well read it and tell me.
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
2011-10-14 17:33:51 +04:00
|
|
|
unsigned cur_pg;
|
2011-10-12 20:42:22 +04:00
|
|
|
unsigned cur_comp;
|
2013-11-21 19:58:08 +04:00
|
|
|
unsigned maxdevUnits;
|
2011-08-11 01:17:28 +04:00
|
|
|
};
|
|
|
|
|
2011-08-07 06:26:31 +04:00
|
|
|
struct ore_io_state;
|
|
|
|
typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private);
|
ore: RAID5 Write
This is finally the RAID5 Write support.
The bigger part of this patch is not the XOR engine itself, But the
read4write logic, which is a complete mini prepare_for_striping
reading engine that can read scattered pages of a stripe into cache
so it can be used for XOR calculation. That is, if the write was not
stripe aligned.
The main algorithm behind the XOR engine is the 2 dimensional array:
struct __stripe_pages_2d.
A drawing might save 1000 words
---
__stripe_pages_2d
|
n = pages_in_stripe_unit;
w = group_width - parity;
| pages array presented to the XOR lib
| |
V |
__1_page_stripe[0].pages --> [c0][c1]..[cw][c_par] <---|
| |
__1_page_stripe[1].pages --> [c0][c1]..[cw][c_par] <---
|
... | ...
|
__1_page_stripe[n].pages --> [c0][c1]..[cw][c_par]
^
|
data added columns first then row
---
The pages are put on this array columns first. .i.e:
p0-of-c0, p1-of-c0, ... pn-of-c0, p0-of-c1, ...
So we are doing a corner turn of the pages.
Note that pages will zigzag down and left. but are put sequentially
in growing order. So when the time comes to XOR the stripe, only the
beginning and end of the array need be checked. We scan the array
and any NULL spot will be field by pages-to-be-read.
The FS that wants to support RAID5 needs to supply an
operations-vector that searches a given page in cache, and specifies
if the page is uptodate or need reading. All these pages to be read
are put on a slave ore_io_state and synchronously read. All the pages
of a stripe are read in one IO, using the scatter gather mechanism.
In write we constrain our IO to only be incomplete on a single
stripe. Meaning either the complete IO is within a single stripe so
we might have pages to read from both beginning or end of the
strip. Or we have some reading to do at beginning but end at strip
boundary. The left over pages are pushed to the next IO by the API
already established by previous work, where an IO offset/length
combination presented to the ORE might get the length truncated and
the user must re-submit the leftover pages. (Both exofs and NFS
support this)
But any ORE user should make it's best effort to align it's IO
before hand and avoid complications. A cached ore_layout->stripe_size
member can be used for that calculation. (NOTE: that ORE demands
that stripe_size may not be bigger then 32bit)
What else? Well read it and tell me.
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
2011-10-14 17:33:51 +04:00
|
|
|
struct _ore_r4w_op {
|
|
|
|
/* @Priv given here is passed ios->private */
|
|
|
|
struct page * (*get_page)(void *priv, u64 page_index, bool *uptodate);
|
|
|
|
void (*put_page)(void *priv, struct page *page);
|
|
|
|
};
|
2011-08-07 06:26:31 +04:00
|
|
|
|
|
|
|
struct ore_io_state {
|
|
|
|
struct kref kref;
|
2011-10-02 17:32:50 +04:00
|
|
|
struct ore_striping_info si;
|
2011-08-07 06:26:31 +04:00
|
|
|
|
|
|
|
void *private;
|
|
|
|
ore_io_done_fn done;
|
|
|
|
|
|
|
|
struct ore_layout *layout;
|
2011-09-28 12:39:59 +04:00
|
|
|
struct ore_components *oc;
|
2011-08-07 06:26:31 +04:00
|
|
|
|
|
|
|
/* Global read/write IO*/
|
|
|
|
loff_t offset;
|
|
|
|
unsigned long length;
|
|
|
|
void *kern_buff;
|
|
|
|
|
|
|
|
struct page **pages;
|
|
|
|
unsigned nr_pages;
|
|
|
|
unsigned pgbase;
|
|
|
|
unsigned pages_consumed;
|
|
|
|
|
|
|
|
/* Attributes */
|
|
|
|
unsigned in_attr_len;
|
|
|
|
struct osd_attr *in_attr;
|
|
|
|
unsigned out_attr_len;
|
|
|
|
struct osd_attr *out_attr;
|
|
|
|
|
|
|
|
bool reading;
|
|
|
|
|
2011-10-12 20:42:22 +04:00
|
|
|
/* House keeping of Parity pages */
|
|
|
|
bool extra_part_alloc;
|
|
|
|
struct page **parity_pages;
|
|
|
|
unsigned max_par_pages;
|
|
|
|
unsigned cur_par_page;
|
|
|
|
unsigned sgs_per_dev;
|
ore: RAID5 Write
This is finally the RAID5 Write support.
The bigger part of this patch is not the XOR engine itself, But the
read4write logic, which is a complete mini prepare_for_striping
reading engine that can read scattered pages of a stripe into cache
so it can be used for XOR calculation. That is, if the write was not
stripe aligned.
The main algorithm behind the XOR engine is the 2 dimensional array:
struct __stripe_pages_2d.
A drawing might save 1000 words
---
__stripe_pages_2d
|
n = pages_in_stripe_unit;
w = group_width - parity;
| pages array presented to the XOR lib
| |
V |
__1_page_stripe[0].pages --> [c0][c1]..[cw][c_par] <---|
| |
__1_page_stripe[1].pages --> [c0][c1]..[cw][c_par] <---
|
... | ...
|
__1_page_stripe[n].pages --> [c0][c1]..[cw][c_par]
^
|
data added columns first then row
---
The pages are put on this array columns first. .i.e:
p0-of-c0, p1-of-c0, ... pn-of-c0, p0-of-c1, ...
So we are doing a corner turn of the pages.
Note that pages will zigzag down and left. but are put sequentially
in growing order. So when the time comes to XOR the stripe, only the
beginning and end of the array need be checked. We scan the array
and any NULL spot will be field by pages-to-be-read.
The FS that wants to support RAID5 needs to supply an
operations-vector that searches a given page in cache, and specifies
if the page is uptodate or need reading. All these pages to be read
are put on a slave ore_io_state and synchronously read. All the pages
of a stripe are read in one IO, using the scatter gather mechanism.
In write we constrain our IO to only be incomplete on a single
stripe. Meaning either the complete IO is within a single stripe so
we might have pages to read from both beginning or end of the
strip. Or we have some reading to do at beginning but end at strip
boundary. The left over pages are pushed to the next IO by the API
already established by previous work, where an IO offset/length
combination presented to the ORE might get the length truncated and
the user must re-submit the leftover pages. (Both exofs and NFS
support this)
But any ORE user should make it's best effort to align it's IO
before hand and avoid complications. A cached ore_layout->stripe_size
member can be used for that calculation. (NOTE: that ORE demands
that stripe_size may not be bigger then 32bit)
What else? Well read it and tell me.
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
2011-10-14 17:33:51 +04:00
|
|
|
struct __stripe_pages_2d *sp2d;
|
|
|
|
struct ore_io_state *ios_read_4_write;
|
|
|
|
const struct _ore_r4w_op *r4w;
|
2011-10-12 20:42:22 +04:00
|
|
|
|
2011-08-07 06:26:31 +04:00
|
|
|
/* Variable array of size numdevs */
|
|
|
|
unsigned numdevs;
|
|
|
|
struct ore_per_dev_state {
|
|
|
|
struct osd_request *or;
|
|
|
|
struct bio *bio;
|
|
|
|
loff_t offset;
|
|
|
|
unsigned length;
|
2011-10-12 20:42:22 +04:00
|
|
|
unsigned last_sgs_total;
|
2011-08-07 06:26:31 +04:00
|
|
|
unsigned dev;
|
2011-10-12 20:42:22 +04:00
|
|
|
struct osd_sg_entry *sglist;
|
|
|
|
unsigned cur_sg;
|
2011-08-07 06:26:31 +04:00
|
|
|
} per_dev[];
|
|
|
|
};
|
|
|
|
|
|
|
|
static inline unsigned ore_io_state_size(unsigned numdevs)
|
|
|
|
{
|
|
|
|
return sizeof(struct ore_io_state) +
|
|
|
|
sizeof(struct ore_per_dev_state) * numdevs;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ore.c */
|
2011-09-28 14:18:45 +04:00
|
|
|
int ore_verify_layout(unsigned total_comps, struct ore_layout *layout);
|
2011-10-04 16:20:17 +04:00
|
|
|
void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
|
2011-10-12 20:42:22 +04:00
|
|
|
u64 length, struct ore_striping_info *si);
|
2011-08-07 06:26:31 +04:00
|
|
|
int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps,
|
|
|
|
bool is_reading, u64 offset, u64 length,
|
|
|
|
struct ore_io_state **ios);
|
|
|
|
int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps,
|
|
|
|
struct ore_io_state **ios);
|
|
|
|
void ore_put_io_state(struct ore_io_state *ios);
|
|
|
|
|
2011-09-28 14:25:50 +04:00
|
|
|
typedef void (*ore_on_dev_error)(struct ore_io_state *ios, struct ore_dev *od,
|
|
|
|
unsigned dev_index, enum osd_err_priority oep,
|
|
|
|
u64 dev_offset, u64 dev_len);
|
|
|
|
int ore_check_io(struct ore_io_state *ios, ore_on_dev_error rep);
|
2011-08-07 06:26:31 +04:00
|
|
|
|
|
|
|
int ore_create(struct ore_io_state *ios);
|
|
|
|
int ore_remove(struct ore_io_state *ios);
|
|
|
|
int ore_write(struct ore_io_state *ios);
|
|
|
|
int ore_read(struct ore_io_state *ios);
|
|
|
|
int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
|
|
|
|
u64 size);
|
|
|
|
|
|
|
|
int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr);
|
|
|
|
|
|
|
|
extern const struct osd_attr g_attr_logical_length;
|
|
|
|
|
|
|
|
#endif
|