linux-2.6/fs/exofs/ore_raid.h

/*
 * Copyright (C) from 2011
 * Boaz Harrosh <bharrosh@panasas.com>
 *
 * This file is part of the objects raid engine (ore).
 *
 * It is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 *
 * You should have received a copy of the GNU General Public License
 * along with "ore". If not, write to the Free Software Foundation, Inc:
 *	"Free Software Foundation <info@fsf.org>"
 */

#include <scsi/osd_ore.h>

/* Unconditional error report; every message is prefixed with "ore: ". */
#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)

/*
 * Debug trace. With CONFIG_EXOFS_DEBUG set, the message is printed together
 * with the calling function name and line number. Otherwise the printk() sits
 * behind "if (0)": nothing is printed, but the format string and arguments
 * are still seen by the compiler.
 */
#ifdef CONFIG_EXOFS_DEBUG
#define ORE_DBGMSG(fmt, a...) \
	printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
#else
#define ORE_DBGMSG(fmt, a...) \
	do { if (0) printk(fmt, ##a); } while (0)
#endif

/*
 * u64 has problems with printk; this casts it to unsigned long long.
 * The whole expansion is parenthesized so that the macro is a single closed
 * expression wherever it is used (e.g. "sizeof _LLU(v)"), not only as a
 * printk() argument.
 */
#define _LLU(x) ((unsigned long long)(x))

/*
 * Second, more verbose debug level. It expands to an empty statement; swap in
 * the commented-out definition below to route it through ORE_DBGMSG.
 */
#define ORE_DBGMSG2(M...) do {} while (0)
/* #define ORE_DBGMSG2 ORE_DBGMSG */

/*
 * Component order of @dev inside its stripe, i.e. the logical data unit
 * index of @dev given that @par_dev is the parity device of this stripe.
 *
 * @devs_in_group: number of devices in one group
 * @mirrors_p1:    divisor applied to the device distance (mirror count + 1,
 *                 going by the name)
 * @par_dev:       parity device index, on the same scale as @dev
 * @dev:           device index; it is rebased to the start of its own group
 *
 * Both indexes are first made relative to the first device of the group that
 * holds @dev. All arithmetic is unsigned and wraps exactly as written.
 */
static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1,
				  unsigned par_dev, unsigned dev)
{
	unsigned group_start = (dev / devs_in_group) * devs_in_group;
	unsigned dev_in_group = dev - group_start;
	unsigned par_in_group = par_dev - group_start;
	unsigned rotated;

	/* The raid 0 case: group-relative parity index equals the group size */
	if (par_in_group == devs_in_group)
		return dev_in_group / mirrors_p1;

	/*
	 * raid4/5/6 case: the device @mirrors_p1 places after the parity
	 * device gets order 0, and the count wraps around the group.
	 */
	rotated = (dev_in_group + devs_in_group - par_in_group - mirrors_p1) %
		  devs_in_group;
	return rotated / mirrors_p1;
}

/*
 * ios_raid.c stuff needed by ios.c
 *
 * Declarations only; none of these functions is defined in this header.
 * NOTE(review): this header is ore_raid.h, so the file names above look
 * stale (presumably ore_raid.c / ore.c) - confirm before relying on them.
 */

/* Per-io_state raid setup/teardown; judging by the names they form a pair. */
int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
void _ore_free_raid_stuff(struct ore_io_state *ios);

/*
 * Presumably records a scatter-gather segment of @cur_len on @per_dev, with
 * @not_last telling whether more segments follow - TODO confirm against the
 * definition.
 */
void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
		 bool not_last);
/*
 * Presumably adds a parity unit of @cur_len to @per_dev for the stripe
 * described by @si. Returns int; assumed 0 / negative errno - TODO confirm.
 */
int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
		     struct ore_per_dev_state *per_dev, unsigned cur_len);
/*
 * Out-of-line half of _add_stripe_page(). Only declared here; its body is
 * not part of this header.
 */
void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
			  struct ore_striping_info *si, struct page *page);

/*
 * Inline front end for _ore_add_stripe_page(). The NULL test on @sp2d is done
 * at the call site, so IO that carries no raid state (@sp2d == NULL) returns
 * without a function call. @si and @page are passed through unchanged.
 */
static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
				    struct ore_striping_info *si,
				    struct page *page)
{
	if (sp2d)
		_ore_add_stripe_page(sp2d, si, page);
}

/*
 * ios.c stuff needed by ios_raid.c
 *
 * Declarations only; none of these functions is defined in this header.
 * NOTE(review): this header is ore_raid.h, so the file names above look
 * stale (presumably ore.c / ore_raid.c) - confirm before relying on them.
 */

/*
 * Hands back an io_state through @pios (an out-parameter, going by the
 * double pointer). The sizing arguments @numdevs, @sgs_per_dev and
 * @num_par_pages presumably drive the allocation - TODO confirm.
 */
int  _ore_get_io_state(struct ore_layout *layout,
			struct ore_components *oc, unsigned numdevs,
			unsigned sgs_per_dev, unsigned num_par_pages,
			struct ore_io_state **pios);
/*
 * Takes @cur_pg by pointer, so the callee can update the caller's page
 * cursor. Presumably queues @cur_len bytes from @pages, starting at @pgbase,
 * onto @per_dev - TODO confirm against the definition.
 */
int _ore_add_stripe_unit(struct ore_io_state *ios,  unsigned *cur_pg,
		unsigned pgbase, struct page **pages,
		struct ore_per_dev_state *per_dev, int cur_len);
/* Presumably issues the read for component @cur_comp of @ios - TODO confirm. */
int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp);
/* Presumably submits the prepared @ios for execution - TODO confirm. */
int ore_io_execute(struct ore_io_state *ios);
ore: RAID5 read This patch introduces the first stage of RAID5 support, mainly the skip-over-raid-units when reading. For writes it inserts BLANK units, into where XOR blocks should be calculated and written to. It introduces the new "general raid maths", and the main additional parameters and components needed for raid5. Since at this stage it could corrupt future versions that actually do support raid5, the enablement of raid5 mounting and setting of parity-count > 0 is disabled, so the raid5 code will never be used. Mounting of raid5 is only enabled later once the basic XOR write is also in. But if the patch "enable RAID5" is applied this code has been tested to be able to properly read raid5 volumes and is according to standard. Also it has been tested that the new maths still properly supports RAID0 and grouping code just as before. (BTW: I have found more bugs in the pnfs-obj RAID math fixed here) The ore.c file is getting too big, so new ore_raid.[hc] files are added that will include the special raid stuff that is not used in striping and mirrors. In future write support these will get bigger. When adding the ore_raid.c to Kbuild file I was forced to rename ore.ko to libore.ko. Is it possible to keep source file, say ore.c and module file ore.ko the same even if there are multiple files inside ore.ko? Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> 2011-10-12 16:42:22 +00:00			`/*`
			`* Copyright (C) from 2011`
			`* Boaz Harrosh <bharrosh@panasas.com>`
			`*`
			`* This file is part of the objects raid engine (ore).`
			`*`
			`* It is free software; you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License version 2 as published`
			`* by the Free Software Foundation.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with "ore". If not, write to the Free Software Foundation, Inc:`
			`* "Free Software Foundation <info@fsf.org>"`
			`*/`

			`#include <scsi/osd_ore.h>`

			`#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)`

			`#ifdef CONFIG_EXOFS_DEBUG`
			`#define ORE_DBGMSG(fmt, a...) \`
			`printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)`
			`#else`
			`#define ORE_DBGMSG(fmt, a...) \`
			`do { if (0) printk(fmt, ##a); } while (0)`
			`#endif`

			`/* u64 has problems with printk this will cast it to unsigned long long */`
			`#define _LLU(x) (unsigned long long)(x)`

			`#define ORE_DBGMSG2(M...) do {} while (0)`
			`/* #define ORE_DBGMSG2 ORE_DBGMSG */`

			`/* Calculate the component order in a stripe. eg the logical data unit`
			`* address within the stripe of @dev given the @par_dev of this stripe.`
			`*/`
			`static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1,`
			`unsigned par_dev, unsigned dev)`
			`{`
			`unsigned first_dev = dev - dev % devs_in_group;`

			`dev -= first_dev;`
			`par_dev -= first_dev;`

			`if (devs_in_group == par_dev) /* The raid 0 case */`
			`return dev / mirrors_p1;`
			`/* raid4/5/6 case */`
			`return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) /`
			`mirrors_p1;`
			`}`

			`/* ios_raid.c stuff needed by ios.c */`
			`int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);`
			`void _ore_free_raid_stuff(struct ore_io_state *ios);`

			`void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,`
			`bool not_last);`
			`int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,`
			`struct ore_per_dev_state *per_dev, unsigned cur_len);`
ore: RAID5 Write This is finally the RAID5 Write support. The bigger part of this patch is not the XOR engine itself, but the read4write logic, which is a complete mini prepare_for_striping reading engine that can read scattered pages of a stripe into cache so it can be used for XOR calculation. That is, if the write was not stripe aligned. The main algorithm behind the XOR engine is the 2 dimensional array: struct __stripe_pages_2d. A drawing might save 1000 words --- __stripe_pages_2d \| n = pages_in_stripe_unit; w = group_width - parity; \| pages array presented to the XOR lib \| \| V \| __1_page_stripe[0].pages --> [c0][c1]..[cw][c_par] <---\| \| \| __1_page_stripe[1].pages --> [c0][c1]..[cw][c_par] <--- \| ... \| ... \| __1_page_stripe[n].pages --> [c0][c1]..[cw][c_par] ^ \| data added columns first then row --- The pages are put on this array columns first, i.e.: p0-of-c0, p1-of-c0, ... pn-of-c0, p0-of-c1, ... So we are doing a corner turn of the pages. Note that pages will zigzag down and left, but are put sequentially in growing order. So when the time comes to XOR the stripe, only the beginning and end of the array need be checked. We scan the array and any NULL spot will be filled by pages-to-be-read. The FS that wants to support RAID5 needs to supply an operations-vector that searches a given page in cache, and specifies if the page is uptodate or needs reading. All these pages to be read are put on a slave ore_io_state and synchronously read. All the pages of a stripe are read in one IO, using the scatter gather mechanism. In write we constrain our IO to only be incomplete on a single stripe. Meaning either the complete IO is within a single stripe so we might have pages to read from both beginning or end of the stripe. Or we have some reading to do at beginning but end at stripe boundary. 
The leftover pages are pushed to the next IO by the API already established by previous work, where an IO offset/length combination presented to the ORE might get the length truncated and the user must re-submit the leftover pages. (Both exofs and NFS support this) But any ORE user should make its best effort to align its IO beforehand and avoid complications. A cached ore_layout->stripe_size member can be used for that calculation. (NOTE: that ORE demands that stripe_size may not be bigger than 32bit) What else? Well read it and tell me. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> 2011-10-14 13:33:51 +00:00			`void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,`
			`struct ore_striping_info *si, struct page *page);`
			`static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,`
			`struct ore_striping_info *si, struct page *page)`
			`{`
			`if (!sp2d) /* Inline the fast path */`
			`return; /* Hay no raid stuff */`
			`_ore_add_stripe_page(sp2d, si, page);`
			`}`
ore: RAID5 read This patch introduces the first stage of RAID5 support, mainly the skip-over-raid-units when reading. For writes it inserts BLANK units, into where XOR blocks should be calculated and written to. It introduces the new "general raid maths", and the main additional parameters and components needed for raid5. Since at this stage it could corrupt future versions that actually do support raid5, the enablement of raid5 mounting and setting of parity-count > 0 is disabled, so the raid5 code will never be used. Mounting of raid5 is only enabled later once the basic XOR write is also in. But if the patch "enable RAID5" is applied this code has been tested to be able to properly read raid5 volumes and is according to standard. Also it has been tested that the new maths still properly supports RAID0 and grouping code just as before. (BTW: I have found more bugs in the pnfs-obj RAID math fixed here) The ore.c file is getting too big, so new ore_raid.[hc] files are added that will include the special raid stuff that is not used in striping and mirrors. In future write support these will get bigger. When adding the ore_raid.c to Kbuild file I was forced to rename ore.ko to libore.ko. Is it possible to keep source file, say ore.c and module file ore.ko the same even if there are multiple files inside ore.ko? Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> 2011-10-12 16:42:22 +00:00
			`/* ios.c stuff needed by ios_raid.c */`
ore: RAID5 Write This is finally the RAID5 Write support. The bigger part of this patch is not the XOR engine itself, but the read4write logic, which is a complete mini prepare_for_striping reading engine that can read scattered pages of a stripe into cache so it can be used for XOR calculation. That is, if the write was not stripe aligned. The main algorithm behind the XOR engine is the 2 dimensional array: struct __stripe_pages_2d. A drawing might save 1000 words --- __stripe_pages_2d \| n = pages_in_stripe_unit; w = group_width - parity; \| pages array presented to the XOR lib \| \| V \| __1_page_stripe[0].pages --> [c0][c1]..[cw][c_par] <---\| \| \| __1_page_stripe[1].pages --> [c0][c1]..[cw][c_par] <--- \| ... \| ... \| __1_page_stripe[n].pages --> [c0][c1]..[cw][c_par] ^ \| data added columns first then row --- The pages are put on this array columns first, i.e.: p0-of-c0, p1-of-c0, ... pn-of-c0, p0-of-c1, ... So we are doing a corner turn of the pages. Note that pages will zigzag down and left, but are put sequentially in growing order. So when the time comes to XOR the stripe, only the beginning and end of the array need be checked. We scan the array and any NULL spot will be filled by pages-to-be-read. The FS that wants to support RAID5 needs to supply an operations-vector that searches a given page in cache, and specifies if the page is uptodate or needs reading. All these pages to be read are put on a slave ore_io_state and synchronously read. All the pages of a stripe are read in one IO, using the scatter gather mechanism. In write we constrain our IO to only be incomplete on a single stripe. Meaning either the complete IO is within a single stripe so we might have pages to read from both beginning or end of the stripe. Or we have some reading to do at beginning but end at stripe boundary. 
The leftover pages are pushed to the next IO by the API already established by previous work, where an IO offset/length combination presented to the ORE might get the length truncated and the user must re-submit the leftover pages. (Both exofs and NFS support this) But any ORE user should make its best effort to align its IO beforehand and avoid complications. A cached ore_layout->stripe_size member can be used for that calculation. (NOTE: that ORE demands that stripe_size may not be bigger than 32bit) What else? Well read it and tell me. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> 2011-10-14 13:33:51 +00:00			`int _ore_get_io_state(struct ore_layout *layout,`
			`struct ore_components *oc, unsigned numdevs,`
			`unsigned sgs_per_dev, unsigned num_par_pages,`
			`struct ore_io_state **pios);`
ore: RAID5 read This patch introduces the first stage of RAID5 support, mainly the skip-over-raid-units when reading. For writes it inserts BLANK units, into where XOR blocks should be calculated and written to. It introduces the new "general raid maths", and the main additional parameters and components needed for raid5. Since at this stage it could corrupt future versions that actually do support raid5, the enablement of raid5 mounting and setting of parity-count > 0 is disabled, so the raid5 code will never be used. Mounting of raid5 is only enabled later once the basic XOR write is also in. But if the patch "enable RAID5" is applied this code has been tested to be able to properly read raid5 volumes and is according to standard. Also it has been tested that the new maths still properly supports RAID0 and grouping code just as before. (BTW: I have found more bugs in the pnfs-obj RAID math fixed here) The ore.c file is getting too big, so new ore_raid.[hc] files are added that will include the special raid stuff that is not used in striping and mirrors. In future write support these will get bigger. When adding the ore_raid.c to Kbuild file I was forced to rename ore.ko to libore.ko. Is it possible to keep source file, say ore.c and module file ore.ko the same even if there are multiple files inside ore.ko? Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> 2011-10-12 16:42:22 +00:00			`int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,`
			`unsigned pgbase, struct page **pages,`
			`struct ore_per_dev_state *per_dev, int cur_len);`
ore: RAID5 Write This is finally the RAID5 Write support. The bigger part of this patch is not the XOR engine itself, but the read4write logic, which is a complete mini prepare_for_striping reading engine that can read scattered pages of a stripe into cache so it can be used for XOR calculation. That is, if the write was not stripe aligned. The main algorithm behind the XOR engine is the 2 dimensional array: struct __stripe_pages_2d. A drawing might save 1000 words --- __stripe_pages_2d \| n = pages_in_stripe_unit; w = group_width - parity; \| pages array presented to the XOR lib \| \| V \| __1_page_stripe[0].pages --> [c0][c1]..[cw][c_par] <---\| \| \| __1_page_stripe[1].pages --> [c0][c1]..[cw][c_par] <--- \| ... \| ... \| __1_page_stripe[n].pages --> [c0][c1]..[cw][c_par] ^ \| data added columns first then row --- The pages are put on this array columns first, i.e.: p0-of-c0, p1-of-c0, ... pn-of-c0, p0-of-c1, ... So we are doing a corner turn of the pages. Note that pages will zigzag down and left, but are put sequentially in growing order. So when the time comes to XOR the stripe, only the beginning and end of the array need be checked. We scan the array and any NULL spot will be filled by pages-to-be-read. The FS that wants to support RAID5 needs to supply an operations-vector that searches a given page in cache, and specifies if the page is uptodate or needs reading. All these pages to be read are put on a slave ore_io_state and synchronously read. All the pages of a stripe are read in one IO, using the scatter gather mechanism. In write we constrain our IO to only be incomplete on a single stripe. Meaning either the complete IO is within a single stripe so we might have pages to read from both beginning or end of the stripe. Or we have some reading to do at beginning but end at stripe boundary. 
The leftover pages are pushed to the next IO by the API already established by previous work, where an IO offset/length combination presented to the ORE might get the length truncated and the user must re-submit the leftover pages. (Both exofs and NFS support this) But any ORE user should make its best effort to align its IO beforehand and avoid complications. A cached ore_layout->stripe_size member can be used for that calculation. (NOTE: that ORE demands that stripe_size may not be bigger than 32bit) What else? Well read it and tell me. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> 2011-10-14 13:33:51 +00:00			`int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp);`
			`int ore_io_execute(struct ore_io_state *ios);`