From b5e1b8cee7ad58a15d2fa79bcd7946acb592602d Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@kernel.org>
Date: Mon, 21 May 2012 09:26:59 +1000
Subject: [PATCH 01/56] md: using GFP_NOIO to allocate bio for flush request

A flush request is usually issued in transaction commit code path, so
using GFP_KERNEL to allocate memory for flush request bio falls into
the classic deadlock issue.

This is suitable for any -stable kernel to which it applies as it
avoids a possible deadlock.

Cc: stable@vger.kernel.org
Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 01233d855eb..2b30ffdb81b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -452,7 +452,7 @@ static void submit_flushes(struct work_struct *ws)
 			atomic_inc(&rdev->nr_pending);
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
-			bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev);
+			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
 			bi->bi_end_io = md_end_flush;
 			bi->bi_private = rdev;
 			bi->bi_bdev = rdev->bdev;

From 2c810cddc44d6f95cef75df3f07fc0850ff92417 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 21 May 2012 09:27:00 +1000
Subject: [PATCH 02/56] md: allow a reshape operation to be reversed.

Currently a reshape operation always progresses from the start
of the array to the end unless the number of devices is being
reduced, in which case it progressed in the opposite direction.

To reverse a partial reshape which changes the number of devices
you can stop the array and re-assemble with the raid-disks numbers
reversed and it will undo.

However for a reshape that does not change the number of devices
it is not possible to reverse the reshape in the middle - you have to
wait until it completes.

So add a 'reshape_direction' attribute with is either 'forwards' or
'backwards' and can be explicitly set when delta_disks is zero.

This will become more important when we allow the data_offset to
change in a reshape.  Then the explicit statement of what direction is
being used will be more useful.

This can be enabled in raid5 trivially as it already supports
reverse reshape and just needs to use a different trigger to request it.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c           | 67 +++++++++++++++++++++++++++++++++++++--
 drivers/md/md.h           |  1 +
 drivers/md/raid5.c        | 23 +++++++-------
 include/linux/raid/md_p.h |  7 +++-
 4 files changed, 84 insertions(+), 14 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2b30ffdb81b..44bb1d52dd4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -607,6 +607,7 @@ void mddev_init(struct mddev *mddev)
 	init_waitqueue_head(&mddev->sb_wait);
 	init_waitqueue_head(&mddev->recovery_wait);
 	mddev->reshape_position = MaxSector;
+	mddev->reshape_backwards = 0;
 	mddev->resync_min = 0;
 	mddev->resync_max = MaxSector;
 	mddev->level = LEVEL_NONE;
@@ -1185,6 +1186,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
 		mddev->events = ev1;
 		mddev->bitmap_info.offset = 0;
 		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
+		mddev->reshape_backwards = 0;
 
 		if (mddev->minor_version >= 91) {
 			mddev->reshape_position = sb->reshape_position;
@@ -1192,6 +1194,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
 			mddev->new_level = sb->new_level;
 			mddev->new_layout = sb->new_layout;
 			mddev->new_chunk_sectors = sb->new_chunk >> 9;
+			if (mddev->delta_disks < 0)
+				mddev->reshape_backwards = 1;
 		} else {
 			mddev->reshape_position = MaxSector;
 			mddev->delta_disks = 0;
@@ -1645,7 +1649,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 		mddev->events = ev1;
 		mddev->bitmap_info.offset = 0;
 		mddev->bitmap_info.default_offset = 1024 >> 9;
-		
+		mddev->reshape_backwards = 0;
+
 		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
 		memcpy(mddev->uuid, sb->set_uuid, 16);
 
@@ -1662,6 +1667,11 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 			mddev->new_level = le32_to_cpu(sb->new_level);
 			mddev->new_layout = le32_to_cpu(sb->new_layout);
 			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
+			if (mddev->delta_disks < 0 ||
+			    (mddev->delta_disks == 0 &&
+			     (le32_to_cpu(sb->feature_map)
+			      & MD_FEATURE_RESHAPE_BACKWARDS)))
+				mddev->reshape_backwards = 1;
 		} else {
 			mddev->reshape_position = MaxSector;
 			mddev->delta_disks = 0;
@@ -1781,6 +1791,10 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
 		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
 		sb->new_level = cpu_to_le32(mddev->new_level);
 		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
+		if (mddev->delta_disks == 0 &&
+		    mddev->reshape_backwards)
+			sb->feature_map
+				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
 	}
 
 	if (rdev->badblocks.count == 0)
@@ -3419,6 +3433,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 		mddev->new_chunk_sectors = mddev->chunk_sectors;
 		mddev->raid_disks -= mddev->delta_disks;
 		mddev->delta_disks = 0;
+		mddev->reshape_backwards = 0;
 		module_put(pers->owner);
 		printk(KERN_WARNING "md: %s: %s would not accept array\n",
 		       mdname(mddev), clevel);
@@ -3492,6 +3507,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 	mddev->layout = mddev->new_layout;
 	mddev->chunk_sectors = mddev->new_chunk_sectors;
 	mddev->delta_disks = 0;
+	mddev->reshape_backwards = 0;
 	mddev->degraded = 0;
 	if (mddev->pers->sync_request == NULL) {
 		/* this is now an array without redundancy, so
@@ -3585,6 +3601,7 @@ raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
 		int olddisks = mddev->raid_disks - mddev->delta_disks;
 		mddev->delta_disks = n - olddisks;
 		mddev->raid_disks = n;
+		mddev->reshape_backwards = (mddev->delta_disks < 0);
 	} else
 		mddev->raid_disks = n;
 	return rv ? rv : len;
@@ -4436,6 +4453,7 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
 		return -EINVAL;
 	mddev->reshape_position = new;
 	mddev->delta_disks = 0;
+	mddev->reshape_backwards = 0;
 	mddev->new_level = mddev->level;
 	mddev->new_layout = mddev->layout;
 	mddev->new_chunk_sectors = mddev->chunk_sectors;
@@ -4446,6 +4464,42 @@ static struct md_sysfs_entry md_reshape_position =
 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
        reshape_position_store);
 
+static ssize_t
+reshape_direction_show(struct mddev *mddev, char *page)
+{
+	return sprintf(page, "%s\n",
+		       mddev->reshape_backwards ? "backwards" : "forwards");
+}
+
+static ssize_t
+reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
+{
+	int backwards = 0;
+	if (cmd_match(buf, "forwards"))
+		backwards = 0;
+	else if (cmd_match(buf, "backwards"))
+		backwards = 1;
+	else
+		return -EINVAL;
+	if (mddev->reshape_backwards == backwards)
+		return len;
+
+	/* check if we are allowed to change */
+	if (mddev->delta_disks)
+		return -EBUSY;
+
+	if (mddev->persistent &&
+	    mddev->major_version == 0)
+		return -EINVAL;
+
+	mddev->reshape_backwards = backwards;
+	return len;
+}
+
+static struct md_sysfs_entry md_reshape_direction =
+__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
+       reshape_direction_store);
+
 static ssize_t
 array_size_show(struct mddev *mddev, char *page)
 {
@@ -4501,6 +4555,7 @@ static struct attribute *md_default_attrs[] = {
 	&md_safe_delay.attr,
 	&md_array_state.attr,
 	&md_reshape_position.attr,
+	&md_reshape_direction.attr,
 	&md_array_size.attr,
 	&max_corr_read_errors.attr,
 	NULL,
@@ -5064,6 +5119,7 @@ static void md_clean(struct mddev *mddev)
 	mddev->events = 0;
 	mddev->can_decrease_events = 0;
 	mddev->delta_disks = 0;
+	mddev->reshape_backwards = 0;
 	mddev->new_level = LEVEL_NONE;
 	mddev->new_layout = 0;
 	mddev->new_chunk_sectors = 0;
@@ -5888,6 +5944,7 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info)
 	mddev->new_chunk_sectors = mddev->chunk_sectors;
 	mddev->new_layout = mddev->layout;
 	mddev->delta_disks = 0;
+	mddev->reshape_backwards = 0;
 
 	return 0;
 }
@@ -5953,10 +6010,16 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
 	if (mddev->sync_thread || mddev->reshape_position != MaxSector)
 		return -EBUSY;
 	mddev->delta_disks = raid_disks - mddev->raid_disks;
+	if (mddev->delta_disks < 0)
+		mddev->reshape_backwards = 1;
+	else if (mddev->delta_disks > 0)
+		mddev->reshape_backwards = 0;
 
 	rv = mddev->pers->check_reshape(mddev);
-	if (rv < 0)
+	if (rv < 0) {
 		mddev->delta_disks = 0;
+		mddev->reshape_backwards = 0;
+	}
 	return rv;
 }
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1c2063ccf48..d51c0ca3777 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -262,6 +262,7 @@ struct mddev {
 	sector_t			reshape_position;
 	int				delta_disks, new_level, new_layout;
 	int				new_chunk_sectors;
+	int				reshape_backwards;
 
 	atomic_t			plug_cnt;	/* If device is expecting
 							 * more bios soon.
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f351422938e..0abbd3447cf 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3970,13 +3970,13 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 			 * to check again.
 			 */
 			spin_lock_irq(&conf->device_lock);
-			if (mddev->delta_disks < 0
+			if (mddev->reshape_backwards
 			    ? logical_sector < conf->reshape_progress
 			    : logical_sector >= conf->reshape_progress) {
 				disks = conf->previous_raid_disks;
 				previous = 1;
 			} else {
-				if (mddev->delta_disks < 0
+				if (mddev->reshape_backwards
 				    ? logical_sector < conf->reshape_safe
 				    : logical_sector >= conf->reshape_safe) {
 					spin_unlock_irq(&conf->device_lock);
@@ -4009,7 +4009,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 				 */
 				int must_retry = 0;
 				spin_lock_irq(&conf->device_lock);
-				if (mddev->delta_disks < 0
+				if (mddev->reshape_backwards
 				    ? logical_sector >= conf->reshape_progress
 				    : logical_sector < conf->reshape_progress)
 					/* mismatch, need to try again */
@@ -4108,11 +4108,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 
 	if (sector_nr == 0) {
 		/* If restarting in the middle, skip the initial sectors */
-		if (mddev->delta_disks < 0 &&
+		if (mddev->reshape_backwards &&
 		    conf->reshape_progress < raid5_size(mddev, 0, 0)) {
 			sector_nr = raid5_size(mddev, 0, 0)
 				- conf->reshape_progress;
-		} else if (mddev->delta_disks >= 0 &&
+		} else if (!mddev->reshape_backwards &&
 			   conf->reshape_progress > 0)
 			sector_nr = conf->reshape_progress;
 		sector_div(sector_nr, new_data_disks);
@@ -4147,7 +4147,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 	sector_div(readpos, data_disks);
 	safepos = conf->reshape_safe;
 	sector_div(safepos, data_disks);
-	if (mddev->delta_disks < 0) {
+	if (mddev->reshape_backwards) {
 		writepos -= min_t(sector_t, reshape_sectors, writepos);
 		readpos += reshape_sectors;
 		safepos += reshape_sectors;
@@ -4174,7 +4174,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 	 * Maybe that number should be configurable, but I'm not sure it is
 	 * worth it.... maybe it could be a multiple of safemode_delay???
 	 */
-	if ((mddev->delta_disks < 0
+	if ((mddev->reshape_backwards
 	     ? (safepos > writepos && readpos < writepos)
 	     : (safepos < writepos && readpos > writepos)) ||
 	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
@@ -4195,7 +4195,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	}
 
-	if (mddev->delta_disks < 0) {
+	if (mddev->reshape_backwards) {
 		BUG_ON(conf->reshape_progress == 0);
 		stripe_addr = writepos;
 		BUG_ON((mddev->dev_sectors &
@@ -4239,7 +4239,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 		list_add(&sh->lru, &stripes);
 	}
 	spin_lock_irq(&conf->device_lock);
-	if (mddev->delta_disks < 0)
+	if (mddev->reshape_backwards)
 		conf->reshape_progress -= reshape_sectors * new_data_disks;
 	else
 		conf->reshape_progress += reshape_sectors * new_data_disks;
@@ -5008,7 +5008,7 @@ static int run(struct mddev *mddev)
 				       mdname(mddev));
 				return -EINVAL;
 			}
-		} else if (mddev->delta_disks < 0
+		} else if (mddev->reshape_backwards
 		    ? (here_new * mddev->new_chunk_sectors <=
 		       here_old * mddev->chunk_sectors)
 		    : (here_new * mddev->new_chunk_sectors >=
@@ -5535,7 +5535,7 @@ static int raid5_start_reshape(struct mddev *mddev)
 	conf->chunk_sectors = mddev->new_chunk_sectors;
 	conf->prev_algo = conf->algorithm;
 	conf->algorithm = mddev->new_layout;
-	if (mddev->delta_disks < 0)
+	if (mddev->reshape_backwards)
 		conf->reshape_progress = raid5_size(mddev, 0, 0);
 	else
 		conf->reshape_progress = 0;
@@ -5663,6 +5663,7 @@ static void raid5_finish_reshape(struct mddev *mddev)
 		mddev->chunk_sectors = conf->chunk_sectors;
 		mddev->reshape_position = MaxSector;
 		mddev->delta_disks = 0;
+		mddev->reshape_backwards = 0;
 	}
 }
 
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index 8c0a3adc5df..07e05f92d05 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -281,10 +281,15 @@ struct mdp_superblock_1 {
 					    * active device with same 'role'.
 					    * 'recovery_offset' is also set.
 					    */
+#define	MD_FEATURE_RESHAPE_BACKWARDS	32 /* Reshape doesn't change number
+					    * of devices, but is going
+					    * backwards anyway.
+					    */
 #define	MD_FEATURE_ALL			(MD_FEATURE_BITMAP_OFFSET	\
 					|MD_FEATURE_RECOVERY_OFFSET	\
 					|MD_FEATURE_RESHAPE_ACTIVE	\
 					|MD_FEATURE_BAD_BLOCKS		\
-					|MD_FEATURE_REPLACEMENT)
+					|MD_FEATURE_REPLACEMENT		\
+					|MD_FEATURE_RESHAPE_BACKWARDS)
 
 #endif 

From c6563a8c38fde3c1c7fc925a10bde3ca20799301 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 21 May 2012 09:27:00 +1000
Subject: [PATCH 03/56] md: add possibility to change data-offset for devices.

When reshaping we can avoid costly intermediate backup by
changing the 'start' address of the array on the device
(if there is enough room).

So as a first step, allow such a change to be requested
through sysfs, and recorded in v1.x metadata.

(As we didn't previous check that all 'pad' fields were zero,
 we need a new FEATURE flag for this.
 A (belatedly) check that all remaining 'pad' fields are
 zero to avoid a repeat of this)

The new data offset must be requested separately for each device.
This allows each to have a different change in the data offset.
This is not likely to be used often but as data_offset can be
set per-device, new_data_offset should be too.

This patch also removes the 'acknowledged' arg to rdev_set_badblocks as
it is never used and never will be.  At the same time we add a new
arg ('in_new') which is currently always zero but will be used more
soon.

When a reshape finishes we will need to update the data_offset
and rdev->sectors.  So provide an exported function to do that.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c           | 217 ++++++++++++++++++++++++++++++++++----
 drivers/md/md.h           |   7 +-
 drivers/md/raid1.c        |   4 +-
 drivers/md/raid10.c       |   8 +-
 drivers/md/raid5.c        |  10 +-
 include/linux/raid/md_p.h |  10 +-
 6 files changed, 222 insertions(+), 34 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 44bb1d52dd4..9fa98fc74b0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1035,12 +1035,17 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
 struct super_type  {
 	char		    *name;
 	struct module	    *owner;
-	int		    (*load_super)(struct md_rdev *rdev, struct md_rdev *refdev,
+	int		    (*load_super)(struct md_rdev *rdev,
+					  struct md_rdev *refdev,
 					  int minor_version);
-	int		    (*validate_super)(struct mddev *mddev, struct md_rdev *rdev);
-	void		    (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
+	int		    (*validate_super)(struct mddev *mddev,
+					      struct md_rdev *rdev);
+	void		    (*sync_super)(struct mddev *mddev,
+					  struct md_rdev *rdev);
 	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
 						sector_t num_sectors);
+	int		    (*allow_new_offset)(struct md_rdev *rdev,
+						unsigned long long new_offset);
 };
 
 /*
@@ -1112,6 +1117,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
 
 	rdev->preferred_minor = sb->md_minor;
 	rdev->data_offset = 0;
+	rdev->new_data_offset = 0;
 	rdev->sb_size = MD_SB_BYTES;
 	rdev->badblocks.shift = -1;
 
@@ -1438,6 +1444,12 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
 	return num_sectors;
 }
 
+static int
+super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
+{
+	/* non-zero offset changes not possible with v0.90 */
+	return new_offset == 0;
+}
 
 /*
  * version 1 superblock
@@ -1473,6 +1485,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
 	struct mdp_superblock_1 *sb;
 	int ret;
 	sector_t sb_start;
+	sector_t sectors;
 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
 	int bmask;
 
@@ -1527,9 +1540,18 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
 		       bdevname(rdev->bdev,b));
 		return -EINVAL;
 	}
+	if (sb->pad0 ||
+	    sb->pad3[0] ||
+	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
+		/* Some padding is non-zero, might be a new feature */
+		return -EINVAL;
 
 	rdev->preferred_minor = 0xffff;
 	rdev->data_offset = le64_to_cpu(sb->data_offset);
+	rdev->new_data_offset = rdev->data_offset;
+	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
+	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
+		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
 	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
 
 	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
@@ -1540,6 +1562,9 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
 	if (minor_version
 	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
 		return -EINVAL;
+	if (minor_version
+	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
+		return -EINVAL;
 
 	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
 		rdev->desc_nr = -1;
@@ -1611,16 +1636,14 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
 		else
 			ret = 0;
 	}
-	if (minor_version)
-		rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
-			le64_to_cpu(sb->data_offset);
-	else
-		rdev->sectors = rdev->sb_start;
-	if (rdev->sectors < le64_to_cpu(sb->data_size))
+	if (minor_version) {
+		sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
+		sectors -= rdev->data_offset;
+	} else
+		sectors = rdev->sb_start;
+	if (sectors < le64_to_cpu(sb->data_size))
 		return -EINVAL;
 	rdev->sectors = le64_to_cpu(sb->data_size);
-	if (le64_to_cpu(sb->size) > rdev->sectors)
-		return -EINVAL;
 	return ret;
 }
 
@@ -1745,7 +1768,6 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
 	sb->feature_map = 0;
 	sb->pad0 = 0;
 	sb->recovery_offset = cpu_to_le64(0);
-	memset(sb->pad1, 0, sizeof(sb->pad1));
 	memset(sb->pad3, 0, sizeof(sb->pad3));
 
 	sb->utime = cpu_to_le64((__u64)mddev->utime);
@@ -1767,6 +1789,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
 		sb->devflags |= WriteMostly1;
 	else
 		sb->devflags &= ~WriteMostly1;
+	sb->data_offset = cpu_to_le64(rdev->data_offset);
+	sb->data_size = cpu_to_le64(rdev->sectors);
 
 	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
@@ -1795,6 +1819,12 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
 		    mddev->reshape_backwards)
 			sb->feature_map
 				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
+		if (rdev->new_data_offset != rdev->data_offset) {
+			sb->feature_map
+				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
+			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
+							     - rdev->data_offset));
+		}
 	}
 
 	if (rdev->badblocks.count == 0)
@@ -1871,6 +1901,8 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
 	sector_t max_sectors;
 	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
 		return 0; /* component must fit device */
+	if (rdev->data_offset != rdev->new_data_offset)
+		return 0; /* too confusing */
 	if (rdev->sb_start < rdev->data_offset) {
 		/* minor versions 1 and 2; superblock before data */
 		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
@@ -1898,6 +1930,40 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
 		       rdev->sb_page);
 	md_super_wait(rdev->mddev);
 	return num_sectors;
+
+}
+
+static int
+super_1_allow_new_offset(struct md_rdev *rdev,
+			 unsigned long long new_offset)
+{
+	/* All necessary checks on new >= old have been done */
+	struct bitmap *bitmap;
+	if (new_offset >= rdev->data_offset)
+		return 1;
+
+	/* with 1.0 metadata, there is no metadata to tread on
+	 * so we can always move back */
+	if (rdev->mddev->minor_version == 0)
+		return 1;
+
+	/* otherwise we must be sure not to step on
+	 * any metadata, so stay:
+	 * 36K beyond start of superblock
+	 * beyond end of badblocks
+	 * beyond write-intent bitmap
+	 */
+	if (rdev->sb_start + (32+4)*2 > new_offset)
+		return 0;
+	bitmap = rdev->mddev->bitmap;
+	if (bitmap && !rdev->mddev->bitmap_info.file &&
+	    rdev->sb_start + rdev->mddev->bitmap_info.offset +
+	    bitmap->file_pages * (PAGE_SIZE>>9) > new_offset)
+		return 0;
+	if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
+		return 0;
+
+	return 1;
 }
 
 static struct super_type super_types[] = {
@@ -1908,6 +1974,7 @@ static struct super_type super_types[] = {
 		.validate_super	    = super_90_validate,
 		.sync_super	    = super_90_sync,
 		.rdev_size_change   = super_90_rdev_size_change,
+		.allow_new_offset   = super_90_allow_new_offset,
 	},
 	[1] = {
 		.name	= "md-1",
@@ -1916,6 +1983,7 @@ static struct super_type super_types[] = {
 		.validate_super	    = super_1_validate,
 		.sync_super	    = super_1_sync,
 		.rdev_size_change   = super_1_rdev_size_change,
+		.allow_new_offset   = super_1_allow_new_offset,
 	},
 };
 
@@ -2823,9 +2891,8 @@ offset_show(struct md_rdev *rdev, char *page)
 static ssize_t
 offset_store(struct md_rdev *rdev, const char *buf, size_t len)
 {
-	char *e;
-	unsigned long long offset = simple_strtoull(buf, &e, 10);
-	if (e==buf || (*e && *e != '\n'))
+	unsigned long long offset;
+	if (strict_strtoull(buf, 10, &offset) < 0)
 		return -EINVAL;
 	if (rdev->mddev->pers && rdev->raid_disk >= 0)
 		return -EBUSY;
@@ -2840,6 +2907,63 @@ offset_store(struct md_rdev *rdev, const char *buf, size_t len)
 static struct rdev_sysfs_entry rdev_offset =
 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
 
+static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
+{
+	return sprintf(page, "%llu\n",
+		       (unsigned long long)rdev->new_data_offset);
+}
+
+static ssize_t new_offset_store(struct md_rdev *rdev,
+				const char *buf, size_t len)
+{
+	unsigned long long new_offset;
+	struct mddev *mddev = rdev->mddev;
+
+	if (strict_strtoull(buf, 10, &new_offset) < 0)
+		return -EINVAL;
+
+	if (mddev->sync_thread)
+		return -EBUSY;
+	if (new_offset == rdev->data_offset)
+		/* reset is always permitted */
+		;
+	else if (new_offset > rdev->data_offset) {
+		/* must not push array size beyond rdev_sectors */
+		if (new_offset - rdev->data_offset
+		    + mddev->dev_sectors > rdev->sectors)
+				return -E2BIG;
+	}
+	/* Metadata worries about other space details. */
+
+	/* decreasing the offset is inconsistent with a backwards
+	 * reshape.
+	 */
+	if (new_offset < rdev->data_offset &&
+	    mddev->reshape_backwards)
+		return -EINVAL;
+	/* Increasing offset is inconsistent with forwards
+	 * reshape.  reshape_direction should be set to
+	 * 'backwards' first.
+	 */
+	if (new_offset > rdev->data_offset &&
+	    !mddev->reshape_backwards)
+		return -EINVAL;
+
+	if (mddev->pers && mddev->persistent &&
+	    !super_types[mddev->major_version]
+	    .allow_new_offset(rdev, new_offset))
+		return -E2BIG;
+	rdev->new_data_offset = new_offset;
+	if (new_offset > rdev->data_offset)
+		mddev->reshape_backwards = 1;
+	else if (new_offset < rdev->data_offset)
+		mddev->reshape_backwards = 0;
+
+	return len;
+}
+static struct rdev_sysfs_entry rdev_new_offset =
+__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
+
 static ssize_t
 rdev_size_show(struct md_rdev *rdev, char *page)
 {
@@ -2884,6 +3008,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
 
 	if (strict_blocks_to_sectors(buf, &sectors) < 0)
 		return -EINVAL;
+	if (rdev->data_offset != rdev->new_data_offset)
+		return -EINVAL; /* too confusing */
 	if (my_mddev->pers && rdev->raid_disk >= 0) {
 		if (my_mddev->persistent) {
 			sectors = super_types[my_mddev->major_version].
@@ -3020,6 +3146,7 @@ static struct attribute *rdev_default_attrs[] = {
 	&rdev_errors.attr,
 	&rdev_slot.attr,
 	&rdev_offset.attr,
+	&rdev_new_offset.attr,
 	&rdev_size.attr,
 	&rdev_recovery_start.attr,
 	&rdev_bad_blocks.attr,
@@ -3094,6 +3221,7 @@ int md_rdev_init(struct md_rdev *rdev)
 	rdev->raid_disk = -1;
 	rdev->flags = 0;
 	rdev->data_offset = 0;
+	rdev->new_data_offset = 0;
 	rdev->sb_events = 0;
 	rdev->last_read_error.tv_sec  = 0;
 	rdev->last_read_error.tv_nsec = 0;
@@ -3598,7 +3726,17 @@ raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
 	if (mddev->pers)
 		rv = update_raid_disks(mddev, n);
 	else if (mddev->reshape_position != MaxSector) {
+		struct md_rdev *rdev;
 		int olddisks = mddev->raid_disks - mddev->delta_disks;
+
+		rdev_for_each(rdev, mddev) {
+			if (olddisks < n &&
+			    rdev->data_offset < rdev->new_data_offset)
+				return -EINVAL;
+			if (olddisks > n &&
+			    rdev->data_offset > rdev->new_data_offset)
+				return -EINVAL;
+		}
 		mddev->delta_disks = n - olddisks;
 		mddev->raid_disks = n;
 		mddev->reshape_backwards = (mddev->delta_disks < 0);
@@ -4445,6 +4583,7 @@ reshape_position_show(struct mddev *mddev, char *page)
 static ssize_t
 reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
 {
+	struct md_rdev *rdev;
 	char *e;
 	unsigned long long new = simple_strtoull(buf, &e, 10);
 	if (mddev->pers)
@@ -4457,6 +4596,8 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
 	mddev->new_level = mddev->level;
 	mddev->new_layout = mddev->layout;
 	mddev->new_chunk_sectors = mddev->chunk_sectors;
+	rdev_for_each(rdev, mddev)
+		rdev->new_data_offset = rdev->data_offset;
 	return len;
 }
 
@@ -6001,6 +6142,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
 static int update_raid_disks(struct mddev *mddev, int raid_disks)
 {
 	int rv;
+	struct md_rdev *rdev;
 	/* change the number of raid disks */
 	if (mddev->pers->check_reshape == NULL)
 		return -EINVAL;
@@ -6009,6 +6151,16 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
 		return -EINVAL;
 	if (mddev->sync_thread || mddev->reshape_position != MaxSector)
 		return -EBUSY;
+
+	rdev_for_each(rdev, mddev) {
+		if (mddev->raid_disks < raid_disks &&
+		    rdev->data_offset < rdev->new_data_offset)
+			return -EINVAL;
+		if (mddev->raid_disks > raid_disks &&
+		    rdev->data_offset > rdev->new_data_offset)
+			return -EINVAL;
+	}
+
 	mddev->delta_disks = raid_disks - mddev->raid_disks;
 	if (mddev->delta_disks < 0)
 		mddev->reshape_backwards = 1;
@@ -7709,6 +7861,20 @@ void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
 }
 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
 
+void md_finish_reshape(struct mddev *mddev)
+{
+	/* called be personality module when reshape completes. */
+	struct md_rdev *rdev;
+
+	rdev_for_each(rdev, mddev) {
+		if (rdev->data_offset > rdev->new_data_offset)
+			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
+		else
+			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
+		rdev->data_offset = rdev->new_data_offset;
+	}
+}
+EXPORT_SYMBOL(md_finish_reshape);
 
 /* Bad block management.
  * We can record which blocks on each device are 'bad' and so just
@@ -7957,10 +8123,15 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
 }
 
 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
-		       int acknowledged)
+		       int is_new)
 {
-	int rv = md_set_badblocks(&rdev->badblocks,
-				  s + rdev->data_offset, sectors, acknowledged);
+	int rv;
+	if (is_new)
+		s += rdev->new_data_offset;
+	else
+		s += rdev->data_offset;
+	rv = md_set_badblocks(&rdev->badblocks,
+			      s, sectors, 0);
 	if (rv) {
 		/* Make sure they get written out promptly */
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -8066,11 +8237,15 @@ out:
 	return rv;
 }
 
-int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors)
+int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+			 int is_new)
 {
+	if (is_new)
+		s += rdev->new_data_offset;
+	else
+		s += rdev->data_offset;
 	return md_clear_badblocks(&rdev->badblocks,
-				  s + rdev->data_offset,
-				  sectors);
+				  s, sectors);
 }
 EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index d51c0ca3777..98913e8dac1 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -55,6 +55,7 @@ struct md_rdev {
 	int		sb_loaded;
 	__u64		sb_events;
 	sector_t	data_offset;	/* start of data in array */
+	sector_t	new_data_offset;/* only relevant while reshaping */
 	sector_t 	sb_start;	/* offset of the super block (in 512byte sectors) */
 	int		sb_size;	/* bytes in the superblock */
 	int		preferred_minor;	/* autorun support */
@@ -193,8 +194,9 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
 	return 0;
 }
 extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
-			      int acknowledged);
-extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors);
+			      int is_new);
+extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+				int is_new);
 extern void md_ack_all_badblocks(struct badblocks *bb);
 
 struct mddev {
@@ -592,6 +594,7 @@ extern void md_write_start(struct mddev *mddev, struct bio *bi);
 extern void md_write_end(struct mddev *mddev);
 extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
 extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
+extern void md_finish_reshape(struct mddev *mddev);
 
 extern int mddev_congested(struct mddev *mddev, int bits);
 extern void md_flush_request(struct mddev *mddev, struct bio *bio);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 15dd59b84e9..71a7dc038a8 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2024,7 +2024,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
 			continue;
 		if (test_bit(BIO_UPTODATE, &bio->bi_flags) &&
 		    test_bit(R1BIO_MadeGood, &r1_bio->state)) {
-			rdev_clear_badblocks(rdev, r1_bio->sector, s);
+			rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
 		}
 		if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
 		    test_bit(R1BIO_WriteError, &r1_bio->state)) {
@@ -2044,7 +2044,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 			struct md_rdev *rdev = conf->mirrors[m].rdev;
 			rdev_clear_badblocks(rdev,
 					     r1_bio->sector,
-					     r1_bio->sectors);
+					     r1_bio->sectors, 0);
 			rdev_dec_pending(rdev, conf->mddev);
 		} else if (r1_bio->bios[m] != NULL) {
 			/* This drive got a write error.  We need to
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3f91c2e1dfe..832fb4d5665 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2480,7 +2480,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 				rdev_clear_badblocks(
 					rdev,
 					r10_bio->devs[m].addr,
-					r10_bio->sectors);
+					r10_bio->sectors, 0);
 			} else {
 				if (!rdev_set_badblocks(
 					    rdev,
@@ -2496,7 +2496,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 				rdev_clear_badblocks(
 					rdev,
 					r10_bio->devs[m].addr,
-					r10_bio->sectors);
+					r10_bio->sectors, 0);
 			} else {
 				if (!rdev_set_badblocks(
 					    rdev,
@@ -2515,7 +2515,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 				rdev_clear_badblocks(
 					rdev,
 					r10_bio->devs[m].addr,
-					r10_bio->sectors);
+					r10_bio->sectors, 0);
 				rdev_dec_pending(rdev, conf->mddev);
 			} else if (bio != NULL &&
 				   !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
@@ -2532,7 +2532,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 				rdev_clear_badblocks(
 					rdev,
 					r10_bio->devs[m].addr,
-					r10_bio->sectors);
+					r10_bio->sectors, 0);
 				rdev_dec_pending(rdev, conf->mddev);
 			}
 		}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0abbd3447cf..3705585d756 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3561,7 +3561,7 @@ finish:
 			if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
 				rdev = conf->disks[i].rdev;
 				rdev_clear_badblocks(rdev, sh->sector,
-						     STRIPE_SECTORS);
+						     STRIPE_SECTORS, 0);
 				rdev_dec_pending(rdev, conf->mddev);
 			}
 			if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
@@ -3570,7 +3570,7 @@ finish:
 					/* rdev have been moved down */
 					rdev = conf->disks[i].rdev;
 				rdev_clear_badblocks(rdev, sh->sector,
-						     STRIPE_SECTORS);
+						     STRIPE_SECTORS, 0);
 				rdev_dec_pending(rdev, conf->mddev);
 			}
 		}
@@ -5505,10 +5505,14 @@ static int raid5_start_reshape(struct mddev *mddev)
 	if (!check_stripe_cache(mddev))
 		return -ENOSPC;
 
-	rdev_for_each(rdev, mddev)
+	rdev_for_each(rdev, mddev) {
+		/* Don't support changing data_offset yet */
+		if (rdev->new_data_offset != rdev->data_offset)
+			return -EINVAL;
 		if (!test_bit(In_sync, &rdev->flags)
 		    && !test_bit(Faulty, &rdev->flags))
 			spares++;
+	}
 
 	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
 		/* Not enough devices even to make a degraded array
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index 07e05f92d05..ee753536ab7 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -233,7 +233,10 @@ struct mdp_superblock_1 {
 	__le32	delta_disks;	/* change in number of raid_disks		*/
 	__le32	new_layout;	/* new layout					*/
 	__le32	new_chunk;	/* new chunk size (512byte sectors)		*/
-	__u8	pad1[128-124];	/* set to 0 when written */
+	__le32  new_offset;	/* signed number to add to data_offset in new
+				 * layout.  0 == no-change.  This can be
+				 * different on each device in the array.
+				 */
 
 	/* constant this-device information - 64 bytes */
 	__le64	data_offset;	/* sector start of data, often 0 */
@@ -285,11 +288,14 @@ struct mdp_superblock_1 {
 					    * of devices, but is going
 					    * backwards anyway.
 					    */
+#define	MD_FEATURE_NEW_OFFSET		64 /* new_offset must be honoured */
 #define	MD_FEATURE_ALL			(MD_FEATURE_BITMAP_OFFSET	\
 					|MD_FEATURE_RECOVERY_OFFSET	\
 					|MD_FEATURE_RESHAPE_ACTIVE	\
 					|MD_FEATURE_BAD_BLOCKS		\
 					|MD_FEATURE_REPLACEMENT		\
-					|MD_FEATURE_RESHAPE_BACKWARDS)
+					|MD_FEATURE_RESHAPE_BACKWARDS	\
+					|MD_FEATURE_NEW_OFFSET		\
+					)
 
 #endif 

From 05616be5e11f66888b66554957dbecdd90658a84 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 21 May 2012 09:27:00 +1000
Subject: [PATCH 04/56] md/raid5:  Use correct data_offset for all IO.

As there can now be two different data_offsets - an 'old' and
a 'new' - we need to carefully choose between them.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 72 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 59 insertions(+), 13 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3705585d756..71d1de909ba 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -488,6 +488,27 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 	return sh;
 }
 
+/* Determine if 'data_offset' or 'new_data_offset' should be used
+ * in this stripe_head.
+ */
+static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
+{
+	sector_t progress = conf->reshape_progress;
+	/* Need a memory barrier to make sure we see the value
+	 * of conf->generation, or ->data_offset that was set before
+	 * reshape_progress was updated.
+	 */
+	smp_rmb();
+	if (progress == MaxSector)
+		return 0;
+	if (sh->generation == conf->generation - 1)
+		return 0;
+	/* We are in a reshape, and this is a new-generation stripe,
+	 * so use new_data_offset.
+	 */
+	return 1;
+}
+
 static void
 raid5_end_read_request(struct bio *bi, int error);
 static void
@@ -603,7 +624,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 				__func__, (unsigned long long)sh->sector,
 				bi->bi_rw, i);
 			atomic_inc(&sh->count);
-			bi->bi_sector = sh->sector + rdev->data_offset;
+			if (use_new_offset(conf, sh))
+				bi->bi_sector = (sh->sector
+						 + rdev->new_data_offset);
+			else
+				bi->bi_sector = (sh->sector
+						 + rdev->data_offset);
 			bi->bi_flags = 1 << BIO_UPTODATE;
 			bi->bi_idx = 0;
 			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -627,7 +653,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 				__func__, (unsigned long long)sh->sector,
 				rbi->bi_rw, i);
 			atomic_inc(&sh->count);
-			rbi->bi_sector = sh->sector + rrdev->data_offset;
+			if (use_new_offset(conf, sh))
+				rbi->bi_sector = (sh->sector
+						  + rrdev->new_data_offset);
+			else
+				rbi->bi_sector = (sh->sector
+						  + rrdev->data_offset);
 			rbi->bi_flags = 1 << BIO_UPTODATE;
 			rbi->bi_idx = 0;
 			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -1648,7 +1679,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
 	char b[BDEVNAME_SIZE];
 	struct md_rdev *rdev = NULL;
-
+	sector_t s;
 
 	for (i=0 ; i<disks; i++)
 		if (bi == &sh->dev[i].req)
@@ -1671,6 +1702,10 @@ static void raid5_end_read_request(struct bio * bi, int error)
 	if (!rdev)
 		rdev = conf->disks[i].rdev;
 
+	if (use_new_offset(conf, sh))
+		s = sh->sector + rdev->new_data_offset;
+	else
+		s = sh->sector + rdev->data_offset;
 	if (uptodate) {
 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
@@ -1683,8 +1718,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 				"md/raid:%s: read error corrected"
 				" (%lu sectors at %llu on %s)\n",
 				mdname(conf->mddev), STRIPE_SECTORS,
-				(unsigned long long)(sh->sector
-						     + rdev->data_offset),
+				(unsigned long long)s,
 				bdevname(rdev->bdev, b));
 			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
@@ -1704,8 +1738,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 				"md/raid:%s: read error on replacement device "
 				"(sector %llu on %s).\n",
 				mdname(conf->mddev),
-				(unsigned long long)(sh->sector
-						     + rdev->data_offset),
+				(unsigned long long)s,
 				bdn);
 		else if (conf->mddev->degraded >= conf->max_degraded)
 			printk_ratelimited(
@@ -1713,8 +1746,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 				"md/raid:%s: read error not correctable "
 				"(sector %llu on %s).\n",
 				mdname(conf->mddev),
-				(unsigned long long)(sh->sector
-						     + rdev->data_offset),
+				(unsigned long long)s,
 				bdn);
 		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
 			/* Oh, no!!! */
@@ -1723,8 +1755,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 				"md/raid:%s: read error NOT corrected!! "
 				"(sector %llu on %s).\n",
 				mdname(conf->mddev),
-				(unsigned long long)(sh->sector
-						     + rdev->data_offset),
+				(unsigned long long)s,
 				bdn);
 		else if (atomic_read(&rdev->read_errors)
 			 > conf->max_nr_stripes)
@@ -3842,6 +3873,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
 		raid_bio->bi_next = (void*)rdev;
 		align_bi->bi_bdev =  rdev->bdev;
 		align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
+		/* No reshape active, so we can trust rdev->data_offset */
 		align_bi->bi_sector += rdev->data_offset;
 
 		if (!bio_fits_rdev(align_bi) ||
@@ -5182,9 +5214,12 @@ static int run(struct mddev *mddev)
 		blk_queue_io_opt(mddev->queue, chunk_size *
 				 (conf->raid_disks - conf->max_degraded));
 
-		rdev_for_each(rdev, mddev)
+		rdev_for_each(rdev, mddev) {
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->data_offset << 9);
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->new_data_offset << 9);
+		}
 	}
 
 	return 0;
@@ -5539,12 +5574,16 @@ static int raid5_start_reshape(struct mddev *mddev)
 	conf->chunk_sectors = mddev->new_chunk_sectors;
 	conf->prev_algo = conf->algorithm;
 	conf->algorithm = mddev->new_layout;
+	conf->generation++;
+	/* Code that selects data_offset needs to see the generation update
+	 * if reshape_progress has been set - so a memory barrier needed.
+	 */
+	smp_mb();
 	if (mddev->reshape_backwards)
 		conf->reshape_progress = raid5_size(mddev, 0, 0);
 	else
 		conf->reshape_progress = 0;
 	conf->reshape_safe = conf->reshape_progress;
-	conf->generation++;
 	spin_unlock_irq(&conf->device_lock);
 
 	/* Add some new drives, as many as will fit.
@@ -5596,6 +5635,9 @@ static int raid5_start_reshape(struct mddev *mddev)
 		mddev->recovery = 0;
 		spin_lock_irq(&conf->device_lock);
 		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
+		rdev_for_each(rdev, mddev)
+			rdev->new_data_offset = rdev->data_offset;
+		smp_wmb();
 		conf->reshape_progress = MaxSector;
 		mddev->reshape_position = MaxSector;
 		spin_unlock_irq(&conf->device_lock);
@@ -5614,9 +5656,13 @@ static void end_reshape(struct r5conf *conf)
 {
 
 	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
+		struct md_rdev *rdev;
 
 		spin_lock_irq(&conf->device_lock);
 		conf->previous_raid_disks = conf->raid_disks;
+		rdev_for_each(rdev, conf->mddev)
+			rdev->data_offset = rdev->new_data_offset;
+		smp_wmb();
 		conf->reshape_progress = MaxSector;
 		spin_unlock_irq(&conf->device_lock);
 		wake_up(&conf->wait_for_overlap);

From b5254dd5fdd9abcacadb5101beb35df9ae8cc564 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 21 May 2012 09:27:01 +1000
Subject: [PATCH 05/56] md/raid5: allow for change in data_offset while
 managing a reshape.

The important issue here is incorporating the different in data_offset
into calculations concerning when we might need to over-write data
that is still thought to be valid.

To this end we find the minimum offset difference across all devices
and add that where appropriate.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 109 +++++++++++++++++++++++++++++++--------------
 drivers/md/raid5.h |   6 +++
 2 files changed, 82 insertions(+), 33 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 71d1de909ba..0172bdd37b4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4165,13 +4165,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 	else
 		reshape_sectors = mddev->chunk_sectors;
 
-	/* we update the metadata when there is more than 3Meg
-	 * in the block range (that is rather arbitrary, should
-	 * probably be time based) or when the data about to be
-	 * copied would over-write the source of the data at
-	 * the front of the range.
-	 * i.e. one new_stripe along from reshape_progress new_maps
-	 * to after where reshape_safe old_maps to
+	/* We update the metadata at least every 10 seconds, or when
+	 * the data about to be copied would over-write the source of
+	 * the data at the front of the range.  i.e. one new_stripe
+	 * along from reshape_progress new_maps to after where
+	 * reshape_safe old_maps to
 	 */
 	writepos = conf->reshape_progress;
 	sector_div(writepos, new_data_disks);
@@ -4189,11 +4187,29 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 		safepos -= min_t(sector_t, reshape_sectors, safepos);
 	}
 
+	/* Having calculated the 'writepos' possibly use it
+	 * to set 'stripe_addr' which is where we will write to.
+	 */
+	if (mddev->reshape_backwards) {
+		BUG_ON(conf->reshape_progress == 0);
+		stripe_addr = writepos;
+		BUG_ON((mddev->dev_sectors &
+			~((sector_t)reshape_sectors - 1))
+		       - reshape_sectors - stripe_addr
+		       != sector_nr);
+	} else {
+		BUG_ON(writepos != sector_nr + reshape_sectors);
+		stripe_addr = sector_nr;
+	}
+
 	/* 'writepos' is the most advanced device address we might write.
 	 * 'readpos' is the least advanced device address we might read.
 	 * 'safepos' is the least address recorded in the metadata as having
 	 *     been reshaped.
-	 * If 'readpos' is behind 'writepos', then there is no way that we can
+	 * If there is a min_offset_diff, these are adjusted either by
+	 * increasing the safepos/readpos if diff is negative, or
+	 * increasing writepos if diff is positive.
+	 * If 'readpos' is then behind 'writepos', there is no way that we can
 	 * ensure safety in the face of a crash - that must be done by userspace
 	 * making a backup of the data.  So in that case there is no particular
 	 * rush to update metadata.
@@ -4206,6 +4222,12 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 	 * Maybe that number should be configurable, but I'm not sure it is
 	 * worth it.... maybe it could be a multiple of safemode_delay???
 	 */
+	if (conf->min_offset_diff < 0) {
+		safepos += -conf->min_offset_diff;
+		readpos += -conf->min_offset_diff;
+	} else
+		writepos += conf->min_offset_diff;
+
 	if ((mddev->reshape_backwards
 	     ? (safepos > writepos && readpos < writepos)
 	     : (safepos < writepos && readpos > writepos)) ||
@@ -4227,17 +4249,6 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	}
 
-	if (mddev->reshape_backwards) {
-		BUG_ON(conf->reshape_progress == 0);
-		stripe_addr = writepos;
-		BUG_ON((mddev->dev_sectors &
-			~((sector_t)reshape_sectors - 1))
-		       - reshape_sectors - stripe_addr
-		       != sector_nr);
-	} else {
-		BUG_ON(writepos != sector_nr + reshape_sectors);
-		stripe_addr = sector_nr;
-	}
 	INIT_LIST_HEAD(&stripes);
 	for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
 		int j;
@@ -4984,16 +4995,42 @@ static int run(struct mddev *mddev)
 	struct md_rdev *rdev;
 	sector_t reshape_offset = 0;
 	int i;
+	long long min_offset_diff = 0;
+	int first = 1;
 
 	if (mddev->recovery_cp != MaxSector)
 		printk(KERN_NOTICE "md/raid:%s: not clean"
 		       " -- starting background reconstruction\n",
 		       mdname(mddev));
+
+	rdev_for_each(rdev, mddev) {
+		long long diff;
+		if (rdev->raid_disk < 0)
+			continue;
+		diff = (rdev->new_data_offset - rdev->data_offset);
+		if (first) {
+			min_offset_diff = diff;
+			first = 0;
+		} else if (mddev->reshape_backwards &&
+			 diff < min_offset_diff)
+			min_offset_diff = diff;
+		else if (!mddev->reshape_backwards &&
+			 diff > min_offset_diff)
+			min_offset_diff = diff;
+	}
+
 	if (mddev->reshape_position != MaxSector) {
 		/* Check that we can continue the reshape.
-		 * Currently only disks can change, it must
-		 * increase, and we must be past the point where
-		 * a stripe over-writes itself
+		 * Difficulties arise if the stripe we would write to
+		 * next is at or after the stripe we would read from next.
+		 * For a reshape that changes the number of devices, this
+		 * is only possible for a very short time, and mdadm makes
+		 * sure that time appears to have past before assembling
+		 * the array.  So we fail if that time hasn't passed.
+		 * For a reshape that keeps the number of devices the same
+		 * mdadm must be monitoring the reshape can keeping the
+		 * critical areas read-only and backed up.  It will start
+		 * the array in read-only mode, so we check for that.
 		 */
 		sector_t here_new, here_old;
 		int old_disks;
@@ -5025,26 +5062,34 @@ static int run(struct mddev *mddev)
 		/* here_old is the first stripe that we might need to read
 		 * from */
 		if (mddev->delta_disks == 0) {
+			if ((here_new * mddev->new_chunk_sectors !=
+			     here_old * mddev->chunk_sectors)) {
+				printk(KERN_ERR "md/raid:%s: reshape position is"
+				       " confused - aborting\n", mdname(mddev));
+				return -EINVAL;
+			}
 			/* We cannot be sure it is safe to start an in-place
-			 * reshape.  It is only safe if user-space if monitoring
+			 * reshape.  It is only safe if user-space is monitoring
 			 * and taking constant backups.
 			 * mdadm always starts a situation like this in
 			 * readonly mode so it can take control before
 			 * allowing any writes.  So just check for that.
 			 */
-			if ((here_new * mddev->new_chunk_sectors != 
-			     here_old * mddev->chunk_sectors) ||
-			    mddev->ro == 0) {
-				printk(KERN_ERR "md/raid:%s: in-place reshape must be started"
-				       " in read-only mode - aborting\n",
+			if (abs(min_offset_diff) >= mddev->chunk_sectors &&
+			    abs(min_offset_diff) >= mddev->new_chunk_sectors)
+				/* not really in-place - so OK */;
+			else if (mddev->ro == 0) {
+				printk(KERN_ERR "md/raid:%s: in-place reshape "
+				       "must be started in read-only mode "
+				       "- aborting\n",
 				       mdname(mddev));
 				return -EINVAL;
 			}
 		} else if (mddev->reshape_backwards
-		    ? (here_new * mddev->new_chunk_sectors <=
+		    ? (here_new * mddev->new_chunk_sectors + min_offset_diff <=
 		       here_old * mddev->chunk_sectors)
 		    : (here_new * mddev->new_chunk_sectors >=
-		       here_old * mddev->chunk_sectors)) {
+		       here_old * mddev->chunk_sectors + (-min_offset_diff))) {
 			/* Reading from the same stripe as writing to - bad */
 			printk(KERN_ERR "md/raid:%s: reshape_position too early for "
 			       "auto-recovery - aborting.\n",
@@ -5069,6 +5114,7 @@ static int run(struct mddev *mddev)
 	if (IS_ERR(conf))
 		return PTR_ERR(conf);
 
+	conf->min_offset_diff = min_offset_diff;
 	mddev->thread = conf->thread;
 	conf->thread = NULL;
 	mddev->private = conf;
@@ -5541,9 +5587,6 @@ static int raid5_start_reshape(struct mddev *mddev)
 		return -ENOSPC;
 
 	rdev_for_each(rdev, mddev) {
-		/* Don't support changing data_offset yet */
-		if (rdev->new_data_offset != rdev->data_offset)
-			return -EINVAL;
 		if (!test_bit(In_sync, &rdev->flags)
 		    && !test_bit(Faulty, &rdev->flags))
 			spares++;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 8d8e13934a4..c6bdfa01d98 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -385,6 +385,12 @@ struct r5conf {
 	short			generation; /* increments with every reshape */
 	unsigned long		reshape_checkpoint; /* Time we last updated
 						     * metadata */
+	long long		min_offset_diff; /* minimum difference between
+						  * data_offset and
+						  * new_data_offset across all
+						  * devices.  May be negative,
+						  * but is closest to zero.
+						  */
 
 	struct list_head	handle_list; /* stripes needing handling */
 	struct list_head	hold_list; /* preread ready stripes */

From 5cf00fcd3c98d2eafb58ac7a649bbdb9dbc4902b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 21 May 2012 09:28:20 +1000
Subject: [PATCH 06/56] md/raid10: collect some geometry fields into a
 dedicated structure.

We will shortly be adding reshape support for RAID10 which will
require it having 2 concurrent geometries (before and after).
To make that easier, collect most geometry fields into 'struct geom'
and access them from there.  Then we will more easily be able to add
a second set of fields.

Note that 'copies' is not in this struct and so cannot be changed.
There is little need to change this number and doing so is a lot
more difficult as it requires reallocating more things.
So leave it out for now.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 200 +++++++++++++++++++++++---------------------
 drivers/md/raid10.h |  23 ++---
 2 files changed, 115 insertions(+), 108 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 832fb4d5665..36f445f9e11 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -511,42 +511,43 @@ static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
 	sector_t chunk;
 	sector_t stripe;
 	int dev;
+	struct geom *geo = &conf->geo;
 
 	int slot = 0;
 
 	/* now calculate first sector/dev */
-	chunk = r10bio->sector >> conf->chunk_shift;
-	sector = r10bio->sector & conf->chunk_mask;
+	chunk = r10bio->sector >> geo->chunk_shift;
+	sector = r10bio->sector & geo->chunk_mask;
 
-	chunk *= conf->near_copies;
+	chunk *= geo->near_copies;
 	stripe = chunk;
-	dev = sector_div(stripe, conf->raid_disks);
-	if (conf->far_offset)
-		stripe *= conf->far_copies;
+	dev = sector_div(stripe, geo->raid_disks);
+	if (geo->far_offset)
+		stripe *= geo->far_copies;
 
-	sector += stripe << conf->chunk_shift;
+	sector += stripe << geo->chunk_shift;
 
 	/* and calculate all the others */
-	for (n=0; n < conf->near_copies; n++) {
+	for (n = 0; n < geo->near_copies; n++) {
 		int d = dev;
 		sector_t s = sector;
 		r10bio->devs[slot].addr = sector;
 		r10bio->devs[slot].devnum = d;
 		slot++;
 
-		for (f = 1; f < conf->far_copies; f++) {
-			d += conf->near_copies;
-			if (d >= conf->raid_disks)
-				d -= conf->raid_disks;
-			s += conf->stride;
+		for (f = 1; f < geo->far_copies; f++) {
+			d += geo->near_copies;
+			if (d >= geo->raid_disks)
+				d -= geo->raid_disks;
+			s += geo->stride;
 			r10bio->devs[slot].devnum = d;
 			r10bio->devs[slot].addr = s;
 			slot++;
 		}
 		dev++;
-		if (dev >= conf->raid_disks) {
+		if (dev >= geo->raid_disks) {
 			dev = 0;
-			sector += (conf->chunk_mask + 1);
+			sector += (geo->chunk_mask + 1);
 		}
 	}
 	BUG_ON(slot != conf->copies);
@@ -555,28 +556,29 @@ static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
 static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 {
 	sector_t offset, chunk, vchunk;
+	struct geom *geo = &conf->geo;
 
-	offset = sector & conf->chunk_mask;
-	if (conf->far_offset) {
+	offset = sector & geo->chunk_mask;
+	if (geo->far_offset) {
 		int fc;
-		chunk = sector >> conf->chunk_shift;
-		fc = sector_div(chunk, conf->far_copies);
-		dev -= fc * conf->near_copies;
+		chunk = sector >> geo->chunk_shift;
+		fc = sector_div(chunk, geo->far_copies);
+		dev -= fc * geo->near_copies;
 		if (dev < 0)
-			dev += conf->raid_disks;
+			dev += geo->raid_disks;
 	} else {
-		while (sector >= conf->stride) {
-			sector -= conf->stride;
-			if (dev < conf->near_copies)
-				dev += conf->raid_disks - conf->near_copies;
+		while (sector >= geo->stride) {
+			sector -= geo->stride;
+			if (dev < geo->near_copies)
+				dev += geo->raid_disks - geo->near_copies;
 			else
-				dev -= conf->near_copies;
+				dev -= geo->near_copies;
 		}
-		chunk = sector >> conf->chunk_shift;
+		chunk = sector >> geo->chunk_shift;
 	}
-	vchunk = chunk * conf->raid_disks + dev;
-	sector_div(vchunk, conf->near_copies);
-	return (vchunk << conf->chunk_shift) + offset;
+	vchunk = chunk * geo->raid_disks + dev;
+	sector_div(vchunk, geo->near_copies);
+	return (vchunk << geo->chunk_shift) + offset;
 }
 
 /**
@@ -599,8 +601,9 @@ static int raid10_mergeable_bvec(struct request_queue *q,
 	int max;
 	unsigned int chunk_sectors = mddev->chunk_sectors;
 	unsigned int bio_sectors = bvm->bi_size >> 9;
+	struct geom *geo = &conf->geo;
 
-	if (conf->near_copies < conf->raid_disks) {
+	if (geo->near_copies < geo->raid_disks) {
 		max = (chunk_sectors - ((sector & (chunk_sectors - 1))
 					+ bio_sectors)) << 9;
 		if (max < 0)
@@ -681,6 +684,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
 	struct md_rdev *rdev, *best_rdev;
 	int do_balance;
 	int best_slot;
+	struct geom *geo = &conf->geo;
 
 	raid10_find_phys(conf, r10_bio);
 	rcu_read_lock();
@@ -761,11 +765,11 @@ retry:
 		 * sequential read speed for 'far copies' arrays.  So only
 		 * keep it for 'near' arrays, and review those later.
 		 */
-		if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
+		if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
 			break;
 
 		/* for far > 1 always use the lowest address */
-		if (conf->far_copies > 1)
+		if (geo->far_copies > 1)
 			new_distance = r10_bio->devs[slot].addr;
 		else
 			new_distance = abs(r10_bio->devs[slot].addr -
@@ -812,7 +816,7 @@ static int raid10_congested(void *data, int bits)
 	if (mddev_congested(mddev, bits))
 		return 1;
 	rcu_read_lock();
-	for (i = 0; i < conf->raid_disks && ret == 0; i++) {
+	for (i = 0; i < conf->geo.raid_disks && ret == 0; i++) {
 		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev && !test_bit(Faulty, &rdev->flags)) {
 			struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -979,7 +983,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	struct r10bio *r10_bio;
 	struct bio *read_bio;
 	int i;
-	int chunk_sects = conf->chunk_mask + 1;
+	sector_t chunk_mask = conf->geo.chunk_mask;
+	int chunk_sects = chunk_mask + 1;
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
@@ -997,9 +1002,9 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	/* If this request crosses a chunk boundary, we need to
 	 * split it.  This will only happen for 1 PAGE (or less) requests.
 	 */
-	if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)
-		      > chunk_sects &&
-		    conf->near_copies < conf->raid_disks)) {
+	if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9)
+		     > chunk_sects
+		     && conf->geo.near_copies < conf->geo.raid_disks)) {
 		struct bio_pair *bp;
 		/* Sanity check -- queue functions should prevent this happening */
 		if (bio->bi_vcnt != 1 ||
@@ -1368,19 +1373,19 @@ static void status(struct seq_file *seq, struct mddev *mddev)
 	struct r10conf *conf = mddev->private;
 	int i;
 
-	if (conf->near_copies < conf->raid_disks)
+	if (conf->geo.near_copies < conf->geo.raid_disks)
 		seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
-	if (conf->near_copies > 1)
-		seq_printf(seq, " %d near-copies", conf->near_copies);
-	if (conf->far_copies > 1) {
-		if (conf->far_offset)
-			seq_printf(seq, " %d offset-copies", conf->far_copies);
+	if (conf->geo.near_copies > 1)
+		seq_printf(seq, " %d near-copies", conf->geo.near_copies);
+	if (conf->geo.far_copies > 1) {
+		if (conf->geo.far_offset)
+			seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
 		else
-			seq_printf(seq, " %d far-copies", conf->far_copies);
+			seq_printf(seq, " %d far-copies", conf->geo.far_copies);
 	}
-	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
-					conf->raid_disks - mddev->degraded);
-	for (i = 0; i < conf->raid_disks; i++)
+	seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
+					conf->geo.raid_disks - mddev->degraded);
+	for (i = 0; i < conf->geo.raid_disks; i++)
 		seq_printf(seq, "%s",
 			      conf->mirrors[i].rdev &&
 			      test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
@@ -1403,7 +1408,7 @@ static int enough(struct r10conf *conf, int ignore)
 			if (conf->mirrors[first].rdev &&
 			    first != ignore)
 				cnt++;
-			first = (first+1) % conf->raid_disks;
+			first = (first+1) % conf->geo.raid_disks;
 		}
 		if (cnt == 0)
 			return 0;
@@ -1445,7 +1450,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 	       "md/raid10:%s: Disk failure on %s, disabling device.\n"
 	       "md/raid10:%s: Operation continuing on %d devices.\n",
 	       mdname(mddev), bdevname(rdev->bdev, b),
-	       mdname(mddev), conf->raid_disks - mddev->degraded);
+	       mdname(mddev), conf->geo.raid_disks - mddev->degraded);
 }
 
 static void print_conf(struct r10conf *conf)
@@ -1458,10 +1463,10 @@ static void print_conf(struct r10conf *conf)
 		printk(KERN_DEBUG "(!conf)\n");
 		return;
 	}
-	printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
-		conf->raid_disks);
+	printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
+		conf->geo.raid_disks);
 
-	for (i = 0; i < conf->raid_disks; i++) {
+	for (i = 0; i < conf->geo.raid_disks; i++) {
 		char b[BDEVNAME_SIZE];
 		tmp = conf->mirrors + i;
 		if (tmp->rdev)
@@ -1493,7 +1498,7 @@ static int raid10_spare_active(struct mddev *mddev)
 	 * Find all non-in_sync disks within the RAID10 configuration
 	 * and mark them in_sync
 	 */
-	for (i = 0; i < conf->raid_disks; i++) {
+	for (i = 0; i < conf->geo.raid_disks; i++) {
 		tmp = conf->mirrors + i;
 		if (tmp->replacement
 		    && tmp->replacement->recovery_offset == MaxSector
@@ -1535,7 +1540,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	int err = -EEXIST;
 	int mirror;
 	int first = 0;
-	int last = conf->raid_disks - 1;
+	int last = conf->geo.raid_disks - 1;
 	struct request_queue *q = bdev_get_queue(rdev->bdev);
 
 	if (mddev->recovery_cp < MaxSector)
@@ -2603,7 +2608,7 @@ static int init_resync(struct r10conf *conf)
 	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
 	BUG_ON(conf->r10buf_pool);
 	conf->have_replacement = 0;
-	for (i = 0; i < conf->raid_disks; i++)
+	for (i = 0; i < conf->geo.raid_disks; i++)
 		if (conf->mirrors[i].replacement)
 			conf->have_replacement = 1;
 	conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
@@ -2657,6 +2662,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 	sector_t sync_blocks;
 	sector_t sectors_skipped = 0;
 	int chunks_skipped = 0;
+	sector_t chunk_mask = conf->geo.chunk_mask;
 
 	if (!conf->r10buf_pool)
 		if (init_resync(conf))
@@ -2680,7 +2686,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 				bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
 						&sync_blocks, 1);
-			else for (i=0; i<conf->raid_disks; i++) {
+			else for (i = 0; i < conf->geo.raid_disks; i++) {
 				sector_t sect =
 					raid10_find_virt(conf, mddev->curr_resync, i);
 				bitmap_end_sync(mddev->bitmap, sect,
@@ -2694,7 +2700,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				/* Completed a full sync so the replacements
 				 * are now fully recovered.
 				 */
-				for (i = 0; i < conf->raid_disks; i++)
+				for (i = 0; i < conf->geo.raid_disks; i++)
 					if (conf->mirrors[i].replacement)
 						conf->mirrors[i].replacement
 							->recovery_offset
@@ -2707,7 +2713,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 		*skipped = 1;
 		return sectors_skipped;
 	}
-	if (chunks_skipped >= conf->raid_disks) {
+	if (chunks_skipped >= conf->geo.raid_disks) {
 		/* if there has been nothing to do on any drive,
 		 * then there is nothing to do at all..
 		 */
@@ -2721,9 +2727,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 	/* make sure whole request will fit in a chunk - if chunks
 	 * are meaningful
 	 */
-	if (conf->near_copies < conf->raid_disks &&
-	    max_sector > (sector_nr | conf->chunk_mask))
-		max_sector = (sector_nr | conf->chunk_mask) + 1;
+	if (conf->geo.near_copies < conf->geo.raid_disks &&
+	    max_sector > (sector_nr | chunk_mask))
+		max_sector = (sector_nr | chunk_mask) + 1;
 	/*
 	 * If there is non-resync activity waiting for us then
 	 * put in a delay to throttle resync.
@@ -2752,7 +2758,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 		int j;
 		r10_bio = NULL;
 
-		for (i=0 ; i<conf->raid_disks; i++) {
+		for (i = 0 ; i < conf->geo.raid_disks; i++) {
 			int still_degraded;
 			struct r10bio *rb2;
 			sector_t sect;
@@ -2806,7 +2812,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			/* Need to check if the array will still be
 			 * degraded
 			 */
-			for (j=0; j<conf->raid_disks; j++)
+			for (j = 0; j < conf->geo.raid_disks; j++)
 				if (conf->mirrors[j].rdev == NULL ||
 				    test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
 					still_degraded = 1;
@@ -2984,9 +2990,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 		r10_bio->sector = sector_nr;
 		set_bit(R10BIO_IsSync, &r10_bio->state);
 		raid10_find_phys(conf, r10_bio);
-		r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1;
+		r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
 
-		for (i=0; i<conf->copies; i++) {
+		for (i = 0; i < conf->copies; i++) {
 			int d = r10_bio->devs[i].devnum;
 			sector_t first_bad, sector;
 			int bad_sectors;
@@ -3152,16 +3158,16 @@ raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 	struct r10conf *conf = mddev->private;
 
 	if (!raid_disks)
-		raid_disks = conf->raid_disks;
+		raid_disks = conf->geo.raid_disks;
 	if (!sectors)
 		sectors = conf->dev_sectors;
 
-	size = sectors >> conf->chunk_shift;
-	sector_div(size, conf->far_copies);
+	size = sectors >> conf->geo.chunk_shift;
+	sector_div(size, conf->geo.far_copies);
 	size = size * raid_disks;
-	sector_div(size, conf->near_copies);
+	sector_div(size, conf->geo.near_copies);
 
-	return size << conf->chunk_shift;
+	return size << conf->geo.chunk_shift;
 }
 
 static void calc_sectors(struct r10conf *conf, sector_t size)
@@ -3171,10 +3177,10 @@ static void calc_sectors(struct r10conf *conf, sector_t size)
 	 * conf->stride
 	 */
 
-	size = size >> conf->chunk_shift;
-	sector_div(size, conf->far_copies);
-	size = size * conf->raid_disks;
-	sector_div(size, conf->near_copies);
+	size = size >> conf->geo.chunk_shift;
+	sector_div(size, conf->geo.far_copies);
+	size = size * conf->geo.raid_disks;
+	sector_div(size, conf->geo.near_copies);
 	/* 'size' is now the number of chunks in the array */
 	/* calculate "used chunks per device" */
 	size = size * conf->copies;
@@ -3182,15 +3188,15 @@ static void calc_sectors(struct r10conf *conf, sector_t size)
 	/* We need to round up when dividing by raid_disks to
 	 * get the stride size.
 	 */
-	size = DIV_ROUND_UP_SECTOR_T(size, conf->raid_disks);
+	size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
 
-	conf->dev_sectors = size << conf->chunk_shift;
+	conf->dev_sectors = size << conf->geo.chunk_shift;
 
-	if (conf->far_offset)
-		conf->stride = 1 << conf->chunk_shift;
+	if (conf->geo.far_offset)
+		conf->geo.stride = 1 << conf->geo.chunk_shift;
 	else {
-		sector_div(size, conf->far_copies);
-		conf->stride = size << conf->chunk_shift;
+		sector_div(size, conf->geo.far_copies);
+		conf->geo.stride = size << conf->geo.chunk_shift;
 	}
 }
 
@@ -3234,13 +3240,13 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 		goto out;
 
 
-	conf->raid_disks = mddev->raid_disks;
-	conf->near_copies = nc;
-	conf->far_copies = fc;
+	conf->geo.raid_disks = mddev->raid_disks;
+	conf->geo.near_copies = nc;
+	conf->geo.far_copies = fc;
 	conf->copies = nc*fc;
-	conf->far_offset = fo;
-	conf->chunk_mask = mddev->new_chunk_sectors - 1;
-	conf->chunk_shift = ffz(~mddev->new_chunk_sectors);
+	conf->geo.far_offset = fo;
+	conf->geo.chunk_mask = mddev->new_chunk_sectors - 1;
+	conf->geo.chunk_shift = ffz(~mddev->new_chunk_sectors);
 
 	conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
 					   r10bio_pool_free, conf);
@@ -3304,16 +3310,16 @@ static int run(struct mddev *mddev)
 
 	chunk_size = mddev->chunk_sectors << 9;
 	blk_queue_io_min(mddev->queue, chunk_size);
-	if (conf->raid_disks % conf->near_copies)
-		blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks);
+	if (conf->geo.raid_disks % conf->geo.near_copies)
+		blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
 	else
 		blk_queue_io_opt(mddev->queue, chunk_size *
-				 (conf->raid_disks / conf->near_copies));
+				 (conf->geo.raid_disks / conf->geo.near_copies));
 
 	rdev_for_each(rdev, mddev) {
 
 		disk_idx = rdev->raid_disk;
-		if (disk_idx >= conf->raid_disks
+		if (disk_idx >= conf->geo.raid_disks
 		    || disk_idx < 0)
 			continue;
 		disk = conf->mirrors + disk_idx;
@@ -3341,7 +3347,7 @@ static int run(struct mddev *mddev)
 	}
 
 	mddev->degraded = 0;
-	for (i = 0; i < conf->raid_disks; i++) {
+	for (i = 0; i < conf->geo.raid_disks; i++) {
 
 		disk = conf->mirrors + i;
 
@@ -3368,8 +3374,8 @@ static int run(struct mddev *mddev)
 		       mdname(mddev));
 	printk(KERN_INFO
 		"md/raid10:%s: active with %d out of %d devices\n",
-		mdname(mddev), conf->raid_disks - mddev->degraded,
-		conf->raid_disks);
+		mdname(mddev), conf->geo.raid_disks - mddev->degraded,
+		conf->geo.raid_disks);
 	/*
 	 * Ok, everything is just fine now
 	 */
@@ -3386,9 +3392,9 @@ static int run(struct mddev *mddev)
 	 * maybe...
 	 */
 	{
-		int stripe = conf->raid_disks *
+		int stripe = conf->geo.raid_disks *
 			((mddev->chunk_sectors << 9) / PAGE_SIZE);
-		stripe /= conf->near_copies;
+		stripe /= conf->geo.near_copies;
 		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
 			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
 	}
@@ -3460,7 +3466,7 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
 	struct r10conf *conf = mddev->private;
 	sector_t oldsize, size;
 
-	if (conf->far_copies > 1 && !conf->far_offset)
+	if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
 		return -EINVAL;
 
 	oldsize = raid10_size(mddev, 0, 0);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 7c615613c38..4c4942ac46f 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -14,33 +14,34 @@ struct mirror_info {
 struct r10conf {
 	struct mddev		*mddev;
 	struct mirror_info	*mirrors;
-	int			raid_disks;
 	spinlock_t		device_lock;
 
 	/* geometry */
-	int			near_copies;  /* number of copies laid out
+	struct geom {
+		int		raid_disks;
+		int		near_copies;  /* number of copies laid out
 					       * raid0 style */
-	int 			far_copies;   /* number of copies laid out
+		int		far_copies;   /* number of copies laid out
 					       * at large strides across drives
 					       */
-	int			far_offset;   /* far_copies are offset by 1
+		int		far_offset;   /* far_copies are offset by 1
 					       * stripe instead of many
 					       */
-	int			copies;	      /* near_copies * far_copies.
-					       * must be <= raid_disks
-					       */
-	sector_t		stride;	      /* distance between far copies.
+		sector_t	stride;	      /* distance between far copies.
 					       * This is size / far_copies unless
 					       * far_offset, in which case it is
 					       * 1 stripe.
 					       */
+		int		chunk_shift; /* shift from chunks to sectors */
+		sector_t	chunk_mask;
+	} geo;
+	int			copies;	      /* near_copies * far_copies.
+					       * must be <= raid_disks
+					       */
 
 	sector_t		dev_sectors;  /* temp copy of
 					       * mddev->dev_sectors */
 
-	int			chunk_shift; /* shift from chunks to sectors */
-	sector_t		chunk_mask;
-
 	struct list_head	retry_list;
 	/* queue pending writes and submit them on unplug */
 	struct bio_list		pending_bio_list;

From 1fdd6fc92f330b81604c9a4f892f713775a9751a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 21 May 2012 09:28:32 +1000
Subject: [PATCH 07/56] md: teach sync_page_io about new_data_offset.

Some code in raid1 and raid10 use sync_page_io to
read/write pages when responding to read errors.
As we will shortly support changing data_offset for
raid10, this function must understand new_data_offset.

So add that understanding.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9fa98fc74b0..c6b8d9fd0b0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -888,6 +888,10 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 		rdev->meta_bdev : rdev->bdev;
 	if (metadata_op)
 		bio->bi_sector = sector + rdev->sb_start;
+	else if (rdev->mddev->reshape_position != MaxSector &&
+		 (rdev->mddev->reshape_backwards ==
+		  (sector >= rdev->mddev->reshape_position)))
+		bio->bi_sector = sector + rdev->new_data_offset;
 	else
 		bio->bi_sector = sector + rdev->data_offset;
 	bio_add_page(bio, page, size, 0);

From c804cdecea418c067ee7359d62139b2b3c8cec39 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 21 May 2012 09:28:33 +1000
Subject: [PATCH 08/56] md: use resync_max_sectors for reshape as well as
 resync.

Some resync type operations need to act on the address space of the
device, others on the address space of the array.

This only affects RAID10, so it sets resync_max_sectors to the array
size (it defaults to the device size), and that is currently used for
resync only.  However reshape of a RAID10 must be done against the
array size, not device size, so change code to use resync_max_sectors
for both the resync and the reshape cases.
This does not affect RAID5 or RAID1, just RAID10.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index c6b8d9fd0b0..8fe1abf1b89 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4425,7 +4425,8 @@ sync_completed_show(struct mddev *mddev, char *page)
 	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		return sprintf(page, "none\n");
 
-	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
+	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		max_sectors = mddev->resync_max_sectors;
 	else
 		max_sectors = mddev->dev_sectors;
@@ -6803,7 +6804,8 @@ static void status_resync(struct seq_file *seq, struct mddev * mddev)
 
 	resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
 
-	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
+	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		max_sectors = mddev->resync_max_sectors;
 	else
 		max_sectors = mddev->dev_sectors;
@@ -7366,7 +7368,7 @@ void md_do_sync(struct mddev *mddev)
 			j = mddev->recovery_cp;
 
 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
-		max_sectors = mddev->dev_sectors;
+		max_sectors = mddev->resync_max_sectors;
 	else {
 		/* recovery follows the physical size of devices */
 		max_sectors = mddev->dev_sectors;

From f8c9e74ff0832f2244d7991d2aea13851b20a622 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 21 May 2012 09:28:33 +1000
Subject: [PATCH 09/56] md/raid10: Introduce 'prev' geometry to support
 reshape.

When RAID10 supports reshape it will need a 'previous' and a 'current'
geometry, so introduce that here.
Use the 'prev' geometry when before the reshape_position, and the
current 'geo' when beyond it.  At other times, use both as
appropriate.

For now, both are identical (And reshape_position is never set).

When we use the 'prev' geometry, we must use the old data_offset.
When we use the current (And a reshape is happening) we must use
the new_data_offset.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 107 +++++++++++++++++++++++++++++++++++---------
 drivers/md/raid10.h |   8 +++-
 2 files changed, 92 insertions(+), 23 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 36f445f9e11..1c90005ab34 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -504,15 +504,13 @@ static void raid10_end_write_request(struct bio *bio, int error)
  * sector offset to a virtual address
  */
 
-static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
+static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
 {
 	int n,f;
 	sector_t sector;
 	sector_t chunk;
 	sector_t stripe;
 	int dev;
-	struct geom *geo = &conf->geo;
-
 	int slot = 0;
 
 	/* now calculate first sector/dev */
@@ -550,12 +548,29 @@ static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
 			sector += (geo->chunk_mask + 1);
 		}
 	}
-	BUG_ON(slot != conf->copies);
+}
+
+static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
+{
+	struct geom *geo = &conf->geo;
+
+	if (conf->reshape_progress != MaxSector &&
+	    ((r10bio->sector >= conf->reshape_progress) !=
+	     conf->mddev->reshape_backwards)) {
+		set_bit(R10BIO_Previous, &r10bio->state);
+		geo = &conf->prev;
+	} else
+		clear_bit(R10BIO_Previous, &r10bio->state);
+
+	__raid10_find_phys(geo, r10bio);
 }
 
 static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 {
 	sector_t offset, chunk, vchunk;
+	/* Never use conf->prev as this is only called during resync
+	 * or recovery, so reshape isn't happening
+	 */
 	struct geom *geo = &conf->geo;
 
 	offset = sector & geo->chunk_mask;
@@ -603,6 +618,11 @@ static int raid10_mergeable_bvec(struct request_queue *q,
 	unsigned int bio_sectors = bvm->bi_size >> 9;
 	struct geom *geo = &conf->geo;
 
+	if (conf->reshape_progress != MaxSector &&
+	    ((sector >= conf->reshape_progress) !=
+	     conf->mddev->reshape_backwards))
+		geo = &conf->prev;
+
 	if (geo->near_copies < geo->raid_disks) {
 		max = (chunk_sectors - ((sector & (chunk_sectors - 1))
 					+ bio_sectors)) << 9;
@@ -617,6 +637,12 @@ static int raid10_mergeable_bvec(struct request_queue *q,
 	if (mddev->merge_check_needed) {
 		struct r10bio r10_bio;
 		int s;
+		if (conf->reshape_progress != MaxSector) {
+			/* Cannot give any guidance during reshape */
+			if (max <= biovec->bv_len && bio_sectors == 0)
+				return biovec->bv_len;
+			return 0;
+		}
 		r10_bio.sector = sector;
 		raid10_find_phys(conf, &r10_bio);
 		rcu_read_lock();
@@ -816,7 +842,10 @@ static int raid10_congested(void *data, int bits)
 	if (mddev_congested(mddev, bits))
 		return 1;
 	rcu_read_lock();
-	for (i = 0; i < conf->geo.raid_disks && ret == 0; i++) {
+	for (i = 0;
+	     (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
+		     && ret == 0;
+	     i++) {
 		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev && !test_bit(Faulty, &rdev->flags)) {
 			struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -977,13 +1006,23 @@ static void unfreeze_array(struct r10conf *conf)
 	spin_unlock_irq(&conf->resync_lock);
 }
 
+static sector_t choose_data_offset(struct r10bio *r10_bio,
+				   struct md_rdev *rdev)
+{
+	if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
+	    test_bit(R10BIO_Previous, &r10_bio->state))
+		return rdev->data_offset;
+	else
+		return rdev->new_data_offset;
+}
+
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
 	struct r10conf *conf = mddev->private;
 	struct r10bio *r10_bio;
 	struct bio *read_bio;
 	int i;
-	sector_t chunk_mask = conf->geo.chunk_mask;
+	sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
 	int chunk_sects = chunk_mask + 1;
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
@@ -1004,7 +1043,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	 */
 	if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9)
 		     > chunk_sects
-		     && conf->geo.near_copies < conf->geo.raid_disks)) {
+		     && (conf->geo.near_copies < conf->geo.raid_disks
+			 || conf->prev.near_copies < conf->prev.raid_disks))) {
 		struct bio_pair *bp;
 		/* Sanity check -- queue functions should prevent this happening */
 		if (bio->bi_vcnt != 1 ||
@@ -1098,7 +1138,7 @@ read_again:
 		r10_bio->devs[slot].rdev = rdev;
 
 		read_bio->bi_sector = r10_bio->devs[slot].addr +
-			rdev->data_offset;
+			choose_data_offset(r10_bio, rdev);
 		read_bio->bi_bdev = rdev->bdev;
 		read_bio->bi_end_io = raid10_end_read_request;
 		read_bio->bi_rw = READ | do_sync;
@@ -1302,7 +1342,8 @@ retry_write:
 		r10_bio->devs[i].bio = mbio;
 
 		mbio->bi_sector	= (r10_bio->devs[i].addr+
-				   conf->mirrors[d].rdev->data_offset);
+				   choose_data_offset(r10_bio,
+						      conf->mirrors[d].rdev));
 		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
 		mbio->bi_end_io	= raid10_end_write_request;
 		mbio->bi_rw = WRITE | do_sync | do_fua;
@@ -1326,8 +1367,10 @@ retry_write:
 		 * so it cannot disappear, so the replacement cannot
 		 * become NULL here
 		 */
-		mbio->bi_sector	= (r10_bio->devs[i].addr+
-				   conf->mirrors[d].replacement->data_offset);
+		mbio->bi_sector	= (r10_bio->devs[i].addr +
+				   choose_data_offset(
+					   r10_bio,
+					   conf->mirrors[d].replacement));
 		mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
 		mbio->bi_end_io	= raid10_end_write_request;
 		mbio->bi_rw = WRITE | do_sync | do_fua;
@@ -1397,7 +1440,7 @@ static void status(struct seq_file *seq, struct mddev *mddev)
  * Don't consider the device numbered 'ignore'
  * as we might be about to remove it.
  */
-static int enough(struct r10conf *conf, int ignore)
+static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
 {
 	int first = 0;
 
@@ -1408,7 +1451,7 @@ static int enough(struct r10conf *conf, int ignore)
 			if (conf->mirrors[first].rdev &&
 			    first != ignore)
 				cnt++;
-			first = (first+1) % conf->geo.raid_disks;
+			first = (first+1) % geo->raid_disks;
 		}
 		if (cnt == 0)
 			return 0;
@@ -1416,6 +1459,12 @@ static int enough(struct r10conf *conf, int ignore)
 	return 1;
 }
 
+static int enough(struct r10conf *conf, int ignore)
+{
+	return _enough(conf, &conf->geo, ignore) &&
+		_enough(conf, &conf->prev, ignore);
+}
+
 static void error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char b[BDEVNAME_SIZE];
@@ -1548,7 +1597,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * very different from resync
 		 */
 		return -EBUSY;
-	if (rdev->saved_raid_disk < 0 && !enough(conf, -1))
+	if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1))
 		return -EINVAL;
 
 	if (rdev->raid_disk >= 0)
@@ -2223,7 +2272,9 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 				       " (%d sectors at %llu on %s)\n",
 				       mdname(mddev), s,
 				       (unsigned long long)(
-					       sect + rdev->data_offset),
+					       sect +
+					       choose_data_offset(r10_bio,
+								  rdev)),
 				       bdevname(rdev->bdev, b));
 				printk(KERN_NOTICE "md/raid10:%s: %s: failing "
 				       "drive\n",
@@ -2261,7 +2312,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 				       " (%d sectors at %llu on %s)\n",
 				       mdname(mddev), s,
 				       (unsigned long long)(
-					       sect + rdev->data_offset),
+					       sect +
+					       choose_data_offset(r10_bio, rdev)),
 				       bdevname(rdev->bdev, b));
 				printk(KERN_NOTICE "md/raid10:%s: %s: failing "
 				       "drive\n",
@@ -2274,7 +2326,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 				       " (%d sectors at %llu on %s)\n",
 				       mdname(mddev), s,
 				       (unsigned long long)(
-					       sect + rdev->data_offset),
+					       sect +
+					       choose_data_offset(r10_bio, rdev)),
 				       bdevname(rdev->bdev, b));
 				atomic_add(s, &rdev->corrected_errors);
 			}
@@ -2348,7 +2401,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
 		wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 		md_trim_bio(wbio, sector - bio->bi_sector, sectors);
 		wbio->bi_sector = (r10_bio->devs[i].addr+
-				   rdev->data_offset+
+				   choose_data_offset(r10_bio, rdev) +
 				   (sector - r10_bio->sector));
 		wbio->bi_bdev = rdev->bdev;
 		if (submit_bio_wait(WRITE, wbio) == 0)
@@ -2425,7 +2478,7 @@ read_more:
 	r10_bio->devs[slot].bio = bio;
 	r10_bio->devs[slot].rdev = rdev;
 	bio->bi_sector = r10_bio->devs[slot].addr
-		+ rdev->data_offset;
+		+ choose_data_offset(r10_bio, rdev);
 	bio->bi_bdev = rdev->bdev;
 	bio->bi_rw = READ | do_sync;
 	bio->bi_private = r10_bio;
@@ -3254,6 +3307,8 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 		goto out;
 
 	calc_sectors(conf, mddev->dev_sectors);
+	conf->prev = conf->geo;
+	conf->reshape_progress = MaxSector;
 
 	spin_lock_init(&conf->device_lock);
 	INIT_LIST_HEAD(&conf->retry_list);
@@ -3319,8 +3374,10 @@ static int run(struct mddev *mddev)
 	rdev_for_each(rdev, mddev) {
 
 		disk_idx = rdev->raid_disk;
-		if (disk_idx >= conf->geo.raid_disks
-		    || disk_idx < 0)
+		if (disk_idx < 0)
+			continue;
+		if (disk_idx >= conf->geo.raid_disks &&
+		    disk_idx >= conf->prev.raid_disks)
 			continue;
 		disk = conf->mirrors + disk_idx;
 
@@ -3347,7 +3404,10 @@ static int run(struct mddev *mddev)
 	}
 
 	mddev->degraded = 0;
-	for (i = 0; i < conf->geo.raid_disks; i++) {
+	for (i = 0;
+	     i < conf->geo.raid_disks
+		     || i < conf->prev.raid_disks;
+	     i++) {
 
 		disk = conf->mirrors + i;
 
@@ -3466,6 +3526,9 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
 	struct r10conf *conf = mddev->private;
 	sector_t oldsize, size;
 
+	if (mddev->reshape_position != MaxSector)
+		return -EBUSY;
+
 	if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
 		return -EINVAL;
 
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 4c4942ac46f..37509d7134a 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -34,13 +34,14 @@ struct r10conf {
 					       */
 		int		chunk_shift; /* shift from chunks to sectors */
 		sector_t	chunk_mask;
-	} geo;
+	} prev, geo;
 	int			copies;	      /* near_copies * far_copies.
 					       * must be <= raid_disks
 					       */
 
 	sector_t		dev_sectors;  /* temp copy of
 					       * mddev->dev_sectors */
+	sector_t		reshape_progress;
 
 	struct list_head	retry_list;
 	/* queue pending writes and submit them on unplug */
@@ -147,5 +148,10 @@ enum r10bio_state {
  */
 	R10BIO_MadeGood,
 	R10BIO_WriteError,
+/* During a reshape we might be performing IO on the
+ * 'previous' part of the array, in which case this
+ * flag is set
+ */
+	R10BIO_Previous,
 };
 #endif

From deb200d08590622d987718135a1e6323f83154aa Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 21 May 2012 09:28:33 +1000
Subject: [PATCH 10/56] md/raid10: split out interpretation of layout to
 separate function.

We will soon be interpreting the layout (and chunksize etc) from
multiple places to support reshape.  So split it out into separate
function.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 67 +++++++++++++++++++++++++++++++++------------
 1 file changed, 49 insertions(+), 18 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1c90005ab34..f102e88fc78 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3253,26 +3253,64 @@ static void calc_sectors(struct r10conf *conf, sector_t size)
 	}
 }
 
+enum geo_type {geo_new, geo_old, geo_start};
+static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
+{
+	int nc, fc, fo;
+	int layout, chunk, disks;
+	switch (new) {
+	case geo_old:
+		layout = mddev->layout;
+		chunk = mddev->chunk_sectors;
+		disks = mddev->raid_disks - mddev->delta_disks;
+		break;
+	case geo_new:
+		layout = mddev->new_layout;
+		chunk = mddev->new_chunk_sectors;
+		disks = mddev->raid_disks;
+		break;
+	default: /* avoid 'may be unused' warnings */
+	case geo_start: /* new when starting reshape - raid_disks not
+			 * updated yet. */
+		layout = mddev->new_layout;
+		chunk = mddev->new_chunk_sectors;
+		disks = mddev->raid_disks + mddev->delta_disks;
+		break;
+	}
+	if (layout >> 17)
+		return -1;
+	if (chunk < (PAGE_SIZE >> 9) ||
+	    !is_power_of_2(chunk))
+		return -2;
+	nc = layout & 255;
+	fc = (layout >> 8) & 255;
+	fo = layout & (1<<16);
+	geo->raid_disks = disks;
+	geo->near_copies = nc;
+	geo->far_copies = fc;
+	geo->far_offset = fo;
+	geo->chunk_mask = chunk - 1;
+	geo->chunk_shift = ffz(~chunk);
+	return nc*fc;
+}
+
 static struct r10conf *setup_conf(struct mddev *mddev)
 {
 	struct r10conf *conf = NULL;
-	int nc, fc, fo;
 	int err = -EINVAL;
+	struct geom geo;
+	int copies;
 
-	if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) ||
-	    !is_power_of_2(mddev->new_chunk_sectors)) {
+	copies = setup_geo(&geo, mddev, geo_new);
+
+	if (copies == -2) {
 		printk(KERN_ERR "md/raid10:%s: chunk size must be "
 		       "at least PAGE_SIZE(%ld) and be a power of 2.\n",
 		       mdname(mddev), PAGE_SIZE);
 		goto out;
 	}
 
-	nc = mddev->new_layout & 255;
-	fc = (mddev->new_layout >> 8) & 255;
-	fo = mddev->new_layout & (1<<16);
-
-	if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
-	    (mddev->new_layout >> 17)) {
+	if (copies < 2 || copies > mddev->raid_disks) {
 		printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
 		       mdname(mddev), mddev->new_layout);
 		goto out;
@@ -3292,15 +3330,8 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 	if (!conf->tmppage)
 		goto out;
 
-
-	conf->geo.raid_disks = mddev->raid_disks;
-	conf->geo.near_copies = nc;
-	conf->geo.far_copies = fc;
-	conf->copies = nc*fc;
-	conf->geo.far_offset = fo;
-	conf->geo.chunk_mask = mddev->new_chunk_sectors - 1;
-	conf->geo.chunk_shift = ffz(~mddev->new_chunk_sectors);
-
+	conf->geo = geo;
+	conf->copies = copies;
 	conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
 					   r10bio_pool_free, conf);
 	if (!conf->r10bio_pool)

From 3ea7daa5d7fde47cd41f4d56c2deb949114da9d6 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:53:47 +1000
Subject: [PATCH 11/56] md/raid10: add reshape support

A 'near' or 'offset' lay RAID10 array can be reshaped to a different
'near' or 'offset' layout, a different chunk size, and a different
number of devices.
However the number of copies cannot change.

Unlike RAID5/6, we do not support having user-space backup data that
is being relocated during a 'critical section'.  Rather, the
data_offset of each device must change so that when writing any block
to a new location, it will not over-write any data that is still
'live'.

This means that RAID10 reshape is not supportable on v0.90 metadata.

The different between the old data_offset and the new_offset must be
at least the larger of the chunksize multiplied by offset copies of
each of the old and new layout. (for 'near' mode, offset_copies == 1).

A larger difference of around 64M seems useful for in-place reshapes
as more data can be moved between metadata updates.
Very large differences (e.g. 512M) seem to slow the process down due
to lots of long seeks (on oldish consumer graded devices at least).

Metadata needs to be updated whenever the place we are about to write
to is considered - by the current metadata - to still contain data in
the old layout.

[unbalanced locking fix from Dan Carpenter <dan.carpenter@oracle.com>]

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 890 ++++++++++++++++++++++++++++++++++++++++++--
 drivers/md/raid10.h |   5 +
 2 files changed, 872 insertions(+), 23 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f102e88fc78..ec271ae4318 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -24,6 +24,7 @@
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/ratelimit.h>
+#include <linux/kthread.h>
 #include "md.h"
 #include "raid10.h"
 #include "raid0.h"
@@ -68,6 +69,11 @@ static int max_queued_requests = 1024;
 static void allow_barrier(struct r10conf *conf);
 static void lower_barrier(struct r10conf *conf);
 static int enough(struct r10conf *conf, int ignore);
+static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
+				int *skipped);
+static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
+static void end_reshape_write(struct bio *bio, int error);
+static void end_reshape(struct r10conf *conf);
 
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
@@ -112,7 +118,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 	if (!r10_bio)
 		return NULL;
 
-	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
+	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
+	    test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
 		nalloc = conf->copies; /* resync */
 	else
 		nalloc = 2; /* recovery */
@@ -140,9 +147,10 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 		struct bio *rbio = r10_bio->devs[j].repl_bio;
 		bio = r10_bio->devs[j].bio;
 		for (i = 0; i < RESYNC_PAGES; i++) {
-			if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
-						&conf->mddev->recovery)) {
-				/* we can share bv_page's during recovery */
+			if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
+					       &conf->mddev->recovery)) {
+				/* we can share bv_page's during recovery
+				 * and reshape */
 				struct bio *rbio = r10_bio->devs[0].bio;
 				page = rbio->bi_io_vec[i].bv_page;
 				get_page(page);
@@ -614,10 +622,11 @@ static int raid10_mergeable_bvec(struct request_queue *q,
 	struct r10conf *conf = mddev->private;
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 	int max;
-	unsigned int chunk_sectors = mddev->chunk_sectors;
+	unsigned int chunk_sectors;
 	unsigned int bio_sectors = bvm->bi_size >> 9;
 	struct geom *geo = &conf->geo;
 
+	chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
 	if (conf->reshape_progress != MaxSector &&
 	    ((sector >= conf->reshape_progress) !=
 	     conf->mddev->reshape_backwards))
@@ -1032,6 +1041,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	int plugged;
 	int sectors_handled;
 	int max_sectors;
+	int sectors;
 
 	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bio);
@@ -1096,10 +1106,41 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	 */
 	wait_barrier(conf);
 
+	sectors = bio->bi_size >> 9;
+	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+	    bio->bi_sector < conf->reshape_progress &&
+	    bio->bi_sector + sectors > conf->reshape_progress) {
+		/* IO spans the reshape position.  Need to wait for
+		 * reshape to pass
+		 */
+		allow_barrier(conf);
+		wait_event(conf->wait_barrier,
+			   conf->reshape_progress <= bio->bi_sector ||
+			   conf->reshape_progress >= bio->bi_sector + sectors);
+		wait_barrier(conf);
+	}
+	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+	    bio_data_dir(bio) == WRITE &&
+	    (mddev->reshape_backwards
+	     ? (bio->bi_sector < conf->reshape_safe &&
+		bio->bi_sector + sectors > conf->reshape_progress)
+	     : (bio->bi_sector + sectors > conf->reshape_safe &&
+		bio->bi_sector < conf->reshape_progress))) {
+		/* Need to update reshape_position in metadata */
+		mddev->reshape_position = conf->reshape_progress;
+		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		set_bit(MD_CHANGE_PENDING, &mddev->flags);
+		md_wakeup_thread(mddev->thread);
+		wait_event(mddev->sb_wait,
+			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
+
+		conf->reshape_safe = mddev->reshape_position;
+	}
+
 	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 
 	r10_bio->master_bio = bio;
-	r10_bio->sectors = bio->bi_size >> 9;
+	r10_bio->sectors = sectors;
 
 	r10_bio->mddev = mddev;
 	r10_bio->sector = bio->bi_sector;
@@ -1730,7 +1771,11 @@ static void end_sync_read(struct bio *bio, int error)
 	struct r10conf *conf = r10_bio->mddev->private;
 	int d;
 
-	d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
+	if (bio == r10_bio->master_bio) {
+		/* this is a reshape read */
+		d = r10_bio->read_slot; /* really the read dev */
+	} else
+		d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
 
 	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -2631,6 +2676,8 @@ static void raid10d(struct mddev *mddev)
 		if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
 		    test_bit(R10BIO_WriteError, &r10_bio->state))
 			handle_write_completed(conf, r10_bio);
+		else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
+			reshape_request_write(mddev, r10_bio);
 		else if (test_bit(R10BIO_IsSync, &r10_bio->state))
 			sync_request_write(mddev, r10_bio);
 		else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
@@ -2723,7 +2770,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 
  skipped:
 	max_sector = mddev->dev_sectors;
-	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
+	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		max_sector = mddev->resync_max_sectors;
 	if (sector_nr >= max_sector) {
 		/* If we aborted, we need to abort the
@@ -2735,6 +2783,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 		 * we need to convert that to several
 		 * virtual addresses.
 		 */
+		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
+			end_reshape(conf);
+			return 0;
+		}
+
 		if (mddev->curr_resync < max_sector) { /* aborted */
 			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 				bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
@@ -2766,6 +2819,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 		*skipped = 1;
 		return sectors_skipped;
 	}
+
+	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+		return reshape_request(mddev, sector_nr, skipped);
+
 	if (chunks_skipped >= conf->geo.raid_disks) {
 		/* if there has been nothing to do on any drive,
 		 * then there is nothing to do at all..
@@ -3211,7 +3268,8 @@ raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 	struct r10conf *conf = mddev->private;
 
 	if (!raid_disks)
-		raid_disks = conf->geo.raid_disks;
+		raid_disks = min(conf->geo.raid_disks,
+				 conf->prev.raid_disks);
 	if (!sectors)
 		sectors = conf->dev_sectors;
 
@@ -3321,7 +3379,9 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 	if (!conf)
 		goto out;
 
-	conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
+	/* FIXME calc properly */
+	conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks +
+							    max(0,mddev->delta_disks)),
 				GFP_KERNEL);
 	if (!conf->mirrors)
 		goto out;
@@ -3338,9 +3398,21 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 		goto out;
 
 	calc_sectors(conf, mddev->dev_sectors);
-	conf->prev = conf->geo;
-	conf->reshape_progress = MaxSector;
-
+	if (mddev->reshape_position == MaxSector) {
+		conf->prev = conf->geo;
+		conf->reshape_progress = MaxSector;
+	} else {
+		if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
+			err = -EINVAL;
+			goto out;
+		}
+		conf->reshape_progress = mddev->reshape_position;
+		if (conf->prev.far_offset)
+			conf->prev.stride = 1 << conf->prev.chunk_shift;
+		else
+			/* far_copies must be 1 */
+			conf->prev.stride = conf->dev_sectors;
+	}
 	spin_lock_init(&conf->device_lock);
 	INIT_LIST_HEAD(&conf->retry_list);
 
@@ -3355,8 +3427,9 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 	return conf;
 
  out:
-	printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
-	       mdname(mddev));
+	if (err == -ENOMEM)
+		printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
+		       mdname(mddev));
 	if (conf) {
 		if (conf->r10bio_pool)
 			mempool_destroy(conf->r10bio_pool);
@@ -3374,12 +3447,8 @@ static int run(struct mddev *mddev)
 	struct mirror_info *disk;
 	struct md_rdev *rdev;
 	sector_t size;
-
-	/*
-	 * copy the already verified devices into our private RAID10
-	 * bookkeeping area. [whatever we allocate in run(),
-	 * should be freed in stop()]
-	 */
+	sector_t min_offset_diff = 0;
+	int first = 1;
 
 	if (mddev->private == NULL) {
 		conf = setup_conf(mddev);
@@ -3403,6 +3472,7 @@ static int run(struct mddev *mddev)
 				 (conf->geo.raid_disks / conf->geo.near_copies));
 
 	rdev_for_each(rdev, mddev) {
+		long long diff;
 
 		disk_idx = rdev->raid_disk;
 		if (disk_idx < 0)
@@ -3421,12 +3491,20 @@ static int run(struct mddev *mddev)
 				goto out_free_conf;
 			disk->rdev = rdev;
 		}
+		diff = (rdev->new_data_offset - rdev->data_offset);
+		if (!mddev->reshape_backwards)
+			diff = -diff;
+		if (diff < 0)
+			diff = 0;
+		if (first || diff < min_offset_diff)
+			min_offset_diff = diff;
 
 		disk_stack_limits(mddev->gendisk, rdev->bdev,
 				  rdev->data_offset << 9);
 
 		disk->head_position = 0;
 	}
+
 	/* need to check that every block has at least one working mirror */
 	if (!enough(conf, -1)) {
 		printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
@@ -3434,6 +3512,16 @@ static int run(struct mddev *mddev)
 		goto out_free_conf;
 	}
 
+	if (conf->reshape_progress != MaxSector) {
+		/* must ensure that shape change is supported */
+		if (conf->geo.far_copies != 1 &&
+		    conf->geo.far_offset == 0)
+			goto out_free_conf;
+		if (conf->prev.far_copies != 1 &&
+		    conf->geo.far_offset == 0)
+			goto out_free_conf;
+	}
+
 	mddev->degraded = 0;
 	for (i = 0;
 	     i < conf->geo.raid_disks
@@ -3486,8 +3574,8 @@ static int run(struct mddev *mddev)
 		int stripe = conf->geo.raid_disks *
 			((mddev->chunk_sectors << 9) / PAGE_SIZE);
 		stripe /= conf->geo.near_copies;
-		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
-			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
+		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
+			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
 	}
 
 	blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
@@ -3495,6 +3583,30 @@ static int run(struct mddev *mddev)
 	if (md_integrity_register(mddev))
 		goto out_free_conf;
 
+	if (conf->reshape_progress != MaxSector) {
+		unsigned long before_length, after_length;
+
+		before_length = ((1 << conf->prev.chunk_shift) *
+				 conf->prev.far_copies);
+		after_length = ((1 << conf->geo.chunk_shift) *
+				conf->geo.far_copies);
+
+		if (max(before_length, after_length) > min_offset_diff) {
+			/* This cannot work */
+			printk("md/raid10: offset difference not enough to continue reshape\n");
+			goto out_free_conf;
+		}
+		conf->offset_diff = min_offset_diff;
+
+		conf->reshape_safe = conf->reshape_progress;
+		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
+							"reshape");
+	}
+
 	return 0;
 
 out_free_conf:
@@ -3634,6 +3746,735 @@ static void *raid10_takeover(struct mddev *mddev)
 	return ERR_PTR(-EINVAL);
 }
 
+static int raid10_check_reshape(struct mddev *mddev)
+{
+	/* Called when there is a request to change
+	 * - layout (to ->new_layout)
+	 * - chunk size (to ->new_chunk_sectors)
+	 * - raid_disks (by delta_disks)
+	 * or when trying to restart a reshape that was ongoing.
+	 *
+	 * We need to validate the request and possibly allocate
+	 * space if that might be an issue later.
+	 *
+	 * Currently we reject any reshape of a 'far' mode array,
+	 * allow chunk size to change if new is generally acceptable,
+	 * allow raid_disks to increase, and allow
+	 * a switch between 'near' mode and 'offset' mode.
+	 */
+	struct r10conf *conf = mddev->private;
+	struct geom geo;
+
+	if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
+		return -EINVAL;
+
+	if (setup_geo(&geo, mddev, geo_start) != conf->copies)
+		/* mustn't change number of copies */
+		return -EINVAL;
+	if (geo.far_copies > 1 && !geo.far_offset)
+		/* Cannot switch to 'far' mode */
+		return -EINVAL;
+
+	if (mddev->array_sectors & geo.chunk_mask)
+			/* not factor of array size */
+			return -EINVAL;
+
+	if (mddev->bitmap)
+		return -EBUSY;
+	if (!enough(conf, -1))
+		return -EINVAL;
+
+	kfree(conf->mirrors_new);
+	conf->mirrors_new = NULL;
+	if (mddev->delta_disks > 0) {
+		/* allocate new 'mirrors' list */
+		conf->mirrors_new = kzalloc(
+			sizeof(struct mirror_info)
+			*(mddev->raid_disks +
+			  mddev->delta_disks),
+			GFP_KERNEL);
+		if (!conf->mirrors_new)
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+/*
+ * Need to check if array has failed when deciding whether to:
+ *  - start an array
+ *  - remove non-faulty devices
+ *  - add a spare
+ *  - allow a reshape
+ * This determination is simple when no reshape is happening.
+ * However if there is a reshape, we need to carefully check
+ * both the before and after sections.
+ * This is because some failed devices may only affect one
+ * of the two sections, and some non-in_sync devices may
+ * be insync in the section most affected by failed devices.
+ */
+static int calc_degraded(struct r10conf *conf)
+{
+	int degraded, degraded2;
+	int i;
+
+	rcu_read_lock();
+	degraded = 0;
+	/* 'prev' section first */
+	for (i = 0; i < conf->prev.raid_disks; i++) {
+		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			degraded++;
+		else if (!test_bit(In_sync, &rdev->flags))
+			/* When we can reduce the number of devices in
+			 * an array, this might not contribute to
+			 * 'degraded'.  It does now.
+			 */
+			degraded++;
+	}
+	rcu_read_unlock();
+	if (conf->geo.raid_disks == conf->prev.raid_disks)
+		return degraded;
+	rcu_read_lock();
+	degraded2 = 0;
+	for (i = 0; i < conf->geo.raid_disks; i++) {
+		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			degraded2++;
+		else if (!test_bit(In_sync, &rdev->flags)) {
+			/* If reshape is increasing the number of devices,
+			 * this section has already been recovered, so
+			 * it doesn't contribute to degraded.
+			 * else it does.
+			 */
+			if (conf->geo.raid_disks <= conf->prev.raid_disks)
+				degraded2++;
+		}
+	}
+	rcu_read_unlock();
+	if (degraded2 > degraded)
+		return degraded2;
+	return degraded;
+}
+
+static int raid10_start_reshape(struct mddev *mddev)
+{
+	/* A 'reshape' has been requested. This commits
+	 * the various 'new' fields and sets MD_RECOVER_RESHAPE
+	 * This also checks if there are enough spares and adds them
+	 * to the array.
+	 * We currently require enough spares to make the final
+	 * array non-degraded.  We also require that the difference
+	 * between old and new data_offset - on each device - is
+	 * enough that we never risk over-writing.
+	 */
+
+	unsigned long before_length, after_length;
+	sector_t min_offset_diff = 0;
+	int first = 1;
+	struct geom new;
+	struct r10conf *conf = mddev->private;
+	struct md_rdev *rdev;
+	int spares = 0;
+
+	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+		return -EBUSY;
+
+	if (setup_geo(&new, mddev, geo_start) != conf->copies)
+		return -EINVAL;
+
+	before_length = ((1 << conf->prev.chunk_shift) *
+			 conf->prev.far_copies);
+	after_length = ((1 << conf->geo.chunk_shift) *
+			conf->geo.far_copies);
+
+	rdev_for_each(rdev, mddev) {
+		if (!test_bit(In_sync, &rdev->flags)
+		    && !test_bit(Faulty, &rdev->flags))
+			spares++;
+		if (rdev->raid_disk >= 0) {
+			long long diff = (rdev->new_data_offset
+					  - rdev->data_offset);
+			if (!mddev->reshape_backwards)
+				diff = -diff;
+			if (diff < 0)
+				diff = 0;
+			if (first || diff < min_offset_diff)
+				min_offset_diff = diff;
+		}
+	}
+
+	if (max(before_length, after_length) > min_offset_diff)
+		return -EINVAL;
+
+	if (spares < mddev->delta_disks)
+		return -EINVAL;
+
+	conf->offset_diff = min_offset_diff;
+	spin_lock_irq(&conf->device_lock);
+	if (conf->mirrors_new) {
+		memcpy(conf->mirrors_new, conf->mirrors,
+		       sizeof(struct mirror_info)*conf->prev.raid_disks);
+		smp_mb();
+		kfree(conf->mirrors_old); /* FIXME and elsewhere */
+		conf->mirrors_old = conf->mirrors;
+		conf->mirrors = conf->mirrors_new;
+		conf->mirrors_new = NULL;
+	}
+	setup_geo(&conf->geo, mddev, geo_start);
+	smp_mb();
+	if (mddev->reshape_backwards) {
+		sector_t size = raid10_size(mddev, 0, 0);
+		if (size < mddev->array_sectors) {
+			spin_unlock_irq(&conf->device_lock);
+			printk(KERN_ERR "md/raid10:%s: array size must be reduce before number of disks\n",
+			       mdname(mddev));
+			return -EINVAL;
+		}
+		mddev->resync_max_sectors = size;
+		conf->reshape_progress = size;
+	} else
+		conf->reshape_progress = 0;
+	spin_unlock_irq(&conf->device_lock);
+
+	if (mddev->delta_disks > 0) {
+		rdev_for_each(rdev, mddev)
+			if (rdev->raid_disk < 0 &&
+			    !test_bit(Faulty, &rdev->flags)) {
+				if (raid10_add_disk(mddev, rdev) == 0) {
+					if (rdev->raid_disk >=
+					    conf->prev.raid_disks)
+						set_bit(In_sync, &rdev->flags);
+					else
+						rdev->recovery_offset = 0;
+
+					if (sysfs_link_rdev(mddev, rdev))
+						/* Failure here  is OK */;
+				}
+			} else if (rdev->raid_disk >= conf->prev.raid_disks
+				   && !test_bit(Faulty, &rdev->flags)) {
+				/* This is a spare that was manually added */
+				set_bit(In_sync, &rdev->flags);
+			}
+	}
+	/* When a reshape changes the number of devices,
+	 * ->degraded is measured against the larger of the
+	 * pre and  post numbers.
+	 */
+	spin_lock_irq(&conf->device_lock);
+	mddev->degraded = calc_degraded(conf);
+	spin_unlock_irq(&conf->device_lock);
+	mddev->raid_disks = conf->geo.raid_disks;
+	mddev->reshape_position = conf->reshape_progress;
+	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+
+	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+
+	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
+						"reshape");
+	if (!mddev->sync_thread) {
+		mddev->recovery = 0;
+		spin_lock_irq(&conf->device_lock);
+		conf->geo = conf->prev;
+		mddev->raid_disks = conf->geo.raid_disks;
+		rdev_for_each(rdev, mddev)
+			rdev->new_data_offset = rdev->data_offset;
+		smp_wmb();
+		conf->reshape_progress = MaxSector;
+		mddev->reshape_position = MaxSector;
+		spin_unlock_irq(&conf->device_lock);
+		return -EAGAIN;
+	}
+	conf->reshape_checkpoint = jiffies;
+	md_wakeup_thread(mddev->sync_thread);
+	md_new_event(mddev);
+	return 0;
+}
+
+/* Calculate the last device-address that could contain
+ * any block from the chunk that includes the array-address 's'
+ * and report the next address.
+ * i.e. the address returned will be chunk-aligned and after
+ * any data that is in the chunk containing 's'.
+ */
+static sector_t last_dev_address(sector_t s, struct geom *geo)
+{
+	s = (s | geo->chunk_mask) + 1;
+	s >>= geo->chunk_shift;
+	s *= geo->near_copies;
+	s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
+	s *= geo->far_copies;
+	s <<= geo->chunk_shift;
+	return s;
+}
+
+/* Calculate the first device-address that could contain
+ * any block from the chunk that includes the array-address 's'.
+ * This too will be the start of a chunk
+ */
+static sector_t first_dev_address(sector_t s, struct geom *geo)
+{
+	s >>= geo->chunk_shift;
+	s *= geo->near_copies;
+	sector_div(s, geo->raid_disks);
+	s *= geo->far_copies;
+	s <<= geo->chunk_shift;
+	return s;
+}
+
+static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
+				int *skipped)
+{
+	/* We simply copy at most one chunk (smallest of old and new)
+	 * at a time, possibly less if that exceeds RESYNC_PAGES,
+	 * or we hit a bad block or something.
+	 * This might mean we pause for normal IO in the middle of
+	 * a chunk, but that is not a problem was mddev->reshape_position
+	 * can record any location.
+	 *
+	 * If we will want to write to a location that isn't
+	 * yet recorded as 'safe' (i.e. in metadata on disk) then
+	 * we need to flush all reshape requests and update the metadata.
+	 *
+	 * When reshaping forwards (e.g. to more devices), we interpret
+	 * 'safe' as the earliest block which might not have been copied
+	 * down yet.  We divide this by previous stripe size and multiply
+	 * by previous stripe length to get lowest device offset that we
+	 * cannot write to yet.
+	 * We interpret 'sector_nr' as an address that we want to write to.
+	 * From this we use last_device_address() to find where we might
+	 * write to, and first_device_address on the  'safe' position.
+	 * If this 'next' write position is after the 'safe' position,
+	 * we must update the metadata to increase the 'safe' position.
+	 *
+	 * When reshaping backwards, we round in the opposite direction
+	 * and perform the reverse test:  next write position must not be
+	 * less than current safe position.
+	 *
+	 * In all this the minimum difference in data offsets
+	 * (conf->offset_diff - always positive) allows a bit of slack,
+	 * so next can be after 'safe', but not by more than offset_disk
+	 *
+	 * We need to prepare all the bios here before we start any IO
+	 * to ensure the size we choose is acceptable to all devices.
+	 * The means one for each copy for write-out and an extra one for
+	 * read-in.
+	 * We store the read-in bio in ->master_bio and the others in
+	 * ->devs[x].bio and ->devs[x].repl_bio.
+	 */
+	struct r10conf *conf = mddev->private;
+	struct r10bio *r10_bio;
+	sector_t next, safe, last;
+	int max_sectors;
+	int nr_sectors;
+	int s;
+	struct md_rdev *rdev;
+	int need_flush = 0;
+	struct bio *blist;
+	struct bio *bio, *read_bio;
+	int sectors_done = 0;
+
+	if (sector_nr == 0) {
+		/* If restarting in the middle, skip the initial sectors */
+		if (mddev->reshape_backwards &&
+		    conf->reshape_progress < raid10_size(mddev, 0, 0)) {
+			sector_nr = (raid10_size(mddev, 0, 0)
+				     - conf->reshape_progress);
+		} else if (!mddev->reshape_backwards &&
+			   conf->reshape_progress > 0)
+			sector_nr = conf->reshape_progress;
+		if (sector_nr) {
+			mddev->curr_resync_completed = sector_nr;
+			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+			*skipped = 1;
+			return sector_nr;
+		}
+	}
+
+	/* We don't use sector_nr to track where we are up to
+	 * as that doesn't work well for ->reshape_backwards.
+	 * So just use ->reshape_progress.
+	 */
+	if (mddev->reshape_backwards) {
+		/* 'next' is the earliest device address that we might
+		 * write to for this chunk in the new layout
+		 */
+		next = first_dev_address(conf->reshape_progress - 1,
+					 &conf->geo);
+
+		/* 'safe' is the last device address that we might read from
+		 * in the old layout after a restart
+		 */
+		safe = last_dev_address(conf->reshape_safe - 1,
+					&conf->prev);
+
+		if (next + conf->offset_diff < safe)
+			need_flush = 1;
+
+		last = conf->reshape_progress - 1;
+		sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
+					       & conf->prev.chunk_mask);
+		if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
+			sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
+	} else {
+		/* 'next' is after the last device address that we
+		 * might write to for this chunk in the new layout
+		 */
+		next = last_dev_address(conf->reshape_progress, &conf->geo);
+
+		/* 'safe' is the earliest device address that we might
+		 * read from in the old layout after a restart
+		 */
+		safe = first_dev_address(conf->reshape_safe, &conf->prev);
+
+		/* Need to update metadata if 'next' might be beyond 'safe'
+		 * as that would possibly corrupt data
+		 */
+		if (next > safe + conf->offset_diff)
+			need_flush = 1;
+
+		sector_nr = conf->reshape_progress;
+		last  = sector_nr | (conf->geo.chunk_mask
+				     & conf->prev.chunk_mask);
+
+		if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
+			last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
+	}
+
+	if (need_flush ||
+	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
+		/* Need to update reshape_position in metadata */
+		wait_barrier(conf);
+		mddev->reshape_position = conf->reshape_progress;
+		if (mddev->reshape_backwards)
+			mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
+				- conf->reshape_progress;
+		else
+			mddev->curr_resync_completed = conf->reshape_progress;
+		conf->reshape_checkpoint = jiffies;
+		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		md_wakeup_thread(mddev->thread);
+		wait_event(mddev->sb_wait, mddev->flags == 0 ||
+			   kthread_should_stop());
+		conf->reshape_safe = mddev->reshape_position;
+		allow_barrier(conf);
+	}
+
+read_more:
+	/* Now schedule reads for blocks from sector_nr to last */
+	r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+	raise_barrier(conf, sectors_done != 0);
+	atomic_set(&r10_bio->remaining, 0);
+	r10_bio->mddev = mddev;
+	r10_bio->sector = sector_nr;
+	set_bit(R10BIO_IsReshape, &r10_bio->state);
+	r10_bio->sectors = last - sector_nr + 1;
+	rdev = read_balance(conf, r10_bio, &max_sectors);
+	BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
+
+	if (!rdev) {
+		/* Cannot read from here, so need to record bad blocks
+		 * on all the target devices.
+		 */
+		// FIXME
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		return sectors_done;
+	}
+
+	read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
+
+	read_bio->bi_bdev = rdev->bdev;
+	read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
+			       + rdev->data_offset);
+	read_bio->bi_private = r10_bio;
+	read_bio->bi_end_io = end_sync_read;
+	read_bio->bi_rw = READ;
+	read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
+	read_bio->bi_flags |= 1 << BIO_UPTODATE;
+	read_bio->bi_vcnt = 0;
+	read_bio->bi_idx = 0;
+	read_bio->bi_size = 0;
+	r10_bio->master_bio = read_bio;
+	r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
+
+	/* Now find the locations in the new layout */
+	__raid10_find_phys(&conf->geo, r10_bio);
+
+	blist = read_bio;
+	read_bio->bi_next = NULL;
+
+	for (s = 0; s < conf->copies*2; s++) {
+		struct bio *b;
+		int d = r10_bio->devs[s/2].devnum;
+		struct md_rdev *rdev2;
+		if (s&1) {
+			rdev2 = conf->mirrors[d].replacement;
+			b = r10_bio->devs[s/2].repl_bio;
+		} else {
+			rdev2 = conf->mirrors[d].rdev;
+			b = r10_bio->devs[s/2].bio;
+		}
+		if (!rdev2 || test_bit(Faulty, &rdev2->flags))
+			continue;
+		b->bi_bdev = rdev2->bdev;
+		b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
+		b->bi_private = r10_bio;
+		b->bi_end_io = end_reshape_write;
+		b->bi_rw = WRITE;
+		b->bi_flags &= ~(BIO_POOL_MASK - 1);
+		b->bi_flags |= 1 << BIO_UPTODATE;
+		b->bi_next = blist;
+		b->bi_vcnt = 0;
+		b->bi_idx = 0;
+		b->bi_size = 0;
+		blist = b;
+	}
+
+	/* Now add as many pages as possible to all of these bios. */
+
+	nr_sectors = 0;
+	for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
+		struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
+		int len = (max_sectors - s) << 9;
+		if (len > PAGE_SIZE)
+			len = PAGE_SIZE;
+		for (bio = blist; bio ; bio = bio->bi_next) {
+			struct bio *bio2;
+			if (bio_add_page(bio, page, len, 0))
+				continue;
+
+			/* Didn't fit, must stop */
+			for (bio2 = blist;
+			     bio2 && bio2 != bio;
+			     bio2 = bio2->bi_next) {
+				/* Remove last page from this bio */
+				bio2->bi_vcnt--;
+				bio2->bi_size -= len;
+				bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
+			}
+			goto bio_full;
+		}
+		sector_nr += len >> 9;
+		nr_sectors += len >> 9;
+	}
+bio_full:
+	r10_bio->sectors = nr_sectors;
+
+	/* Now submit the read */
+	md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
+	atomic_inc(&r10_bio->remaining);
+	read_bio->bi_next = NULL;
+	generic_make_request(read_bio);
+	sector_nr += nr_sectors;
+	sectors_done += nr_sectors;
+	if (sector_nr <= last)
+		goto read_more;
+
+	/* Now that we have done the whole section we can
+	 * update reshape_progress
+	 */
+	if (mddev->reshape_backwards)
+		conf->reshape_progress -= sectors_done;
+	else
+		conf->reshape_progress += sectors_done;
+
+	return sectors_done;
+}
+
+static void end_reshape_request(struct r10bio *r10_bio);
+static int handle_reshape_read_error(struct mddev *mddev,
+				     struct r10bio *r10_bio);
+static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
+{
+	/* Reshape read completed.  Hopefully we have a block
+	 * to write out.
+	 * If we got a read error then we do sync 1-page reads from
+	 * elsewhere until we find the data - or give up.
+	 */
+	struct r10conf *conf = mddev->private;
+	int s;
+
+	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
+		if (handle_reshape_read_error(mddev, r10_bio) < 0) {
+			/* Reshape has been aborted */
+			md_done_sync(mddev, r10_bio->sectors, 0);
+			return;
+		}
+
+	/* We definitely have the data in the pages, schedule the
+	 * writes.
+	 */
+	atomic_set(&r10_bio->remaining, 1);
+	for (s = 0; s < conf->copies*2; s++) {
+		struct bio *b;
+		int d = r10_bio->devs[s/2].devnum;
+		struct md_rdev *rdev;
+		if (s&1) {
+			rdev = conf->mirrors[d].replacement;
+			b = r10_bio->devs[s/2].repl_bio;
+		} else {
+			rdev = conf->mirrors[d].rdev;
+			b = r10_bio->devs[s/2].bio;
+		}
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			continue;
+		atomic_inc(&rdev->nr_pending);
+		md_sync_acct(b->bi_bdev, r10_bio->sectors);
+		atomic_inc(&r10_bio->remaining);
+		b->bi_next = NULL;
+		generic_make_request(b);
+	}
+	end_reshape_request(r10_bio);
+}
+
+static void end_reshape(struct r10conf *conf)
+{
+	if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
+		return;
+
+	spin_lock_irq(&conf->device_lock);
+	conf->prev = conf->geo;
+	md_finish_reshape(conf->mddev);
+	smp_wmb();
+	conf->reshape_progress = MaxSector;
+	spin_unlock_irq(&conf->device_lock);
+
+	/* read-ahead size must cover two whole stripes, which is
+	 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
+	 */
+	if (conf->mddev->queue) {
+		int stripe = conf->geo.raid_disks *
+			((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
+		stripe /= conf->geo.near_copies;
+		if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
+			conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+	}
+	conf->fullsync = 0;
+}
+
+
+static int handle_reshape_read_error(struct mddev *mddev,
+				     struct r10bio *r10_bio)
+{
+	/* Use sync reads to get the blocks from somewhere else */
+	int sectors = r10_bio->sectors;
+	struct r10bio r10b;
+	struct r10conf *conf = mddev->private;
+	int slot = 0;
+	int idx = 0;
+	struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
+
+	r10b.sector = r10_bio->sector;
+	__raid10_find_phys(&conf->prev, &r10b);
+
+	while (sectors) {
+		int s = sectors;
+		int success = 0;
+		int first_slot = slot;
+
+		if (s > (PAGE_SIZE >> 9))
+			s = PAGE_SIZE >> 9;
+
+		while (!success) {
+			int d = r10b.devs[slot].devnum;
+			struct md_rdev *rdev = conf->mirrors[d].rdev;
+			sector_t addr;
+			if (rdev == NULL ||
+			    test_bit(Faulty, &rdev->flags) ||
+			    !test_bit(In_sync, &rdev->flags))
+				goto failed;
+
+			addr = r10b.devs[slot].addr + idx * PAGE_SIZE;
+			success = sync_page_io(rdev,
+					       addr,
+					       s << 9,
+					       bvec[idx].bv_page,
+					       READ, false);
+			if (success)
+				break;
+		failed:
+			slot++;
+			if (slot >= conf->copies)
+				slot = 0;
+			if (slot == first_slot)
+				break;
+		}
+		if (!success) {
+			/* couldn't read this block, must give up */
+			set_bit(MD_RECOVERY_INTR,
+				&mddev->recovery);
+			return -EIO;
+		}
+		sectors -= s;
+		idx++;
+	}
+	return 0;
+}
+
+static void end_reshape_write(struct bio *bio, int error)
+{
+	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct r10bio *r10_bio = bio->bi_private;
+	struct mddev *mddev = r10_bio->mddev;
+	struct r10conf *conf = mddev->private;
+	int d;
+	int slot;
+	int repl;
+	struct md_rdev *rdev = NULL;
+
+	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
+	if (repl)
+		rdev = conf->mirrors[d].replacement;
+	if (!rdev) {
+		smp_mb();
+		rdev = conf->mirrors[d].rdev;
+	}
+
+	if (!uptodate) {
+		/* FIXME should record badblock */
+		md_error(mddev, rdev);
+	}
+
+	rdev_dec_pending(rdev, mddev);
+	end_reshape_request(r10_bio);
+}
+
+static void end_reshape_request(struct r10bio *r10_bio)
+{
+	if (!atomic_dec_and_test(&r10_bio->remaining))
+		return;
+	md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
+	bio_put(r10_bio->master_bio);
+	put_buf(r10_bio);
+}
+
+static void raid10_finish_reshape(struct mddev *mddev)
+{
+	struct r10conf *conf = mddev->private;
+
+	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+		return;
+
+	if (mddev->delta_disks > 0) {
+		sector_t size = raid10_size(mddev, 0, 0);
+		md_set_array_sectors(mddev, size);
+		if (mddev->recovery_cp > mddev->resync_max_sectors) {
+			mddev->recovery_cp = mddev->resync_max_sectors;
+			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+		}
+		mddev->resync_max_sectors = size;
+		set_capacity(mddev->gendisk, mddev->array_sectors);
+		revalidate_disk(mddev->gendisk);
+	}
+	mddev->layout = mddev->new_layout;
+	mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
+	mddev->reshape_position = MaxSector;
+	mddev->delta_disks = 0;
+	mddev->reshape_backwards = 0;
+}
+
 static struct md_personality raid10_personality =
 {
 	.name		= "raid10",
@@ -3652,6 +4493,9 @@ static struct md_personality raid10_personality =
 	.size		= raid10_size,
 	.resize		= raid10_resize,
 	.takeover	= raid10_takeover,
+	.check_reshape	= raid10_check_reshape,
+	.start_reshape	= raid10_start_reshape,
+	.finish_reshape	= raid10_finish_reshape,
 };
 
 static int __init raid_init(void)
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 37509d7134a..135b1b0a155 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -14,6 +14,7 @@ struct mirror_info {
 struct r10conf {
 	struct mddev		*mddev;
 	struct mirror_info	*mirrors;
+	struct mirror_info	*mirrors_new, *mirrors_old;
 	spinlock_t		device_lock;
 
 	/* geometry */
@@ -42,6 +43,9 @@ struct r10conf {
 	sector_t		dev_sectors;  /* temp copy of
 					       * mddev->dev_sectors */
 	sector_t		reshape_progress;
+	sector_t		reshape_safe;
+	unsigned long		reshape_checkpoint;
+	sector_t		offset_diff;
 
 	struct list_head	retry_list;
 	/* queue pending writes and submit them on unplug */
@@ -138,6 +142,7 @@ enum r10bio_state {
 	R10BIO_Uptodate,
 	R10BIO_IsSync,
 	R10BIO_IsRecover,
+	R10BIO_IsReshape,
 	R10BIO_Degraded,
 /* Set ReadError on bios that experience a read error
  * so that raid10d knows what to do with them.

From 6a328475cccacb99a13b06978091da936dd51d89 Mon Sep 17 00:00:00 2001
From: Jim Kukunas <james.t.kukunas@linux.intel.com>
Date: Tue, 22 May 2012 13:54:03 +1000
Subject: [PATCH 12/56] crypto: wait for a full jiffy in do_xor_speed

In the existing do_xor_speed(), there is no guarantee that we actually
run do_2() for a full jiffy. We get the current jiffy, then run do_2()
until the next jiffy.

Instead, let's get the current jiffy, then wait until the next jiffy
to start our test.

Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 crypto/xor.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/crypto/xor.c b/crypto/xor.c
index b75182d8ab1..87884435bdd 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -63,7 +63,7 @@ static void
 do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
 {
 	int speed;
-	unsigned long now;
+	unsigned long now, j;
 	int i, count, max;
 
 	tmpl->next = template_list;
@@ -76,9 +76,11 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
 	 */
 	max = 0;
 	for (i = 0; i < 5; i++) {
-		now = jiffies;
+		j = jiffies;
 		count = 0;
-		while (jiffies == now) {
+		while ((now = jiffies) == j)
+			cpu_relax();
+		while (time_before(jiffies, now + 1)) {
 			mb(); /* prevent loop optimzation */
 			tmpl->do_2(BENCH_SIZE, b1, b2);
 			mb();

From 56a519913eeba2bdae4d7ee39e80fab442c3836c Mon Sep 17 00:00:00 2001
From: Jim Kukunas <james.t.kukunas@linux.intel.com>
Date: Tue, 22 May 2012 13:54:04 +1000
Subject: [PATCH 13/56] crypto: disable preemption while benchmarking RAID5 xor
 checksumming

With CONFIG_PREEMPT=y, we need to disable preemption while benchmarking
RAID5 xor checksumming to ensure we're actually measuring what we think
we're measuring.

Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 crypto/xor.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/crypto/xor.c b/crypto/xor.c
index 87884435bdd..84daa1111da 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -21,6 +21,7 @@
 #include <linux/gfp.h>
 #include <linux/raid/xor.h>
 #include <linux/jiffies.h>
+#include <linux/preempt.h>
 #include <asm/xor.h>
 
 /* The xor routines to use.  */
@@ -69,6 +70,8 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
 	tmpl->next = template_list;
 	template_list = tmpl;
 
+	preempt_disable();
+
 	/*
 	 * Count the number of XORs done during a whole jiffy, and use
 	 * this to calculate the speed of checksumming.  We use a 2-page
@@ -91,6 +94,8 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
 			max = count;
 	}
 
+	preempt_enable();
+
 	speed = max * (HZ * BENCH_SIZE / 1024);
 	tmpl->speed = speed;
 

From ea4d26ae24e58fbd2c61de9242adab053cb982d8 Mon Sep 17 00:00:00 2001
From: Jim Kukunas <james.t.kukunas@linux.intel.com>
Date: Tue, 22 May 2012 13:54:04 +1000
Subject: [PATCH 14/56] raid5: add AVX optimized RAID5 checksumming

Optimize RAID5 xor checksumming by taking advantage of
256-bit YMM registers introduced in AVX.

Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 arch/x86/Makefile              |   5 +-
 arch/x86/include/asm/xor_32.h  |   6 +-
 arch/x86/include/asm/xor_64.h  |   8 +-
 arch/x86/include/asm/xor_avx.h | 214 +++++++++++++++++++++++++++++++++
 4 files changed, 229 insertions(+), 4 deletions(-)
 create mode 100644 arch/x86/include/asm/xor_avx.h

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 41a7237606a..7a1cc9ee5c8 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -115,9 +115,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
 
 # does binutils support specific instructions?
 asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 133b40a0f49..454570891bd 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -861,6 +861,9 @@ static struct xor_block_template xor_block_pIII_sse = {
 	.do_5 = xor_sse_5,
 };
 
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 /* Also try the generic routines.  */
 #include <asm-generic/xor.h>
 
@@ -871,6 +874,7 @@ do {							\
 	xor_speed(&xor_block_8regs_p);			\
 	xor_speed(&xor_block_32regs);			\
 	xor_speed(&xor_block_32regs_p);			\
+	AVX_XOR_SPEED;					\
 	if (cpu_has_xmm)				\
 		xor_speed(&xor_block_pIII_sse);		\
 	if (cpu_has_mmx) {				\
@@ -883,6 +887,6 @@ do {							\
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched.  */
 #define XOR_SELECT_TEMPLATE(FASTEST)			\
-	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
 
 #endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1549b5e261f..b9b2323e90f 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -347,15 +347,21 @@ static struct xor_block_template xor_block_sse = {
 	.do_5 = xor_sse_5,
 };
 
+
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES			\
 do {						\
+	AVX_XOR_SPEED;				\
 	xor_speed(&xor_block_sse);		\
 } while (0)
 
 /* We force the use of the SSE xor block because it can write around L2.
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched.  */
-#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+	AVX_SELECT(&xor_block_sse)
 
 #endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
new file mode 100644
index 00000000000..2510d35f480
--- /dev/null
+++ b/arch/x86/include/asm/xor_avx.h
@@ -0,0 +1,214 @@
+#ifndef _ASM_X86_XOR_AVX_H
+#define _ASM_X86_XOR_AVX_H
+
+/*
+ * Optimized RAID-5 checksumming functions for AVX
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#ifdef CONFIG_AS_AVX
+
+#include <linux/compiler.h>
+#include <asm/i387.h>
+
+#define ALIGN32 __aligned(32)
+
+#define YMM_SAVED_REGS 4
+
+#define YMMS_SAVE \
+do { \
+	preempt_disable(); \
+	cr0 = read_cr0(); \
+	clts(); \
+	asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
+	asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
+	asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
+	asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
+} while (0);
+
+#define YMMS_RESTORE \
+do { \
+	asm volatile("sfence" : : : "memory"); \
+	asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
+	asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
+	asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
+	asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
+	write_cr0(cr0); \
+	preempt_enable(); \
+} while (0);
+
+#define BLOCK4(i) \
+		BLOCK(32 * i, 0) \
+		BLOCK(32 * (i + 1), 1) \
+		BLOCK(32 * (i + 2), 2) \
+		BLOCK(32 * (i + 3), 3)
+
+#define BLOCK16() \
+		BLOCK4(0) \
+		BLOCK4(4) \
+		BLOCK4(8) \
+		BLOCK4(12)
+
+static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm"  #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16();
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3, unsigned long *p4)
+{
+	unsigned long cr0, lines = bytes >> 9;
+	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+	YMMS_SAVE
+
+	while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p3[i / sizeof(*p3)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p2[i / sizeof(*p2)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p1[i / sizeof(*p1)])); \
+	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+		"m" (p0[i / sizeof(*p0)])); \
+	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+		"=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+		BLOCK16()
+
+		p0 = (unsigned long *)((uintptr_t)p0 + 512);
+		p1 = (unsigned long *)((uintptr_t)p1 + 512);
+		p2 = (unsigned long *)((uintptr_t)p2 + 512);
+		p3 = (unsigned long *)((uintptr_t)p3 + 512);
+		p4 = (unsigned long *)((uintptr_t)p4 + 512);
+	}
+
+	YMMS_RESTORE
+}
+
+static struct xor_block_template xor_block_avx = {
+	.name = "avx",
+	.do_2 = xor_avx_2,
+	.do_3 = xor_avx_3,
+	.do_4 = xor_avx_4,
+	.do_5 = xor_avx_5,
+};
+
+#define AVX_XOR_SPEED \
+do { \
+	if (cpu_has_avx) \
+		xor_speed(&xor_block_avx); \
+} while (0)
+
+#define AVX_SELECT(FASTEST) \
+	(cpu_has_avx ? &xor_block_avx : FASTEST)
+
+#else
+
+#define AVX_XOR_SPEED {}
+
+#define AVX_SELECT(FASTEST) (FASTEST)
+
+#endif
+#endif

From f674ef7b43881b2ac11f98d6ba2dc5d9dd0dd118 Mon Sep 17 00:00:00 2001
From: Jim Kukunas <james.t.kukunas@linux.intel.com>
Date: Tue, 22 May 2012 13:54:16 +1000
Subject: [PATCH 15/56] lib/raid6: fix test program build

<linux/module.h> drags in headers which are not visible to userspace,
thus breaking the build for the test program.

Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 lib/raid6/algos.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 8b02f60ffc8..f6a0f789916 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -17,11 +17,11 @@
  */
 
 #include <linux/raid/pq.h>
-#include <linux/module.h>
 #ifndef __KERNEL__
 #include <sys/mman.h>
 #include <stdio.h>
 #else
+#include <linux/module.h>
 #include <linux/gfp.h>
 #if !RAID6_USE_EMPTY_ZERO_PAGE
 /* In .bss so it's zeroed */

From 048a8b8c89dc427dd7a58527c8923224b1e66d83 Mon Sep 17 00:00:00 2001
From: Jim Kukunas <james.t.kukunas@linux.intel.com>
Date: Tue, 22 May 2012 13:54:18 +1000
Subject: [PATCH 16/56] lib/raid6: Add SSSE3 optimized recovery functions

Add SSSE3 optimized recovery functions, as well as a system
for selecting the most appropriate recovery functions to use.

Originally-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 include/linux/raid/pq.h |  18 ++-
 lib/raid6/Makefile      |   2 +-
 lib/raid6/algos.c       |  37 +++++
 lib/raid6/mktables.c    |  25 +++
 lib/raid6/recov.c       |  15 +-
 lib/raid6/recov_ssse3.c | 335 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 425 insertions(+), 7 deletions(-)
 create mode 100644 lib/raid6/recov_ssse3.c

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 53272e9860a..640c69ceec9 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -99,8 +99,20 @@ extern const struct raid6_calls raid6_altivec2;
 extern const struct raid6_calls raid6_altivec4;
 extern const struct raid6_calls raid6_altivec8;
 
+struct raid6_recov_calls {
+	void (*data2)(int, size_t, int, int, void **);
+	void (*datap)(int, size_t, int, void **);
+	int  (*valid)(void);
+	const char *name;
+	int priority;
+};
+
+extern const struct raid6_recov_calls raid6_recov_intx1;
+extern const struct raid6_recov_calls raid6_recov_ssse3;
+
 /* Algorithm list */
 extern const struct raid6_calls * const raid6_algos[];
+extern const struct raid6_recov_calls *const raid6_recov_algos[];
 int raid6_select_algo(void);
 
 /* Return values from chk_syndrome */
@@ -111,14 +123,16 @@ int raid6_select_algo(void);
 
 /* Galois field tables */
 extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256)));
+extern const u8 raid6_vgfmul[256][32] __attribute__((aligned(256)));
 extern const u8 raid6_gfexp[256]      __attribute__((aligned(256)));
 extern const u8 raid6_gfinv[256]      __attribute__((aligned(256)));
 extern const u8 raid6_gfexi[256]      __attribute__((aligned(256)));
 
 /* Recovery routines */
-void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
+extern void (*raid6_2data_recov)(int disks, size_t bytes, int faila, int failb,
 		       void **ptrs);
-void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs);
+extern void (*raid6_datap_recov)(int disks, size_t bytes, int faila,
+			void **ptrs);
 void raid6_dual_recov(int disks, size_t bytes, int faila, int failb,
 		      void **ptrs);
 
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 8a38102770f..de06dfe165b 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -1,6 +1,6 @@
 obj-$(CONFIG_RAID6_PQ)	+= raid6_pq.o
 
-raid6_pq-y	+= algos.o recov.o tables.o int1.o int2.o int4.o \
+raid6_pq-y	+= algos.o recov.o recov_ssse3.o tables.o int1.o int2.o int4.o \
 		   int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \
 		   altivec8.o mmx.o sse1.o sse2.o
 hostprogs-y	+= mktables
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index f6a0f789916..5a7f8022be1 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -64,6 +64,20 @@ const struct raid6_calls * const raid6_algos[] = {
 	NULL
 };
 
+void (*raid6_2data_recov)(int, size_t, int, int, void **);
+EXPORT_SYMBOL_GPL(raid6_2data_recov);
+
+void (*raid6_datap_recov)(int, size_t, int, void **);
+EXPORT_SYMBOL_GPL(raid6_datap_recov);
+
+const struct raid6_recov_calls *const raid6_recov_algos[] = {
+#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
+	&raid6_recov_ssse3,
+#endif
+	&raid6_recov_intx1,
+	NULL
+};
+
 #ifdef __KERNEL__
 #define RAID6_TIME_JIFFIES_LG2	4
 #else
@@ -72,6 +86,26 @@ const struct raid6_calls * const raid6_algos[] = {
 #define time_before(x, y) ((x) < (y))
 #endif
 
+static inline void raid6_choose_recov(void)
+{
+	const struct raid6_recov_calls *const *algo;
+	const struct raid6_recov_calls *best;
+
+	for (best = NULL, algo = raid6_recov_algos; *algo; algo++)
+		if (!best || (*algo)->priority > best->priority)
+			if (!(*algo)->valid || (*algo)->valid())
+				best = *algo;
+
+	if (best) {
+		raid6_2data_recov = best->data2;
+		raid6_datap_recov = best->datap;
+
+		printk("raid6: using %s recovery algorithm\n", best->name);
+	} else
+		printk("raid6: Yikes! No recovery algorithm found!\n");
+}
+
+
 /* Try to pick the best algorithm */
 /* This code uses the gfmul table as convenient data set to abuse */
 
@@ -141,6 +175,9 @@ int __init raid6_select_algo(void)
 
 	free_pages((unsigned long)syndromes, 1);
 
+	/* select raid recover functions */
+	raid6_choose_recov();
+
 	return best ? 0 : -EINVAL;
 }
 
diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c
index 8a3780902ce..39787db588b 100644
--- a/lib/raid6/mktables.c
+++ b/lib/raid6/mktables.c
@@ -81,6 +81,31 @@ int main(int argc, char *argv[])
 	printf("EXPORT_SYMBOL(raid6_gfmul);\n");
 	printf("#endif\n");
 
+	/* Compute vector multiplication table */
+	printf("\nconst u8  __attribute__((aligned(256)))\n"
+		"raid6_vgfmul[256][32] =\n"
+		"{\n");
+	for (i = 0; i < 256; i++) {
+		printf("\t{\n");
+		for (j = 0; j < 16; j += 8) {
+			printf("\t\t");
+			for (k = 0; k < 8; k++)
+				printf("0x%02x,%c", gfmul(i, j + k),
+				       (k == 7) ? '\n' : ' ');
+		}
+		for (j = 0; j < 16; j += 8) {
+			printf("\t\t");
+			for (k = 0; k < 8; k++)
+				printf("0x%02x,%c", gfmul(i, (j + k) << 4),
+				       (k == 7) ? '\n' : ' ');
+		}
+		printf("\t},\n");
+	}
+	printf("};\n");
+	printf("#ifdef __KERNEL__\n");
+	printf("EXPORT_SYMBOL(raid6_vgfmul);\n");
+	printf("#endif\n");
+
 	/* Compute power-of-2 table (exponent) */
 	v = 1;
 	printf("\nconst u8 __attribute__((aligned(256)))\n"
diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c
index fe275d7b6b3..1805a5cc5da 100644
--- a/lib/raid6/recov.c
+++ b/lib/raid6/recov.c
@@ -22,7 +22,7 @@
 #include <linux/raid/pq.h>
 
 /* Recover two failed data blocks. */
-void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
+void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, int failb,
 		       void **ptrs)
 {
 	u8 *p, *q, *dp, *dq;
@@ -64,10 +64,9 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
 		p++; q++;
 	}
 }
-EXPORT_SYMBOL_GPL(raid6_2data_recov);
 
 /* Recover failure of one data block plus the P block */
-void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
+void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, void **ptrs)
 {
 	u8 *p, *q, *dq;
 	const u8 *qmul;		/* Q multiplier table */
@@ -96,7 +95,15 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
 		q++; dq++;
 	}
 }
-EXPORT_SYMBOL_GPL(raid6_datap_recov);
+
+
+const struct raid6_recov_calls raid6_recov_intx1 = {
+	.data2 = raid6_2data_recov_intx1,
+	.datap = raid6_datap_recov_intx1,
+	.valid = NULL,
+	.name = "intx1",
+	.priority = 0,
+};
 
 #ifndef __KERNEL__
 /* Testing only */
diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c
new file mode 100644
index 00000000000..37ae6193055
--- /dev/null
+++ b/lib/raid6/recov_ssse3.c
@@ -0,0 +1,335 @@
+/*
+ * Copyright (C) 2012 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
+
+#include <linux/raid/pq.h>
+#include "x86.h"
+
+static int raid6_has_ssse3(void)
+{
+	return boot_cpu_has(X86_FEATURE_XMM) &&
+		boot_cpu_has(X86_FEATURE_XMM2) &&
+		boot_cpu_has(X86_FEATURE_SSSE3);
+}
+
+void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, int failb,
+		       void **ptrs)
+{
+	u8 *p, *q, *dp, *dq;
+	const u8 *pbmul;	/* P multiplier table for B data */
+	const u8 *qmul;		/* Q multiplier table (for both) */
+	static const u8 __aligned(16) x0f[16] = {
+		 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+		 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f};
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/* Compute syndrome with zero for the missing data pages
+	   Use the dead data pages as temporary storage for
+	   delta p and delta q */
+	dp = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-2] = dp;
+	dq = (u8 *)ptrs[failb];
+	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]   = dp;
+	ptrs[failb]   = dq;
+	ptrs[disks-2] = p;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+		raid6_gfexp[failb]]];
+
+	kernel_fpu_begin();
+
+	asm volatile("movdqa %0,%%xmm7" : : "m" (x0f[0]));
+
+#ifdef CONFIG_X86_64
+	asm volatile("movdqa %0,%%xmm6" : : "m" (qmul[0]));
+	asm volatile("movdqa %0,%%xmm14" : : "m" (pbmul[0]));
+	asm volatile("movdqa %0,%%xmm15" : : "m" (pbmul[16]));
+#endif
+
+	/* Now do it... */
+	while (bytes) {
+#ifdef CONFIG_X86_64
+		/* xmm6, xmm14, xmm15 */
+
+		asm volatile("movdqa %0,%%xmm1" : : "m" (q[0]));
+		asm volatile("movdqa %0,%%xmm9" : : "m" (q[16]));
+		asm volatile("movdqa %0,%%xmm0" : : "m" (p[0]));
+		asm volatile("movdqa %0,%%xmm8" : : "m" (p[16]));
+		asm volatile("pxor   %0,%%xmm1" : : "m" (dq[0]));
+		asm volatile("pxor   %0,%%xmm9" : : "m" (dq[16]));
+		asm volatile("pxor   %0,%%xmm0" : : "m" (dp[0]));
+		asm volatile("pxor   %0,%%xmm8" : : "m" (dp[16]));
+
+		/* xmm0/8 = px */
+
+		asm volatile("movdqa %xmm6,%xmm4");
+		asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16]));
+		asm volatile("movdqa %xmm6,%xmm12");
+		asm volatile("movdqa %xmm5,%xmm13");
+		asm volatile("movdqa %xmm1,%xmm3");
+		asm volatile("movdqa %xmm9,%xmm11");
+		asm volatile("movdqa %xmm0,%xmm2"); /* xmm2/10 = px */
+		asm volatile("movdqa %xmm8,%xmm10");
+		asm volatile("psraw  $4,%xmm1");
+		asm volatile("psraw  $4,%xmm9");
+		asm volatile("pand   %xmm7,%xmm3");
+		asm volatile("pand   %xmm7,%xmm11");
+		asm volatile("pand   %xmm7,%xmm1");
+		asm volatile("pand   %xmm7,%xmm9");
+		asm volatile("pshufb %xmm3,%xmm4");
+		asm volatile("pshufb %xmm11,%xmm12");
+		asm volatile("pshufb %xmm1,%xmm5");
+		asm volatile("pshufb %xmm9,%xmm13");
+		asm volatile("pxor   %xmm4,%xmm5");
+		asm volatile("pxor   %xmm12,%xmm13");
+
+		/* xmm5/13 = qx */
+
+		asm volatile("movdqa %xmm14,%xmm4");
+		asm volatile("movdqa %xmm15,%xmm1");
+		asm volatile("movdqa %xmm14,%xmm12");
+		asm volatile("movdqa %xmm15,%xmm9");
+		asm volatile("movdqa %xmm2,%xmm3");
+		asm volatile("movdqa %xmm10,%xmm11");
+		asm volatile("psraw  $4,%xmm2");
+		asm volatile("psraw  $4,%xmm10");
+		asm volatile("pand   %xmm7,%xmm3");
+		asm volatile("pand   %xmm7,%xmm11");
+		asm volatile("pand   %xmm7,%xmm2");
+		asm volatile("pand   %xmm7,%xmm10");
+		asm volatile("pshufb %xmm3,%xmm4");
+		asm volatile("pshufb %xmm11,%xmm12");
+		asm volatile("pshufb %xmm2,%xmm1");
+		asm volatile("pshufb %xmm10,%xmm9");
+		asm volatile("pxor   %xmm4,%xmm1");
+		asm volatile("pxor   %xmm12,%xmm9");
+
+		/* xmm1/9 = pbmul[px] */
+		asm volatile("pxor   %xmm5,%xmm1");
+		asm volatile("pxor   %xmm13,%xmm9");
+		/* xmm1/9 = db = DQ */
+		asm volatile("movdqa %%xmm1,%0" : "=m" (dq[0]));
+		asm volatile("movdqa %%xmm9,%0" : "=m" (dq[16]));
+
+		asm volatile("pxor   %xmm1,%xmm0");
+		asm volatile("pxor   %xmm9,%xmm8");
+		asm volatile("movdqa %%xmm0,%0" : "=m" (dp[0]));
+		asm volatile("movdqa %%xmm8,%0" : "=m" (dp[16]));
+
+		bytes -= 32;
+		p += 32;
+		q += 32;
+		dp += 32;
+		dq += 32;
+#else
+		asm volatile("movdqa %0,%%xmm1" : : "m" (*q));
+		asm volatile("movdqa %0,%%xmm0" : : "m" (*p));
+		asm volatile("pxor   %0,%%xmm1" : : "m" (*dq));
+		asm volatile("pxor   %0,%%xmm0" : : "m" (*dp));
+
+		/* 1 = dq ^ q
+		 * 0 = dp ^ p
+		 */
+		asm volatile("movdqa %0,%%xmm4" : : "m" (qmul[0]));
+		asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16]));
+
+		asm volatile("movdqa %xmm1,%xmm3");
+		asm volatile("psraw  $4,%xmm1");
+		asm volatile("pand   %xmm7,%xmm3");
+		asm volatile("pand   %xmm7,%xmm1");
+		asm volatile("pshufb %xmm3,%xmm4");
+		asm volatile("pshufb %xmm1,%xmm5");
+		asm volatile("pxor   %xmm4,%xmm5");
+
+		asm volatile("movdqa %xmm0,%xmm2"); /* xmm2 = px */
+
+		/* xmm5 = qx */
+
+		asm volatile("movdqa %0,%%xmm4" : : "m" (pbmul[0]));
+		asm volatile("movdqa %0,%%xmm1" : : "m" (pbmul[16]));
+		asm volatile("movdqa %xmm2,%xmm3");
+		asm volatile("psraw  $4,%xmm2");
+		asm volatile("pand   %xmm7,%xmm3");
+		asm volatile("pand   %xmm7,%xmm2");
+		asm volatile("pshufb %xmm3,%xmm4");
+		asm volatile("pshufb %xmm2,%xmm1");
+		asm volatile("pxor   %xmm4,%xmm1");
+
+		/* xmm1 = pbmul[px] */
+		asm volatile("pxor   %xmm5,%xmm1");
+		/* xmm1 = db = DQ */
+		asm volatile("movdqa %%xmm1,%0" : "=m" (*dq));
+
+		asm volatile("pxor   %xmm1,%xmm0");
+		asm volatile("movdqa %%xmm0,%0" : "=m" (*dp));
+
+		bytes -= 16;
+		p += 16;
+		q += 16;
+		dp += 16;
+		dq += 16;
+#endif
+	}
+
+	kernel_fpu_end();
+}
+
+
+void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, void **ptrs)
+{
+	u8 *p, *q, *dq;
+	const u8 *qmul;		/* Q multiplier table */
+	static const u8 __aligned(16) x0f[16] = {
+		 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+		 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f};
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/* Compute syndrome with zero for the missing data page
+	   Use the dead data page as temporary storage for delta q */
+	dq = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]   = dq;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+	kernel_fpu_begin();
+
+	asm volatile("movdqa %0, %%xmm7" : : "m" (x0f[0]));
+
+	while (bytes) {
+#ifdef CONFIG_X86_64
+		asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0]));
+		asm volatile("movdqa %0, %%xmm4" : : "m" (dq[16]));
+		asm volatile("pxor %0, %%xmm3" : : "m" (q[0]));
+		asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0]));
+
+		/* xmm3 = q[0] ^ dq[0] */
+
+		asm volatile("pxor %0, %%xmm4" : : "m" (q[16]));
+		asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16]));
+
+		/* xmm4 = q[16] ^ dq[16] */
+
+		asm volatile("movdqa %xmm3, %xmm6");
+		asm volatile("movdqa %xmm4, %xmm8");
+
+		/* xmm4 = xmm8 = q[16] ^ dq[16] */
+
+		asm volatile("psraw $4, %xmm3");
+		asm volatile("pand %xmm7, %xmm6");
+		asm volatile("pand %xmm7, %xmm3");
+		asm volatile("pshufb %xmm6, %xmm0");
+		asm volatile("pshufb %xmm3, %xmm1");
+		asm volatile("movdqa %0, %%xmm10" : : "m" (qmul[0]));
+		asm volatile("pxor %xmm0, %xmm1");
+		asm volatile("movdqa %0, %%xmm11" : : "m" (qmul[16]));
+
+		/* xmm1 = qmul[q[0] ^ dq[0]] */
+
+		asm volatile("psraw $4, %xmm4");
+		asm volatile("pand %xmm7, %xmm8");
+		asm volatile("pand %xmm7, %xmm4");
+		asm volatile("pshufb %xmm8, %xmm10");
+		asm volatile("pshufb %xmm4, %xmm11");
+		asm volatile("movdqa %0, %%xmm2" : : "m" (p[0]));
+		asm volatile("pxor %xmm10, %xmm11");
+		asm volatile("movdqa %0, %%xmm12" : : "m" (p[16]));
+
+		/* xmm11 = qmul[q[16] ^ dq[16]] */
+
+		asm volatile("pxor %xmm1, %xmm2");
+
+		/* xmm2 = p[0] ^ qmul[q[0] ^ dq[0]] */
+
+		asm volatile("pxor %xmm11, %xmm12");
+
+		/* xmm12 = p[16] ^ qmul[q[16] ^ dq[16]] */
+
+		asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0]));
+		asm volatile("movdqa %%xmm11, %0" : "=m" (dq[16]));
+
+		asm volatile("movdqa %%xmm2, %0" : "=m" (p[0]));
+		asm volatile("movdqa %%xmm12, %0" : "=m" (p[16]));
+
+		bytes -= 32;
+		p += 32;
+		q += 32;
+		dq += 32;
+
+#else
+		asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0]));
+		asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0]));
+		asm volatile("pxor %0, %%xmm3" : : "m" (q[0]));
+		asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16]));
+
+		/* xmm3 = *q ^ *dq */
+
+		asm volatile("movdqa %xmm3, %xmm6");
+		asm volatile("movdqa %0, %%xmm2" : : "m" (p[0]));
+		asm volatile("psraw $4, %xmm3");
+		asm volatile("pand %xmm7, %xmm6");
+		asm volatile("pand %xmm7, %xmm3");
+		asm volatile("pshufb %xmm6, %xmm0");
+		asm volatile("pshufb %xmm3, %xmm1");
+		asm volatile("pxor %xmm0, %xmm1");
+
+		/* xmm1 = qmul[*q ^ *dq */
+
+		asm volatile("pxor %xmm1, %xmm2");
+
+		/* xmm2 = *p ^ qmul[*q ^ *dq] */
+
+		asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0]));
+		asm volatile("movdqa %%xmm2, %0" : "=m" (p[0]));
+
+		bytes -= 16;
+		p += 16;
+		q += 16;
+		dq += 16;
+#endif
+	}
+
+	kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_ssse3 = {
+	.data2 = raid6_2data_recov_ssse3,
+	.datap = raid6_datap_recov_ssse3,
+	.valid = raid6_has_ssse3,
+#ifdef CONFIG_X86_64
+	.name = "ssse3x2",
+#else
+	.name = "ssse3x1",
+#endif
+	.priority = 1,
+};
+
+#endif

From 2dbf708448c836754d25fe6108c5bfe1f5697c95 Mon Sep 17 00:00:00 2001
From: Jim Kukunas <james.t.kukunas@linux.intel.com>
Date: Tue, 22 May 2012 13:54:23 +1000
Subject: [PATCH 17/56] lib/raid6: update test program for recovery functions

Test each combination of recovery and syndrome generation
functions.

Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 lib/raid6/test/Makefile |  2 +-
 lib/raid6/test/test.c   | 32 +++++++++++++++++++++-----------
 lib/raid6/x86.h         | 15 ++++++++++-----
 3 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index aa651697b6d..c76151d9476 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -23,7 +23,7 @@ RANLIB	 = ranlib
 all:	raid6.a raid6test
 
 raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \
-	 altivec1.o altivec2.o altivec4.o altivec8.o recov.o algos.o \
+	 altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o algos.o \
 	 tables.o
 	 rm -f $@
 	 $(AR) cq $@ $^
diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c
index 7a930318b17..5a485b7a7d3 100644
--- a/lib/raid6/test/test.c
+++ b/lib/raid6/test/test.c
@@ -90,25 +90,35 @@ static int test_disks(int i, int j)
 int main(int argc, char *argv[])
 {
 	const struct raid6_calls *const *algo;
+	const struct raid6_recov_calls *const *ra;
 	int i, j;
 	int err = 0;
 
 	makedata();
 
-	for (algo = raid6_algos; *algo; algo++) {
-		if (!(*algo)->valid || (*algo)->valid()) {
-			raid6_call = **algo;
+	for (ra = raid6_recov_algos; *ra; ra++) {
+		if ((*ra)->valid  && !(*ra)->valid())
+			continue;
+		raid6_2data_recov = (*ra)->data2;
+		raid6_datap_recov = (*ra)->datap;
 
-			/* Nuke syndromes */
-			memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE);
+		printf("using recovery %s\n", (*ra)->name);
 
-			/* Generate assumed good syndrome */
-			raid6_call.gen_syndrome(NDISKS, PAGE_SIZE,
-						(void **)&dataptrs);
+		for (algo = raid6_algos; *algo; algo++) {
+			if (!(*algo)->valid || (*algo)->valid()) {
+				raid6_call = **algo;
 
-			for (i = 0; i < NDISKS-1; i++)
-				for (j = i+1; j < NDISKS; j++)
-					err += test_disks(i, j);
+				/* Nuke syndromes */
+				memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE);
+
+				/* Generate assumed good syndrome */
+				raid6_call.gen_syndrome(NDISKS, PAGE_SIZE,
+							(void **)&dataptrs);
+
+				for (i = 0; i < NDISKS-1; i++)
+					for (j = i+1; j < NDISKS; j++)
+						err += test_disks(i, j);
+			}
 		}
 		printf("\n");
 	}
diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h
index cb2a8c91c88..d55d63232c5 100644
--- a/lib/raid6/x86.h
+++ b/lib/raid6/x86.h
@@ -35,24 +35,29 @@ static inline void kernel_fpu_end(void)
 {
 }
 
+#define __aligned(x) __attribute__((aligned(x)))
+
 #define X86_FEATURE_MMX		(0*32+23) /* Multimedia Extensions */
 #define X86_FEATURE_FXSR	(0*32+24) /* FXSAVE and FXRSTOR instructions
 					   * (fast save and restore) */
 #define X86_FEATURE_XMM		(0*32+25) /* Streaming SIMD Extensions */
 #define X86_FEATURE_XMM2	(0*32+26) /* Streaming SIMD Extensions-2 */
+#define X86_FEATURE_XMM3	(4*32+ 0) /* "pni" SSE-3 */
+#define X86_FEATURE_SSSE3	(4*32+ 9) /* Supplemental SSE-3 */
+#define X86_FEATURE_AVX	(4*32+28) /* Advanced Vector Extensions */
 #define X86_FEATURE_MMXEXT	(1*32+22) /* AMD MMX extensions */
 
 /* Should work well enough on modern CPUs for testing */
 static inline int boot_cpu_has(int flag)
 {
-	u32 eax = (flag >> 5) ? 0x80000001 : 1;
-	u32 edx;
+	u32 eax = (flag & 0x20) ? 0x80000001 : 1;
+	u32 ecx, edx;
 
 	asm volatile("cpuid"
-		     : "+a" (eax), "=d" (edx)
-		     : : "ecx", "ebx");
+		     : "+a" (eax), "=d" (edx), "=c" (ecx)
+		     : : "ebx");
 
-	return (edx >> (flag & 31)) & 1;
+	return ((flag & 0x80 ? ecx : edx) >> (flag & 31)) & 1;
 }
 
 #endif /* ndef __KERNEL__ */

From 96e67703e71f4b3cc32b747dbb6158ec74d01e19 Mon Sep 17 00:00:00 2001
From: Jim Kukunas <james.t.kukunas@linux.intel.com>
Date: Tue, 22 May 2012 13:54:24 +1000
Subject: [PATCH 18/56] lib/raid6: cleanup gen_syndrome function selection

Reorders functions in raid6_algos as well as the preference check
to reduce the number of functions tested on initialization.

Also, creates symmetry between choosing the gen_syndrome functions
and choosing the recovery functions.

Signed-off-by: Jim Kukunas <james.t.kukunas@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 lib/raid6/algos.c | 104 +++++++++++++++++++++++++---------------------
 1 file changed, 57 insertions(+), 47 deletions(-)

diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 5a7f8022be1..589f5f50ad2 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -34,10 +34,6 @@ struct raid6_calls raid6_call;
 EXPORT_SYMBOL_GPL(raid6_call);
 
 const struct raid6_calls * const raid6_algos[] = {
-	&raid6_intx1,
-	&raid6_intx2,
-	&raid6_intx4,
-	&raid6_intx8,
 #if defined(__ia64__)
 	&raid6_intx16,
 	&raid6_intx32,
@@ -61,6 +57,10 @@ const struct raid6_calls * const raid6_algos[] = {
 	&raid6_altivec4,
 	&raid6_altivec8,
 #endif
+	&raid6_intx1,
+	&raid6_intx2,
+	&raid6_intx4,
+	&raid6_intx8,
 	NULL
 };
 
@@ -86,7 +86,7 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = {
 #define time_before(x, y) ((x) < (y))
 #endif
 
-static inline void raid6_choose_recov(void)
+static inline const struct raid6_recov_calls *raid6_choose_recov(void)
 {
 	const struct raid6_recov_calls *const *algo;
 	const struct raid6_recov_calls *best;
@@ -103,62 +103,38 @@ static inline void raid6_choose_recov(void)
 		printk("raid6: using %s recovery algorithm\n", best->name);
 	} else
 		printk("raid6: Yikes! No recovery algorithm found!\n");
+
+	return best;
 }
 
-
-/* Try to pick the best algorithm */
-/* This code uses the gfmul table as convenient data set to abuse */
-
-int __init raid6_select_algo(void)
+static inline const struct raid6_calls *raid6_choose_gen(
+	void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks)
 {
-	const struct raid6_calls * const * algo;
-	const struct raid6_calls * best;
-	char *syndromes;
-	void *dptrs[(65536/PAGE_SIZE)+2];
-	int i, disks;
-	unsigned long perf, bestperf;
-	int bestprefer;
-	unsigned long j0, j1;
+	unsigned long perf, bestperf, j0, j1;
+	const struct raid6_calls *const *algo;
+	const struct raid6_calls *best;
 
-	disks = (65536/PAGE_SIZE)+2;
-	for ( i = 0 ; i < disks-2 ; i++ ) {
-		dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i;
-	}
+	for (bestperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) {
+		if (!best || (*algo)->prefer >= best->prefer) {
+			if ((*algo)->valid && !(*algo)->valid())
+				continue;
 
-	/* Normal code - use a 2-page allocation to avoid D$ conflict */
-	syndromes = (void *) __get_free_pages(GFP_KERNEL, 1);
-
-	if ( !syndromes ) {
-		printk("raid6: Yikes!  No memory available.\n");
-		return -ENOMEM;
-	}
-
-	dptrs[disks-2] = syndromes;
-	dptrs[disks-1] = syndromes + PAGE_SIZE;
-
-	bestperf = 0;  bestprefer = 0;  best = NULL;
-
-	for ( algo = raid6_algos ; *algo ; algo++ ) {
-		if ( !(*algo)->valid || (*algo)->valid() ) {
 			perf = 0;
 
 			preempt_disable();
 			j0 = jiffies;
-			while ( (j1 = jiffies) == j0 )
+			while ((j1 = jiffies) == j0)
 				cpu_relax();
 			while (time_before(jiffies,
 					    j1 + (1<<RAID6_TIME_JIFFIES_LG2))) {
-				(*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs);
+				(*algo)->gen_syndrome(disks, PAGE_SIZE, *dptrs);
 				perf++;
 			}
 			preempt_enable();
 
-			if ( (*algo)->prefer > bestprefer ||
-			     ((*algo)->prefer == bestprefer &&
-			      perf > bestperf) ) {
-				best = *algo;
-				bestprefer = best->prefer;
+			if (perf > bestperf) {
 				bestperf = perf;
+				best = *algo;
 			}
 			printk("raid6: %-8s %5ld MB/s\n", (*algo)->name,
 			       (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
@@ -173,12 +149,46 @@ int __init raid6_select_algo(void)
 	} else
 		printk("raid6: Yikes!  No algorithm found!\n");
 
-	free_pages((unsigned long)syndromes, 1);
+	return best;
+}
+
+
+/* Try to pick the best algorithm */
+/* This code uses the gfmul table as convenient data set to abuse */
+
+int __init raid6_select_algo(void)
+{
+	const int disks = (65536/PAGE_SIZE)+2;
+
+	const struct raid6_calls *gen_best;
+	const struct raid6_recov_calls *rec_best;
+	char *syndromes;
+	void *dptrs[(65536/PAGE_SIZE)+2];
+	int i;
+
+	for (i = 0; i < disks-2; i++)
+		dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i;
+
+	/* Normal code - use a 2-page allocation to avoid D$ conflict */
+	syndromes = (void *) __get_free_pages(GFP_KERNEL, 1);
+
+	if (!syndromes) {
+		printk("raid6: Yikes!  No memory available.\n");
+		return -ENOMEM;
+	}
+
+	dptrs[disks-2] = syndromes;
+	dptrs[disks-1] = syndromes + PAGE_SIZE;
+
+	/* select raid gen_syndrome function */
+	gen_best = raid6_choose_gen(&dptrs, disks);
 
 	/* select raid recover functions */
-	raid6_choose_recov();
+	rec_best = raid6_choose_recov();
 
-	return best ? 0 : -EINVAL;
+	free_pages((unsigned long)syndromes, 1);
+
+	return gen_best && rec_best ? 0 : -EINVAL;
 }
 
 static void raid6_exit(void)

From 545c87957f4d53867b62921625f36df8c4b1bc08 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:54:30 +1000
Subject: [PATCH 19/56] md: dm-raid should call helper function to clear rdev.

dm-raid currently open-codes the freeing of some members of
and rdev.  It is more maintainable to have it call common code
from md.c which does this for all call-sites.

So remove free_disk_sb to md_rdev_clear, export it, and use it in
dm-raid.c

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/dm-raid.c | 5 +----
 drivers/md/md.c      | 8 ++++----
 drivers/md/md.h      | 1 +
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 68965e66324..73a068da10d 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -155,10 +155,7 @@ static void context_free(struct raid_set *rs)
 	for (i = 0; i < rs->md.raid_disks; i++) {
 		if (rs->dev[i].meta_dev)
 			dm_put_device(rs->ti, rs->dev[i].meta_dev);
-		if (rs->dev[i].rdev.sb_page)
-			put_page(rs->dev[i].rdev.sb_page);
-		rs->dev[i].rdev.sb_page = NULL;
-		rs->dev[i].rdev.sb_loaded = 0;
+		md_rdev_clear(&rs->dev[i].rdev);
 		if (rs->dev[i].data_dev)
 			dm_put_device(rs->ti, rs->dev[i].data_dev);
 	}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8fe1abf1b89..d557e557ff8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -803,7 +803,7 @@ static int alloc_disk_sb(struct md_rdev * rdev)
 	return 0;
 }
 
-static void free_disk_sb(struct md_rdev * rdev)
+void md_rdev_clear(struct md_rdev *rdev)
 {
 	if (rdev->sb_page) {
 		put_page(rdev->sb_page);
@@ -817,7 +817,7 @@ static void free_disk_sb(struct md_rdev * rdev)
 		rdev->bb_page = NULL;
 	}
 }
-
+EXPORT_SYMBOL_GPL(md_rdev_clear);
 
 static void super_written(struct bio *bio, int error)
 {
@@ -2244,7 +2244,7 @@ static void export_rdev(struct md_rdev * rdev)
 		bdevname(rdev->bdev,b));
 	if (rdev->mddev)
 		MD_BUG();
-	free_disk_sb(rdev);
+	md_rdev_clear(rdev);
 #ifndef MODULE
 	if (test_bit(AutoDetected, &rdev->flags))
 		md_autodetect_dev(rdev->bdev->bd_dev);
@@ -3324,7 +3324,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
 abort_free:
 	if (rdev->bdev)
 		unlock_rdev(rdev);
-	free_disk_sb(rdev);
+	md_rdev_clear(rdev);
 	kfree(rdev->badblocks.page);
 	kfree(rdev);
 	return ERR_PTR(err);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 98913e8dac1..360937389e6 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -619,6 +619,7 @@ extern int md_run(struct mddev *mddev);
 extern void md_stop(struct mddev *mddev);
 extern void md_stop_writes(struct mddev *mddev);
 extern int md_rdev_init(struct md_rdev *rdev);
+extern void md_rdev_clear(struct md_rdev *rdev);
 
 extern void mddev_suspend(struct mddev *mddev);
 extern void mddev_resume(struct mddev *mddev);

From 4fa2f327681808f653711e14203a42cf4644bda0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:01 +1000
Subject: [PATCH 20/56] md: move freeing of badblocks.page into md_rdev_clear

This ensures that it is always freed - there were case where
we failed to free the page.

Reported-by: majianpeng <majianpeng@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d557e557ff8..ac99616f48d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -816,6 +816,8 @@ void md_rdev_clear(struct md_rdev *rdev)
 		put_page(rdev->bb_page);
 		rdev->bb_page = NULL;
 	}
+	kfree(rdev->badblocks.page);
+	rdev->badblocks.page = NULL;
 }
 EXPORT_SYMBOL_GPL(md_rdev_clear);
 
@@ -2191,9 +2193,7 @@ static void unbind_rdev_from_array(struct md_rdev * rdev)
 	sysfs_remove_link(&rdev->kobj, "block");
 	sysfs_put(rdev->sysfs_state);
 	rdev->sysfs_state = NULL;
-	kfree(rdev->badblocks.page);
 	rdev->badblocks.count = 0;
-	rdev->badblocks.page = NULL;
 	/* We need to delay this, otherwise we can deadlock when
 	 * writing to 'remove' to "dev/state".  We also need
 	 * to delay it due to rcu usage.
@@ -3325,7 +3325,6 @@ abort_free:
 	if (rdev->bdev)
 		unlock_rdev(rdev);
 	md_rdev_clear(rdev);
-	kfree(rdev->badblocks.page);
 	kfree(rdev);
 	return ERR_PTR(err);
 }

From da8840a747c0dbf49506ec906757a6b87b9741e9 Mon Sep 17 00:00:00 2001
From: majianpeng <majianpeng@gmail.com>
Date: Tue, 22 May 2012 13:55:03 +1000
Subject: [PATCH 21/56] md/raid1: allow fix_read_error to read from recovering
 device.

When attempting to fix a read error, it is acceptable to read from a
device that is recovering, provided the recovery has got past the
place we are reading from.  This makes the test for "can we read from
here" the same as the test in read_balance.

Signed-off-by: majianpeng <majianpeng@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 71a7dc038a8..22cfc6660b1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1859,7 +1859,9 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
 
 			rdev = conf->mirrors[d].rdev;
 			if (rdev &&
-			    test_bit(In_sync, &rdev->flags) &&
+			    (test_bit(In_sync, &rdev->flags) ||
+			     (!test_bit(Faulty, &rdev->flags) &&
+			      rdev->recovery_offset >= sect + s)) &&
 			    is_badblock(rdev, sect, s,
 					&first_bad, &bad_sectors) == 0 &&
 			    sync_page_io(rdev, sect, s<<9,

From 5fdd2cf8265c6de0f190dea80cc4c50da8f31293 Mon Sep 17 00:00:00 2001
From: majianpeng <majianpeng@gmail.com>
Date: Tue, 22 May 2012 13:55:03 +1000
Subject: [PATCH 22/56] md/raid10: Fix memleak in r10buf_pool_alloc

If the allocation of rep1_bio fails, we currently don't free the 'bio'
of the same dev.

Reported by kmemleak.

Signed-off-by: majianpeng <majianpeng@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ec271ae4318..fb9062b5022 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -173,10 +173,11 @@ out_free_pages:
 	while (j--)
 		for (i = 0; i < RESYNC_PAGES ; i++)
 			safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
-	j = -1;
+	j = 0;
 out_free_bio:
-	while (++j < nalloc) {
-		bio_put(r10_bio->devs[j].bio);
+	for ( ; j < nalloc; j++) {
+		if (r10_bio->devs[j].bio)
+			bio_put(r10_bio->devs[j].bio);
 		if (r10_bio->devs[j].repl_bio)
 			bio_put(r10_bio->devs[j].repl_bio);
 	}

From cceeca43b5ad96766098144a3fd757e03de9f6f8 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@kernel.org>
Date: Tue, 22 May 2012 13:55:04 +1000
Subject: [PATCH 23/56] raid5: remove unused variables

The two variables are useless.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0172bdd37b4..962c8f40908 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3985,12 +3985,10 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 	plugged = mddev_check_plugged(mddev);
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 		DEFINE_WAIT(w);
-		int disks, data_disks;
 		int previous;
 
 	retry:
 		previous = 0;
-		disks = conf->raid_disks;
 		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 		if (unlikely(conf->reshape_progress != MaxSector)) {
 			/* spinlock is needed as reshape_progress may be
@@ -4005,7 +4003,6 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 			if (mddev->reshape_backwards
 			    ? logical_sector < conf->reshape_progress
 			    : logical_sector >= conf->reshape_progress) {
-				disks = conf->previous_raid_disks;
 				previous = 1;
 			} else {
 				if (mddev->reshape_backwards
@@ -4018,7 +4015,6 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 			}
 			spin_unlock_irq(&conf->device_lock);
 		}
-		data_disks = disks - conf->max_degraded;
 
 		new_sector = raid5_compute_sector(conf, logical_sector,
 						  previous,

From bc0934f0477d0a2350a478004799d9c064923b7b Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@kernel.org>
Date: Tue, 22 May 2012 13:55:05 +1000
Subject: [PATCH 24/56] raid5: support sync request

REQ_SYNC is ignored in current raid5 code. Block layer does use it to do
policy,
for example ioscheduler. This patch adds it.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 12 ++++++++++--
 drivers/md/raid5.h |  1 +
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 962c8f40908..7bfd59b313d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -539,6 +539,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			replace_only = 1;
 		} else
 			continue;
+		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
+			rw |= REQ_SYNC;
 
 		bi = &sh->dev[i].req;
 		rbi = &sh->dev[i].rreq; /* For writing to replacement */
@@ -1145,6 +1147,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 				dev->sector + STRIPE_SECTORS) {
 				if (wbi->bi_rw & REQ_FUA)
 					set_bit(R5_WantFUA, &dev->flags);
+				if (wbi->bi_rw & REQ_SYNC)
+					set_bit(R5_SyncIO, &dev->flags);
 				tx = async_copy_data(1, wbi, dev->page,
 					dev->sector, tx);
 				wbi = r5_next_bio(wbi, dev->sector);
@@ -1162,13 +1166,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
 	int pd_idx = sh->pd_idx;
 	int qd_idx = sh->qd_idx;
 	int i;
-	bool fua = false;
+	bool fua = false, sync = false;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
-	for (i = disks; i--; )
+	for (i = disks; i--; ) {
 		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
+		sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
+	}
 
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
@@ -1177,6 +1183,8 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
 			set_bit(R5_UPTODATE, &dev->flags);
 			if (fua)
 				set_bit(R5_WantFUA, &dev->flags);
+			if (sync)
+				set_bit(R5_SyncIO, &dev->flags);
 		}
 	}
 
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index c6bdfa01d98..2164021f3b5 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -285,6 +285,7 @@ enum r5dev_flags {
 			 */
 	R5_Wantdrain,	/* dev->towrite needs to be drained */
 	R5_WantFUA,	/* Write should be FUA */
+	R5_SyncIO,	/* The IO is sync */
 	R5_WriteError,	/* got a write error - need to record it */
 	R5_MadeGood,	/* A bad block has been fixed by writing to it */
 	R5_ReadRepl,	/* Will/did read from replacement rather than orig */

From bf07bb7d5be813630d3530be274b3324f85e310c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:06 +1000
Subject: [PATCH 25/56] md/bitmap: disentangle two different 'pending' flags.

There are two different 'pending' concepts in the handling of the
write intent bitmap.

Firstly, a 'page' from the bitmap (which container PAGE_SIZE*8 bits)
may have changes (bits cleared) that should be written in due course.
There is no hurry for these and the page will transition from
PENDING to NEEDWRITE and will then be written, though if it ever
becomes DIRTY it will be written much sooner and PENDING will be
cleared.

Secondly, a page of counters - which contains PAGE_SIZE/2 counters, one
for each bit, can usefully have a 'pending' flag which indicates if
any of the counters are low (2 or 1) and ready to be processed by
bitmap_daemon_work().  If this flag is clear we can skip the whole
page.

These two concepts are currently combined in the bitmap-file flag.
This causes a tighter connection between the counters and the bitmap
file than I would like - as I want to add some flexibility to the
bitmap file.

So introduce a new flag with the page-of-counters, and rewrite
bitmap_daemon_work() so that it handles the two different 'pending'
concepts separately.

This also allows us to clear BITMAP_PAGE_PENDING when we write out
a dirty page, which may occasionally reduce the number of times we
write a page.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 219 +++++++++++++++++++++++---------------------
 drivers/md/bitmap.h |   7 +-
 2 files changed, 121 insertions(+), 105 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 17e2b472e16..c7784a98567 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -883,6 +883,8 @@ void bitmap_unplug(struct bitmap *bitmap)
 		need_write = test_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
 		clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
 		clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
+		if (dirty || need_write)
+			clear_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
 		if (dirty)
 			wait = 1;
 		spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -1086,6 +1088,17 @@ static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
 	bitmap->bp[page].count += inc;
 	bitmap_checkfree(bitmap, page);
 }
+
+static void bitmap_set_pending(struct bitmap *bitmap, sector_t offset)
+{
+	sector_t chunk = offset >> bitmap->chunkshift;
+	unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
+	struct bitmap_page *bp = &bitmap->bp[page];
+
+	if (!bp->pending)
+		bp->pending = 1;
+}
+
 static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
 					    sector_t offset, sector_t *blocks,
 					    int create);
@@ -1099,8 +1112,8 @@ void bitmap_daemon_work(struct mddev *mddev)
 {
 	struct bitmap *bitmap;
 	unsigned long j;
+	unsigned long nextpage;
 	unsigned long flags;
-	struct page *page = NULL, *lastpage = NULL;
 	sector_t blocks;
 	void *paddr;
 
@@ -1124,115 +1137,121 @@ void bitmap_daemon_work(struct mddev *mddev)
 	}
 	bitmap->allclean = 1;
 
+	/* Any file-page which is PENDING now needs to be written.
+	 * So set NEEDWRITE now, then after we make any last-minute changes
+	 * we will write it.
+	 */
 	spin_lock_irqsave(&bitmap->lock, flags);
+	if (!bitmap->filemap)
+		/* error or shutdown */
+		goto out;
+
+	for (j = 0; j < bitmap->file_pages; j++)
+		if (test_page_attr(bitmap, bitmap->filemap[j],
+				   BITMAP_PAGE_PENDING)) {
+			set_page_attr(bitmap, bitmap->filemap[j],
+				      BITMAP_PAGE_NEEDWRITE);
+			clear_page_attr(bitmap, bitmap->filemap[j],
+					BITMAP_PAGE_PENDING);
+		}
+
+	if (bitmap->need_sync &&
+	    mddev->bitmap_info.external == 0) {
+		/* Arrange for superblock update as well as
+		 * other changes */
+		bitmap_super_t *sb;
+		bitmap->need_sync = 0;
+		sb = kmap_atomic(bitmap->sb_page);
+		sb->events_cleared =
+			cpu_to_le64(bitmap->events_cleared);
+		kunmap_atomic(sb);
+		set_page_attr(bitmap, bitmap->sb_page, BITMAP_PAGE_NEEDWRITE);
+	}
+	/* Now look at the bitmap counters and if any are '2' or '1',
+	 * decrement and handle accordingly.
+	 */
+	nextpage = 0;
 	for (j = 0; j < bitmap->chunks; j++) {
 		bitmap_counter_t *bmc;
-		if (!bitmap->filemap)
-			/* error or shutdown */
-			break;
 
-		page = filemap_get_page(bitmap, j);
-
-		if (page != lastpage) {
-			/* skip this page unless it's marked as needing cleaning */
-			if (!test_page_attr(bitmap, page, BITMAP_PAGE_PENDING)) {
-				int need_write = test_page_attr(bitmap, page,
-								BITMAP_PAGE_NEEDWRITE);
-				if (need_write)
-					clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
-
-				spin_unlock_irqrestore(&bitmap->lock, flags);
-				if (need_write)
-					write_page(bitmap, page, 0);
-				spin_lock_irqsave(&bitmap->lock, flags);
-				j |= (PAGE_BITS - 1);
+		if (j == nextpage) {
+			nextpage += PAGE_COUNTER_RATIO;
+			if (!bitmap->bp[j >> PAGE_COUNTER_SHIFT].pending) {
+				j |= PAGE_COUNTER_MASK;
 				continue;
 			}
-
-			/* grab the new page, sync and release the old */
-			if (lastpage != NULL) {
-				if (test_page_attr(bitmap, lastpage,
-						   BITMAP_PAGE_NEEDWRITE)) {
-					clear_page_attr(bitmap, lastpage,
-							BITMAP_PAGE_NEEDWRITE);
-					spin_unlock_irqrestore(&bitmap->lock, flags);
-					write_page(bitmap, lastpage, 0);
-				} else {
-					set_page_attr(bitmap, lastpage,
-						      BITMAP_PAGE_NEEDWRITE);
-					bitmap->allclean = 0;
-					spin_unlock_irqrestore(&bitmap->lock, flags);
-				}
-			} else
-				spin_unlock_irqrestore(&bitmap->lock, flags);
-			lastpage = page;
-
-			/* We are possibly going to clear some bits, so make
-			 * sure that events_cleared is up-to-date.
-			 */
-			if (bitmap->need_sync &&
-			    mddev->bitmap_info.external == 0) {
-				bitmap_super_t *sb;
-				bitmap->need_sync = 0;
-				sb = kmap_atomic(bitmap->sb_page);
-				sb->events_cleared =
-					cpu_to_le64(bitmap->events_cleared);
-				kunmap_atomic(sb);
-				write_page(bitmap, bitmap->sb_page, 1);
-			}
-			spin_lock_irqsave(&bitmap->lock, flags);
-			if (!bitmap->need_sync)
-				clear_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
-			else
-				bitmap->allclean = 0;
+			bitmap->bp[j >> PAGE_COUNTER_SHIFT].pending = 0;
 		}
 		bmc = bitmap_get_counter(bitmap,
 					 (sector_t)j << bitmap->chunkshift,
 					 &blocks, 0);
-		if (!bmc)
-			j |= PAGE_COUNTER_MASK;
-		else if (*bmc) {
-			if (*bmc == 1 && !bitmap->need_sync) {
-				/* we can clear the bit */
-				*bmc = 0;
-				bitmap_count_page(bitmap,
-						  (sector_t)j << bitmap->chunkshift,
-						  -1);
 
-				/* clear the bit */
-				paddr = kmap_atomic(page);
-				if (bitmap->flags & BITMAP_HOSTENDIAN)
-					clear_bit(file_page_offset(bitmap, j),
-						  paddr);
-				else
-					__clear_bit_le(
-						file_page_offset(bitmap,
-								 j),
-						paddr);
-				kunmap_atomic(paddr);
-			} else if (*bmc <= 2) {
-				*bmc = 1; /* maybe clear the bit next time */
-				set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
+		if (!bmc) {
+			j |= PAGE_COUNTER_MASK;
+			continue;
+		}
+		if (*bmc == 1 && !bitmap->need_sync) {
+			/* We can clear the bit */
+			struct page *page;
+			*bmc = 0;
+			bitmap_count_page(
+				bitmap,
+				(sector_t)j << bitmap->chunkshift,
+				-1);
+
+			page = filemap_get_page(bitmap, j);
+			paddr = kmap_atomic(page);
+			if (bitmap->flags & BITMAP_HOSTENDIAN)
+				clear_bit(file_page_offset(bitmap, j),
+					  paddr);
+			else
+				__clear_bit_le(file_page_offset(bitmap, j),
+					       paddr);
+			kunmap_atomic(paddr);
+			if (!test_page_attr(bitmap, page,
+					    BITMAP_PAGE_NEEDWRITE)) {
+				set_page_attr(bitmap, page,
+					      BITMAP_PAGE_PENDING);
 				bitmap->allclean = 0;
 			}
-		}
-	}
-	spin_unlock_irqrestore(&bitmap->lock, flags);
-
-	/* now sync the final page */
-	if (lastpage != NULL) {
-		spin_lock_irqsave(&bitmap->lock, flags);
-		if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
-			clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
-			spin_unlock_irqrestore(&bitmap->lock, flags);
-			write_page(bitmap, lastpage, 0);
-		} else {
-			set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
+		} else if (*bmc && *bmc <= 2) {
+			*bmc = 1;
+			bitmap_set_pending(
+				bitmap,
+				(sector_t)j << bitmap->chunkshift);
 			bitmap->allclean = 0;
-			spin_unlock_irqrestore(&bitmap->lock, flags);
 		}
 	}
 
+	/* Now start writeout on any page in NEEDWRITE that isn't DIRTY.
+	 * DIRTY pages need to be written by bitmap_unplug so it can wait
+	 * for them.
+	 * If we find any DIRTY page we stop there and let bitmap_unplug
+	 * handle all the rest.  This is important in the case where
+	 * the first blocking holds the superblock and it has been updated.
+	 * We mustn't write any other blocks before the superblock.
+	 */
+	for (j = 0; j < bitmap->file_pages; j++) {
+		struct page *page = bitmap->filemap[j];
+
+		if (test_page_attr(bitmap, page,
+				   BITMAP_PAGE_DIRTY))
+			/* bitmap_unplug will handle the rest */
+			break;
+		if (test_page_attr(bitmap, page,
+				   BITMAP_PAGE_NEEDWRITE)) {
+			clear_page_attr(bitmap, page,
+					BITMAP_PAGE_NEEDWRITE);
+			spin_unlock_irqrestore(&bitmap->lock, flags);
+			write_page(bitmap, page, 0);
+			spin_lock_irqsave(&bitmap->lock, flags);
+			if (!bitmap->filemap)
+				break;
+		}
+	}
+out:
+	spin_unlock_irqrestore(&bitmap->lock, flags);
+
  done:
 	if (bitmap->allclean == 0)
 		mddev->thread->timeout =
@@ -1386,11 +1405,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 
 		(*bmc)--;
 		if (*bmc <= 2) {
-			set_page_attr(bitmap,
-				      filemap_get_page(
-					      bitmap,
-					      offset >> bitmap->chunkshift),
-				      BITMAP_PAGE_PENDING);
+			bitmap_set_pending(bitmap, offset);
 			bitmap->allclean = 0;
 		}
 		spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -1476,9 +1491,7 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i
 			*bmc |= NEEDED_MASK;
 		else {
 			if (*bmc <= 2) {
-				set_page_attr(bitmap,
-					      filemap_get_page(bitmap, offset >> bitmap->chunkshift),
-					      BITMAP_PAGE_PENDING);
+				bitmap_set_pending(bitmap, offset);
 				bitmap->allclean = 0;
 			}
 		}
@@ -1551,11 +1564,9 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
 		return;
 	}
 	if (!*bmc) {
-		struct page *page;
 		*bmc = 2 | (needed ? NEEDED_MASK : 0);
 		bitmap_count_page(bitmap, offset, 1);
-		page = filemap_get_page(bitmap, offset >> bitmap->chunkshift);
-		set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
+		bitmap_set_pending(bitmap, offset);
 		bitmap->allclean = 0;
 	}
 	spin_unlock_irq(&bitmap->lock);
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index b44b0aba2d4..79e17983473 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -159,10 +159,15 @@ struct bitmap_page {
 	 * pointer and use it as two counters itself
 	 */
 	unsigned int hijacked:1;
+	/*
+	 * If any counter in this page is '1' or '2' - and so could be
+	 * cleared then that page is marked as 'pending'
+	 */
+	unsigned int pending:1;
 	/*
 	 * count of dirty bits on the page
 	 */
-	unsigned int  count:31;
+	unsigned int  count:30;
 };
 
 /* the main bitmap structure - one per mddev */

From 6409bb05a9831f6af36a20b97cda13059c2ef1b6 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:07 +1000
Subject: [PATCH 26/56] md/bitmap: add new 'space' attribute for bitmaps.

If we are to allow bitmaps to be resized when the array is resized,
we need to know how much space there is.

So create an attribute to store this information and set appropriate
defaults.

It can be set more precisely via sysfs, or future metadata extensions
may allow it to be recorded.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 39 +++++++++++++++++++++++++++++++++++++++
 drivers/md/md.c     | 33 +++++++++++++++++++++++++++++++--
 drivers/md/md.h     |  3 +++
 3 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index c7784a98567..ac688fb54e1 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1934,6 +1934,44 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
 static struct md_sysfs_entry bitmap_location =
 __ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store);
 
+/* 'bitmap/space' is the space available at 'location' for the
+ * bitmap.  This allows the kernel to know when it is safe to
+ * resize the bitmap to match a resized array.
+ */
+static ssize_t
+space_show(struct mddev *mddev, char *page)
+{
+	return sprintf(page, "%lu\n", mddev->bitmap_info.space);
+}
+
+static ssize_t
+space_store(struct mddev *mddev, const char *buf, size_t len)
+{
+	unsigned long sectors;
+	int rv;
+
+	rv = kstrtoul(buf, 10, &sectors);
+	if (rv)
+		return rv;
+
+	if (sectors == 0)
+		return -EINVAL;
+
+	if (mddev->bitmap &&
+	    sectors  < ((mddev->bitmap->file_pages - 1) * PAGE_SIZE
+			+ mddev->bitmap->last_page_size + 511) >> 9)
+		return -EFBIG; /* Bitmap is too big for this small space */
+
+	/* could make sure it isn't too big, but that isn't really
+	 * needed - user-space should be careful.
+	 */
+	mddev->bitmap_info.space = sectors;
+	return len;
+}
+
+static struct md_sysfs_entry bitmap_space =
+__ATTR(space, S_IRUGO|S_IWUSR, space_show, space_store);
+
 static ssize_t
 timeout_show(struct mddev *mddev, char *page)
 {
@@ -2109,6 +2147,7 @@ __ATTR(max_backlog_used, S_IRUGO | S_IWUSR,
 
 static struct attribute *md_bitmap_attrs[] = {
 	&bitmap_location.attr,
+	&bitmap_space.attr,
 	&bitmap_timeout.attr,
 	&bitmap_backlog.attr,
 	&bitmap_chunksize.attr,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ac99616f48d..9a677f2078a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1197,7 +1197,10 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
 		mddev->dev_sectors = ((sector_t)sb->size) * 2;
 		mddev->events = ev1;
 		mddev->bitmap_info.offset = 0;
+		mddev->bitmap_info.space = 0;
+		/* bitmap can use 60 K after the 4K superblocks */
 		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
+		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
 		mddev->reshape_backwards = 0;
 
 		if (mddev->minor_version >= 91) {
@@ -1234,9 +1237,12 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
 		mddev->max_disks = MD_SB_DISKS;
 
 		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
-		    mddev->bitmap_info.file == NULL)
+		    mddev->bitmap_info.file == NULL) {
 			mddev->bitmap_info.offset =
 				mddev->bitmap_info.default_offset;
+			mddev->bitmap_info.space =
+				mddev->bitmap_info.space;
+		}
 
 	} else if (mddev->pers == NULL) {
 		/* Insist on good event counter while assembling, except
@@ -1677,7 +1683,12 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 		mddev->dev_sectors = le64_to_cpu(sb->size);
 		mddev->events = ev1;
 		mddev->bitmap_info.offset = 0;
+		mddev->bitmap_info.space = 0;
+		/* Default location for bitmap is 1K after superblock
+		 * using 3K - total of 4K
+		 */
 		mddev->bitmap_info.default_offset = 1024 >> 9;
+		mddev->bitmap_info.default_space = (4096-1024) >> 9;
 		mddev->reshape_backwards = 0;
 
 		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
@@ -1686,9 +1697,23 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 		mddev->max_disks =  (4096-256)/2;
 
 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
-		    mddev->bitmap_info.file == NULL )
+		    mddev->bitmap_info.file == NULL) {
 			mddev->bitmap_info.offset =
 				(__s32)le32_to_cpu(sb->bitmap_offset);
+			/* Metadata doesn't record how much space is available.
+			 * For 1.0, we assume we can use up to the superblock
+			 * if before, else to 4K beyond superblock.
+			 * For others, assume no change is possible.
+			 */
+			if (mddev->minor_version > 0)
+				mddev->bitmap_info.space = 0;
+			else if (mddev->bitmap_info.offset > 0)
+				mddev->bitmap_info.space =
+					8 - mddev->bitmap_info.offset;
+			else
+				mddev->bitmap_info.space =
+					-mddev->bitmap_info.offset;
+		}
 
 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
 			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
@@ -5280,6 +5305,7 @@ static void md_clean(struct mddev *mddev)
 	mddev->merge_check_needed = 0;
 	mddev->bitmap_info.offset = 0;
 	mddev->bitmap_info.default_offset = 0;
+	mddev->bitmap_info.default_space = 0;
 	mddev->bitmap_info.chunksize = 0;
 	mddev->bitmap_info.daemon_sleep = 0;
 	mddev->bitmap_info.max_write_behind = 0;
@@ -6076,6 +6102,7 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info)
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 
 	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
+	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
 	mddev->bitmap_info.offset = 0;
 
 	mddev->reshape_position = MaxSector;
@@ -6258,6 +6285,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 				return -EINVAL;
 			mddev->bitmap_info.offset =
 				mddev->bitmap_info.default_offset;
+			mddev->bitmap_info.space =
+				mddev->bitmap_info.default_space;
 			mddev->pers->quiesce(mddev, 1);
 			rv = bitmap_create(mddev);
 			if (!rv)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 360937389e6..7b4a3c318ca 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -393,10 +393,13 @@ struct mddev {
 						 * For external metadata, offset
 						 * from start of device. 
 						 */
+		unsigned long		space; /* space available at this offset */
 		loff_t			default_offset; /* this is the offset to use when
 							 * hot-adding a bitmap.  It should
 							 * eventually be settable by sysfs.
 							 */
+		unsigned long		default_space; /* space available at
+							* default offset */
 		struct mutex		mutex;
 		unsigned long		chunksize;
 		unsigned long		daemon_sleep; /* how many jiffies between updates? */

From ef99bf480de9bde9d3b2afdf05324670fab4e571 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:08 +1000
Subject: [PATCH 27/56] md/bitmap: allow a bitmap with no backing storage.

An md bitmap comprises two parts
 - internal counting of active writes per 'chunk'.
 - external storage of whether there are any active writes on
   each chunk

The second requires the first, but the first doesn't require the
second.

Not having backing storage means that the bitmap cannot expedite
resync after a crash, but it still allows us to expedite the recovery
of a recently-removed device.

So: allow a bitmap to exist even if there is no backing device.
In that case we default to 128M chunks.

A particular value of this is that we can remove and re-add a bitmap
(possibly of a different granularity) on a degraded array, and not
lose the information needed to fast-recover the missing device.

We don't actually activate these bitmaps yet - that will come
in a later patch.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 136 +++++++++++++++++++++++++-------------------
 drivers/md/md.c     |   5 +-
 2 files changed, 79 insertions(+), 62 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index ac688fb54e1..c042efd019c 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -553,6 +553,14 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	unsigned long long events;
 	int err = -EINVAL;
 
+	if (!bitmap->file && !bitmap->mddev->bitmap_info.offset) {
+		chunksize = 128 * 1024 * 1024;
+		daemon_sleep = 5 * HZ;
+		write_behind = 0;
+		bitmap->flags = BITMAP_STALE;
+		err = 0;
+		goto out_no_sb;
+	}
 	/* page 0 is the superblock, read it... */
 	if (bitmap->file) {
 		loff_t isize = i_size_read(bitmap->file->f_mapping->host);
@@ -623,18 +631,19 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	}
 
 	/* assign fields using values from superblock */
-	bitmap->mddev->bitmap_info.chunksize = chunksize;
-	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
-	bitmap->mddev->bitmap_info.max_write_behind = write_behind;
 	bitmap->flags |= le32_to_cpu(sb->state);
 	if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
 		bitmap->flags |= BITMAP_HOSTENDIAN;
 	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
-	if (bitmap->flags & BITMAP_STALE)
-		bitmap->events_cleared = bitmap->mddev->events;
 	err = 0;
 out:
 	kunmap_atomic(sb);
+out_no_sb:
+	if (bitmap->flags & BITMAP_STALE)
+		bitmap->events_cleared = bitmap->mddev->events;
+	bitmap->mddev->bitmap_info.chunksize = chunksize;
+	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
+	bitmap->mddev->bitmap_info.max_write_behind = write_behind;
 	if (err)
 		bitmap_print_sb(bitmap);
 	return err;
@@ -837,9 +846,6 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 	void *kaddr;
 	unsigned long chunk = block >> bitmap->chunkshift;
 
-	if (!bitmap->filemap)
-		return;
-
 	page = filemap_get_page(bitmap, chunk);
 	if (!page)
 		return;
@@ -857,6 +863,29 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 	set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
 }
 
+static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
+{
+	unsigned long bit;
+	struct page *page;
+	void *paddr;
+	unsigned long chunk = block >> bitmap->chunkshift;
+
+	page = filemap_get_page(bitmap, chunk);
+	if (!page)
+		return;
+	bit = file_page_offset(bitmap, chunk);
+	paddr = kmap_atomic(page);
+	if (bitmap->flags & BITMAP_HOSTENDIAN)
+		clear_bit(bit, paddr);
+	else
+		__clear_bit_le(bit, paddr);
+	kunmap_atomic(paddr);
+	if (!test_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE)) {
+		set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
+		bitmap->allclean = 0;
+	}
+}
+
 /* this gets called when the md device is ready to unplug its underlying
  * (slave) device queues -- before we let any writes go down, we need to
  * sync the dirty pages of the bitmap file to disk */
@@ -867,7 +896,7 @@ void bitmap_unplug(struct bitmap *bitmap)
 	struct page *page;
 	int wait = 0;
 
-	if (!bitmap)
+	if (!bitmap || !bitmap->filemap)
 		return;
 
 	/* look at each page to see if there are any set bits that need to be
@@ -930,7 +959,20 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 	chunks = bitmap->chunks;
 	file = bitmap->file;
 
-	BUG_ON(!file && !bitmap->mddev->bitmap_info.offset);
+	if (!file && !bitmap->mddev->bitmap_info.offset) {
+		/* No permanent bitmap - fill with '1s'. */
+		bitmap->filemap = NULL;
+		bitmap->file_pages = 0;
+		for (i = 0; i < chunks ; i++) {
+			/* if the disk bit is set, set the memory bit */
+			int needed = ((sector_t)(i+1) << (bitmap->chunkshift)
+				      >= start);
+			bitmap_set_memory_bits(bitmap,
+					       (sector_t)i << bitmap->chunkshift,
+					       needed);
+		}
+		return 0;
+	}
 
 	outofdate = bitmap->flags & BITMAP_STALE;
 	if (outofdate)
@@ -1045,15 +1087,6 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		}
 	}
 
-	/* everything went OK */
-	ret = 0;
-	bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET);
-
-	if (bit_cnt) { /* Kick recovery if any bits were set */
-		set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
-		md_wakeup_thread(bitmap->mddev->thread);
-	}
-
 	printk(KERN_INFO "%s: bitmap initialized from disk: "
 	       "read %lu/%lu pages, set %lu of %lu bits\n",
 	       bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, chunks);
@@ -1073,6 +1106,12 @@ void bitmap_write_all(struct bitmap *bitmap)
 	 */
 	int i;
 
+	if (!bitmap || !bitmap->filemap)
+		return;
+	if (bitmap->file)
+		/* Only one copy, so nothing needed */
+		return;
+
 	spin_lock_irq(&bitmap->lock);
 	for (i = 0; i < bitmap->file_pages; i++)
 		set_page_attr(bitmap, bitmap->filemap[i],
@@ -1115,7 +1154,6 @@ void bitmap_daemon_work(struct mddev *mddev)
 	unsigned long nextpage;
 	unsigned long flags;
 	sector_t blocks;
-	void *paddr;
 
 	/* Use a mutex to guard daemon_work against
 	 * bitmap_destroy.
@@ -1142,10 +1180,6 @@ void bitmap_daemon_work(struct mddev *mddev)
 	 * we will write it.
 	 */
 	spin_lock_irqsave(&bitmap->lock, flags);
-	if (!bitmap->filemap)
-		/* error or shutdown */
-		goto out;
-
 	for (j = 0; j < bitmap->file_pages; j++)
 		if (test_page_attr(bitmap, bitmap->filemap[j],
 				   BITMAP_PAGE_PENDING)) {
@@ -1161,11 +1195,14 @@ void bitmap_daemon_work(struct mddev *mddev)
 		 * other changes */
 		bitmap_super_t *sb;
 		bitmap->need_sync = 0;
-		sb = kmap_atomic(bitmap->sb_page);
-		sb->events_cleared =
-			cpu_to_le64(bitmap->events_cleared);
-		kunmap_atomic(sb);
-		set_page_attr(bitmap, bitmap->sb_page, BITMAP_PAGE_NEEDWRITE);
+		if (bitmap->filemap) {
+			sb = kmap_atomic(bitmap->sb_page);
+			sb->events_cleared =
+				cpu_to_le64(bitmap->events_cleared);
+			kunmap_atomic(sb);
+			set_page_attr(bitmap, bitmap->sb_page,
+				      BITMAP_PAGE_NEEDWRITE);
+		}
 	}
 	/* Now look at the bitmap counters and if any are '2' or '1',
 	 * decrement and handle accordingly.
@@ -1173,6 +1210,7 @@ void bitmap_daemon_work(struct mddev *mddev)
 	nextpage = 0;
 	for (j = 0; j < bitmap->chunks; j++) {
 		bitmap_counter_t *bmc;
+		sector_t  block = (sector_t)j << bitmap->chunkshift;
 
 		if (j == nextpage) {
 			nextpage += PAGE_COUNTER_RATIO;
@@ -1183,7 +1221,7 @@ void bitmap_daemon_work(struct mddev *mddev)
 			bitmap->bp[j >> PAGE_COUNTER_SHIFT].pending = 0;
 		}
 		bmc = bitmap_get_counter(bitmap,
-					 (sector_t)j << bitmap->chunkshift,
+					 block,
 					 &blocks, 0);
 
 		if (!bmc) {
@@ -1192,33 +1230,12 @@ void bitmap_daemon_work(struct mddev *mddev)
 		}
 		if (*bmc == 1 && !bitmap->need_sync) {
 			/* We can clear the bit */
-			struct page *page;
 			*bmc = 0;
-			bitmap_count_page(
-				bitmap,
-				(sector_t)j << bitmap->chunkshift,
-				-1);
-
-			page = filemap_get_page(bitmap, j);
-			paddr = kmap_atomic(page);
-			if (bitmap->flags & BITMAP_HOSTENDIAN)
-				clear_bit(file_page_offset(bitmap, j),
-					  paddr);
-			else
-				__clear_bit_le(file_page_offset(bitmap, j),
-					       paddr);
-			kunmap_atomic(paddr);
-			if (!test_page_attr(bitmap, page,
-					    BITMAP_PAGE_NEEDWRITE)) {
-				set_page_attr(bitmap, page,
-					      BITMAP_PAGE_PENDING);
-				bitmap->allclean = 0;
-			}
+			bitmap_count_page(bitmap, block, -1);
+			bitmap_file_clear_bit(bitmap, block);
 		} else if (*bmc && *bmc <= 2) {
 			*bmc = 1;
-			bitmap_set_pending(
-				bitmap,
-				(sector_t)j << bitmap->chunkshift);
+			bitmap_set_pending(bitmap, block);
 			bitmap->allclean = 0;
 		}
 	}
@@ -1249,7 +1266,6 @@ void bitmap_daemon_work(struct mddev *mddev)
 				break;
 		}
 	}
-out:
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 
  done:
@@ -1551,7 +1567,7 @@ EXPORT_SYMBOL(bitmap_cond_end_sync);
 static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
 {
 	/* For each chunk covered by any of these sectors, set the
-	 * counter to 1 and set resync_needed.  They should all
+	 * counter to 2 and possibly set resync_needed.  They should all
 	 * be 0 at this point
 	 */
 
@@ -1678,10 +1694,6 @@ int bitmap_create(struct mddev *mddev)
 
 	BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
 
-	if (!file
-	    && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */
-		return 0;
-
 	BUG_ON(file && mddev->bitmap_info.offset);
 
 	bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
@@ -1801,6 +1813,10 @@ int bitmap_load(struct mddev *mddev)
 
 	if (err)
 		goto out;
+	bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET);
+
+	/* Kick recovery in case any bits were set */
+	set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
 
 	mddev->thread->timeout = mddev->bitmap_info.daemon_sleep;
 	md_wakeup_thread(mddev->thread);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9a677f2078a..607771bb7e9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5139,7 +5139,8 @@ int md_run(struct mddev *mddev)
 		err = -EINVAL;
 		mddev->pers->stop(mddev);
 	}
-	if (err == 0 && mddev->pers->sync_request) {
+	if (err == 0 && mddev->pers->sync_request &&
+	    (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
 		err = bitmap_create(mddev);
 		if (err) {
 			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -7847,7 +7848,7 @@ void md_check_recovery(struct mddev *mddev)
 			goto unlock;
 
 		if (mddev->pers->sync_request) {
-			if (spares && mddev->bitmap && ! mddev->bitmap->file) {
+			if (spares) {
 				/* We are adding a device or devices to an array
 				 * which has the bitmap stored on all devices.
 				 * So make sure all bitmap pages get written

From 27581e5ae01f77b5472dc5c2368b41063fed7f37 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:08 +1000
Subject: [PATCH 28/56] md/bitmap: centralise allocation of bitmap file pages.

Instead of allocating pages in read_sb_page, read_page and
bitmap_read_sb, allocate them all in bitmap_init_from disk.

Also replace the hack of calling "attach_page_buffers(page, NULL)" to
ensure that free_buffer() won't complain, by putting a test for
PagePrivate in free_buffer().

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 149 ++++++++++++++++++++------------------------
 1 file changed, 68 insertions(+), 81 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index c042efd019c..324a198e8be 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -130,22 +130,14 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
  */
 
 /* IO operations when bitmap is stored near all superblocks */
-static struct page *read_sb_page(struct mddev *mddev, loff_t offset,
-				 struct page *page,
-				 unsigned long index, int size)
+static int read_sb_page(struct mddev *mddev, loff_t offset,
+			struct page *page,
+			unsigned long index, int size)
 {
 	/* choose a good rdev and read the page from there */
 
 	struct md_rdev *rdev;
 	sector_t target;
-	int did_alloc = 0;
-
-	if (!page) {
-		page = alloc_page(GFP_KERNEL);
-		if (!page)
-			return ERR_PTR(-ENOMEM);
-		did_alloc = 1;
-	}
 
 	rdev_for_each(rdev, mddev) {
 		if (! test_bit(In_sync, &rdev->flags)
@@ -158,15 +150,10 @@ static struct page *read_sb_page(struct mddev *mddev, loff_t offset,
 				 roundup(size, bdev_logical_block_size(rdev->bdev)),
 				 page, READ, true)) {
 			page->index = index;
-			attach_page_buffers(page, NULL); /* so that free_buffer will
-							  * quietly no-op */
-			return page;
+			return 0;
 		}
 	}
-	if (did_alloc)
-		put_page(page);
-	return ERR_PTR(-EIO);
-
+	return -EIO;
 }
 
 static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev)
@@ -325,8 +312,12 @@ __clear_page_buffers(struct page *page)
 }
 static void free_buffers(struct page *page)
 {
-	struct buffer_head *bh = page_buffers(page);
+	struct buffer_head *bh;
 
+	if (!PagePrivate(page))
+		return;
+
+	bh = page_buffers(page);
 	while (bh) {
 		struct buffer_head *next = bh->b_this_page;
 		free_buffer_head(bh);
@@ -343,11 +334,12 @@ static void free_buffers(struct page *page)
  * This usage is similar to how swap files are handled, and allows us
  * to write to a file with no concerns of memory allocation failing.
  */
-static struct page *read_page(struct file *file, unsigned long index,
-			      struct bitmap *bitmap,
-			      unsigned long count)
+static int read_page(struct file *file, unsigned long index,
+		     struct bitmap *bitmap,
+		     unsigned long count,
+		     struct page *page)
 {
-	struct page *page = NULL;
+	int ret = 0;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct buffer_head *bh;
 	sector_t block;
@@ -355,16 +347,9 @@ static struct page *read_page(struct file *file, unsigned long index,
 	pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
 		 (unsigned long long)index << PAGE_SHIFT);
 
-	page = alloc_page(GFP_KERNEL);
-	if (!page)
-		page = ERR_PTR(-ENOMEM);
-	if (IS_ERR(page))
-		goto out;
-
 	bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0);
 	if (!bh) {
-		put_page(page);
-		page = ERR_PTR(-ENOMEM);
+		ret = -ENOMEM;
 		goto out;
 	}
 	attach_page_buffers(page, bh);
@@ -376,8 +361,7 @@ static struct page *read_page(struct file *file, unsigned long index,
 			bh->b_blocknr = bmap(inode, block);
 			if (bh->b_blocknr == 0) {
 				/* Cannot use this file! */
-				free_buffers(page);
-				page = ERR_PTR(-EINVAL);
+				ret = -EINVAL;
 				goto out;
 			}
 			bh->b_bdev = inode->i_sb->s_bdev;
@@ -400,17 +384,15 @@ static struct page *read_page(struct file *file, unsigned long index,
 
 	wait_event(bitmap->write_wait,
 		   atomic_read(&bitmap->pending_writes)==0);
-	if (bitmap->flags & BITMAP_WRITE_ERROR) {
-		free_buffers(page);
-		page = ERR_PTR(-EIO);
-	}
+	if (bitmap->flags & BITMAP_WRITE_ERROR)
+		ret = -EIO;
 out:
-	if (IS_ERR(page))
-		printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %ld\n",
+	if (ret)
+		printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n",
 			(int)PAGE_SIZE,
 			(unsigned long long)index << PAGE_SHIFT,
-			PTR_ERR(page));
-	return page;
+			ret);
+	return ret;
 }
 
 /*
@@ -552,6 +534,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	unsigned long chunksize, daemon_sleep, write_behind;
 	unsigned long long events;
 	int err = -EINVAL;
+	struct page *sb_page;
 
 	if (!bitmap->file && !bitmap->mddev->bitmap_info.offset) {
 		chunksize = 128 * 1024 * 1024;
@@ -562,24 +545,27 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 		goto out_no_sb;
 	}
 	/* page 0 is the superblock, read it... */
+	sb_page = alloc_page(GFP_KERNEL);
+	if (!sb_page)
+		return -ENOMEM;
+	bitmap->sb_page = sb_page;
+
 	if (bitmap->file) {
 		loff_t isize = i_size_read(bitmap->file->f_mapping->host);
 		int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;
 
-		bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes);
+		err = read_page(bitmap->file, 0,
+				bitmap, bytes, sb_page);
 	} else {
-		bitmap->sb_page = read_sb_page(bitmap->mddev,
-					       bitmap->mddev->bitmap_info.offset,
-					       NULL,
-					       0, sizeof(bitmap_super_t));
+		err = read_sb_page(bitmap->mddev,
+				   bitmap->mddev->bitmap_info.offset,
+				   sb_page,
+				   0, sizeof(bitmap_super_t));
 	}
-	if (IS_ERR(bitmap->sb_page)) {
-		err = PTR_ERR(bitmap->sb_page);
-		bitmap->sb_page = NULL;
+	if (err)
 		return err;
-	}
 
-	sb = kmap_atomic(bitmap->sb_page);
+	sb = kmap_atomic(sb_page);
 
 	chunksize = le32_to_cpu(sb->chunksize);
 	daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
@@ -948,7 +934,8 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
 static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 {
 	unsigned long i, chunks, index, oldindex, bit;
-	struct page *page = NULL, *oldpage = NULL;
+	int pnum;
+	struct page *page = NULL;
 	unsigned long num_pages, bit_cnt = 0;
 	struct file *file;
 	unsigned long bytes, offset;
@@ -999,6 +986,22 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 	if (!bitmap->filemap)
 		goto err;
 
+	pnum = 0;
+	offset = 0;
+	if (bitmap->sb_page) {
+		bitmap->filemap[0] = bitmap->sb_page;
+		pnum = 1;
+		offset = sizeof(bitmap_super_t);
+	}
+	for ( ; pnum < num_pages; pnum++) {
+		bitmap->filemap[pnum] = alloc_page(GFP_KERNEL);
+		if (!bitmap->filemap[pnum]) {
+			bitmap->file_pages = pnum;
+			goto err;
+		}
+	}
+	bitmap->file_pages = pnum;
+
 	/* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */
 	bitmap->filemap_attr = kzalloc(
 		roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)),
@@ -1019,39 +1022,22 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 				count = bytes - index * PAGE_SIZE;
 			else
 				count = PAGE_SIZE;
-			if (index == 0 && bitmap->sb_page) {
-				/*
-				 * if we're here then the superblock page
-				 * contains some bits (PAGE_SIZE != sizeof sb)
-				 * we've already read it in, so just use it
-				 */
-				page = bitmap->sb_page;
-				offset = sizeof(bitmap_super_t);
-				if (!file)
-					page = read_sb_page(
-						bitmap->mddev,
-						bitmap->mddev->bitmap_info.offset,
-						page,
-						index, count);
-			} else if (file) {
-				page = read_page(file, index, bitmap, count);
-				offset = 0;
-			} else {
-				page = read_sb_page(bitmap->mddev,
-						    bitmap->mddev->bitmap_info.offset,
-						    NULL,
-						    index, count);
-				offset = 0;
-			}
-			if (IS_ERR(page)) { /* read error */
-				ret = PTR_ERR(page);
+			page = bitmap->filemap[index];
+			if (file)
+				ret = read_page(file, index, bitmap,
+						count, page);
+			else
+				ret = read_sb_page(
+					bitmap->mddev,
+					bitmap->mddev->bitmap_info.offset,
+					page,
+					index, count);
+
+			if (ret)
 				goto err;
-			}
 
 			oldindex = index;
-			oldpage = page;
 
-			bitmap->filemap[bitmap->file_pages++] = page;
 			bitmap->last_page_size = count;
 
 			if (outofdate) {
@@ -1085,6 +1071,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 					       needed);
 			bit_cnt++;
 		}
+		offset = 0;
 	}
 
 	printk(KERN_INFO "%s: bitmap initialized from disk: "

From d189122d4bca2647441fdc170dc6d4cc5c8bb2fa Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:09 +1000
Subject: [PATCH 29/56] md/bitmap: change *_page_attr() to take a page number,
 not a page.

Most often we have the page number, not the page.  And that is what
the  *_page_attr() functions really want.  So change the arguments to
take that number.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 55 +++++++++++++++++++++------------------------
 1 file changed, 26 insertions(+), 29 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 324a198e8be..511f0ed5a5d 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -800,22 +800,22 @@ enum bitmap_page_attr {
 	BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */
 };
 
-static inline void set_page_attr(struct bitmap *bitmap, struct page *page,
-				enum bitmap_page_attr attr)
+static inline void set_page_attr(struct bitmap *bitmap, int pnum,
+				 enum bitmap_page_attr attr)
 {
-	__set_bit((page->index<<2) + attr, bitmap->filemap_attr);
+	__set_bit((pnum<<2) + attr, bitmap->filemap_attr);
 }
 
-static inline void clear_page_attr(struct bitmap *bitmap, struct page *page,
-				enum bitmap_page_attr attr)
+static inline void clear_page_attr(struct bitmap *bitmap, int pnum,
+				   enum bitmap_page_attr attr)
 {
-	__clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
+	__clear_bit((pnum<<2) + attr, bitmap->filemap_attr);
 }
 
-static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page,
+static inline unsigned long test_page_attr(struct bitmap *bitmap, int pnum,
 					   enum bitmap_page_attr attr)
 {
-	return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
+	return test_bit((pnum<<2) + attr, bitmap->filemap_attr);
 }
 
 /*
@@ -846,7 +846,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 	kunmap_atomic(kaddr);
 	pr_debug("set file bit %lu page %lu\n", bit, page->index);
 	/* record page number so it gets flushed to disk when unplug occurs */
-	set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
+	set_page_attr(bitmap, page->index, BITMAP_PAGE_DIRTY);
 }
 
 static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
@@ -866,8 +866,8 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
 	else
 		__clear_bit_le(bit, paddr);
 	kunmap_atomic(paddr);
-	if (!test_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE)) {
-		set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
+	if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) {
+		set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING);
 		bitmap->allclean = 0;
 	}
 }
@@ -879,7 +879,6 @@ void bitmap_unplug(struct bitmap *bitmap)
 {
 	unsigned long i, flags;
 	int dirty, need_write;
-	struct page *page;
 	int wait = 0;
 
 	if (!bitmap || !bitmap->filemap)
@@ -893,19 +892,18 @@ void bitmap_unplug(struct bitmap *bitmap)
 			spin_unlock_irqrestore(&bitmap->lock, flags);
 			return;
 		}
-		page = bitmap->filemap[i];
-		dirty = test_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
-		need_write = test_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
-		clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
-		clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
+		dirty = test_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
+		need_write = test_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
+		clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
+		clear_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
 		if (dirty || need_write)
-			clear_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
+			clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
 		if (dirty)
 			wait = 1;
 		spin_unlock_irqrestore(&bitmap->lock, flags);
 
 		if (dirty || need_write)
-			write_page(bitmap, page, 0);
+			write_page(bitmap, bitmap->filemap[i], 0);
 	}
 	if (wait) { /* if any writes were performed, we need to wait on them */
 		if (bitmap->file)
@@ -1101,7 +1099,7 @@ void bitmap_write_all(struct bitmap *bitmap)
 
 	spin_lock_irq(&bitmap->lock);
 	for (i = 0; i < bitmap->file_pages; i++)
-		set_page_attr(bitmap, bitmap->filemap[i],
+		set_page_attr(bitmap, i,
 			      BITMAP_PAGE_NEEDWRITE);
 	bitmap->allclean = 0;
 	spin_unlock_irq(&bitmap->lock);
@@ -1168,11 +1166,11 @@ void bitmap_daemon_work(struct mddev *mddev)
 	 */
 	spin_lock_irqsave(&bitmap->lock, flags);
 	for (j = 0; j < bitmap->file_pages; j++)
-		if (test_page_attr(bitmap, bitmap->filemap[j],
+		if (test_page_attr(bitmap, j,
 				   BITMAP_PAGE_PENDING)) {
-			set_page_attr(bitmap, bitmap->filemap[j],
+			set_page_attr(bitmap, j,
 				      BITMAP_PAGE_NEEDWRITE);
-			clear_page_attr(bitmap, bitmap->filemap[j],
+			clear_page_attr(bitmap, j,
 					BITMAP_PAGE_PENDING);
 		}
 
@@ -1187,7 +1185,7 @@ void bitmap_daemon_work(struct mddev *mddev)
 			sb->events_cleared =
 				cpu_to_le64(bitmap->events_cleared);
 			kunmap_atomic(sb);
-			set_page_attr(bitmap, bitmap->sb_page,
+			set_page_attr(bitmap, 0,
 				      BITMAP_PAGE_NEEDWRITE);
 		}
 	}
@@ -1236,18 +1234,17 @@ void bitmap_daemon_work(struct mddev *mddev)
 	 * We mustn't write any other blocks before the superblock.
 	 */
 	for (j = 0; j < bitmap->file_pages; j++) {
-		struct page *page = bitmap->filemap[j];
 
-		if (test_page_attr(bitmap, page,
+		if (test_page_attr(bitmap, j,
 				   BITMAP_PAGE_DIRTY))
 			/* bitmap_unplug will handle the rest */
 			break;
-		if (test_page_attr(bitmap, page,
+		if (test_page_attr(bitmap, j,
 				   BITMAP_PAGE_NEEDWRITE)) {
-			clear_page_attr(bitmap, page,
+			clear_page_attr(bitmap, j,
 					BITMAP_PAGE_NEEDWRITE);
 			spin_unlock_irqrestore(&bitmap->lock, flags);
-			write_page(bitmap, page, 0);
+			write_page(bitmap, bitmap->filemap[j], 0);
 			spin_lock_irqsave(&bitmap->lock, flags);
 			if (!bitmap->filemap)
 				break;

From 1ec885cdd01a9ad867dbb9fd32a1bfcc0875c486 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:10 +1000
Subject: [PATCH 30/56] md/bitmap: move some fields of 'struct bitmap' into a
 'storage' substruct.

This new 'struct bitmap_storage' reflects the external storage of the
bitmap.
Having this clearly defined will make it easier to change the storage
used while the array is active.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 180 +++++++++++++++++++++++---------------------
 drivers/md/bitmap.h |  17 +++--
 drivers/md/md.c     |   9 ++-
 3 files changed, 110 insertions(+), 96 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 511f0ed5a5d..c98f2fee1bd 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -195,6 +195,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 	struct md_rdev *rdev = NULL;
 	struct block_device *bdev;
 	struct mddev *mddev = bitmap->mddev;
+	struct bitmap_storage *store = &bitmap->storage;
 
 	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
 		int size = PAGE_SIZE;
@@ -202,8 +203,8 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 
 		bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
 
-		if (page->index == bitmap->file_pages-1)
-			size = roundup(bitmap->last_page_size,
+		if (page->index == store->file_pages-1)
+			size = roundup(store->last_page_size,
 				       bdev_logical_block_size(bdev));
 		/* Just make sure we aren't corrupting data or
 		 * metadata
@@ -263,7 +264,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
 {
 	struct buffer_head *bh;
 
-	if (bitmap->file == NULL) {
+	if (bitmap->storage.file == NULL) {
 		switch (write_sb_page(bitmap, page, wait)) {
 		case -EINVAL:
 			bitmap->flags |= BITMAP_WRITE_ERROR;
@@ -408,9 +409,9 @@ void bitmap_update_sb(struct bitmap *bitmap)
 		return;
 	if (bitmap->mddev->bitmap_info.external)
 		return;
-	if (!bitmap->sb_page) /* no superblock */
+	if (!bitmap->storage.sb_page) /* no superblock */
 		return;
-	sb = kmap_atomic(bitmap->sb_page);
+	sb = kmap_atomic(bitmap->storage.sb_page);
 	sb->events = cpu_to_le64(bitmap->mddev->events);
 	if (bitmap->mddev->events < bitmap->events_cleared)
 		/* rocking back to read-only */
@@ -421,7 +422,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
 	sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
 	kunmap_atomic(sb);
-	write_page(bitmap, bitmap->sb_page, 1);
+	write_page(bitmap, bitmap->storage.sb_page, 1);
 }
 
 /* print out the bitmap file superblock */
@@ -429,9 +430,9 @@ void bitmap_print_sb(struct bitmap *bitmap)
 {
 	bitmap_super_t *sb;
 
-	if (!bitmap || !bitmap->sb_page)
+	if (!bitmap || !bitmap->storage.sb_page)
 		return;
-	sb = kmap_atomic(bitmap->sb_page);
+	sb = kmap_atomic(bitmap->storage.sb_page);
 	printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
 	printk(KERN_DEBUG "         magic: %08x\n", le32_to_cpu(sb->magic));
 	printk(KERN_DEBUG "       version: %d\n", le32_to_cpu(sb->version));
@@ -470,15 +471,15 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
 	unsigned long chunksize, daemon_sleep, write_behind;
 	int err = -EINVAL;
 
-	bitmap->sb_page = alloc_page(GFP_KERNEL);
-	if (IS_ERR(bitmap->sb_page)) {
-		err = PTR_ERR(bitmap->sb_page);
-		bitmap->sb_page = NULL;
+	bitmap->storage.sb_page = alloc_page(GFP_KERNEL);
+	if (IS_ERR(bitmap->storage.sb_page)) {
+		err = PTR_ERR(bitmap->storage.sb_page);
+		bitmap->storage.sb_page = NULL;
 		return err;
 	}
-	bitmap->sb_page->index = 0;
+	bitmap->storage.sb_page->index = 0;
 
-	sb = kmap_atomic(bitmap->sb_page);
+	sb = kmap_atomic(bitmap->storage.sb_page);
 
 	sb->magic = cpu_to_le32(BITMAP_MAGIC);
 	sb->version = cpu_to_le32(BITMAP_MAJOR_HI);
@@ -536,7 +537,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	int err = -EINVAL;
 	struct page *sb_page;
 
-	if (!bitmap->file && !bitmap->mddev->bitmap_info.offset) {
+	if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) {
 		chunksize = 128 * 1024 * 1024;
 		daemon_sleep = 5 * HZ;
 		write_behind = 0;
@@ -548,13 +549,13 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	sb_page = alloc_page(GFP_KERNEL);
 	if (!sb_page)
 		return -ENOMEM;
-	bitmap->sb_page = sb_page;
+	bitmap->storage.sb_page = sb_page;
 
-	if (bitmap->file) {
-		loff_t isize = i_size_read(bitmap->file->f_mapping->host);
+	if (bitmap->storage.file) {
+		loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
 		int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;
 
-		err = read_page(bitmap->file, 0,
+		err = read_page(bitmap->storage.file, 0,
 				bitmap, bytes, sb_page);
 	} else {
 		err = read_sb_page(bitmap->mddev,
@@ -647,9 +648,9 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
 	bitmap_super_t *sb;
 	int old;
 
-	if (!bitmap->sb_page) /* can't set the state */
+	if (!bitmap->storage.sb_page) /* can't set the state */
 		return 0;
-	sb = kmap_atomic(bitmap->sb_page);
+	sb = kmap_atomic(bitmap->storage.sb_page);
 	old = le32_to_cpu(sb->state) & bits;
 	switch (op) {
 	case MASK_SET:
@@ -678,17 +679,19 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
  * file a page at a time. There's a superblock at the start of the file.
  */
 /* calculate the index of the page that contains this bit */
-static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk)
+static inline unsigned long file_page_index(struct bitmap_storage *store,
+					    unsigned long chunk)
 {
-	if (!bitmap->mddev->bitmap_info.external)
+	if (store->sb_page)
 		chunk += sizeof(bitmap_super_t) << 3;
 	return chunk >> PAGE_BIT_SHIFT;
 }
 
 /* calculate the (bit) offset of this bit within a page */
-static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk)
+static inline unsigned long file_page_offset(struct bitmap_storage *store,
+					     unsigned long chunk)
 {
-	if (!bitmap->mddev->bitmap_info.external)
+	if (store->sb_page)
 		chunk += sizeof(bitmap_super_t) << 3;
 	return chunk & (PAGE_BITS - 1);
 }
@@ -700,13 +703,13 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon
  * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page
  * 0 or page 1
  */
-static inline struct page *filemap_get_page(struct bitmap *bitmap,
+static inline struct page *filemap_get_page(struct bitmap_storage *store,
 					    unsigned long chunk)
 {
-	if (file_page_index(bitmap, chunk) >= bitmap->file_pages)
+	if (file_page_index(store, chunk) >= store->file_pages)
 		return NULL;
-	return bitmap->filemap[file_page_index(bitmap, chunk)
-			       - file_page_index(bitmap, 0)];
+	return store->filemap[file_page_index(store, chunk)
+			      - file_page_index(store, 0)];
 }
 
 static void bitmap_file_unmap(struct bitmap *bitmap)
@@ -715,16 +718,17 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
 	unsigned long *attr;
 	int pages;
 	unsigned long flags;
+	struct bitmap_storage *store = &bitmap->storage;
 
 	spin_lock_irqsave(&bitmap->lock, flags);
-	map = bitmap->filemap;
-	bitmap->filemap = NULL;
-	attr = bitmap->filemap_attr;
-	bitmap->filemap_attr = NULL;
-	pages = bitmap->file_pages;
-	bitmap->file_pages = 0;
-	sb_page = bitmap->sb_page;
-	bitmap->sb_page = NULL;
+	map = store->filemap;
+	store->filemap = NULL;
+	attr = store->filemap_attr;
+	store->filemap_attr = NULL;
+	pages = store->file_pages;
+	store->file_pages = 0;
+	sb_page = store->sb_page;
+	store->sb_page = NULL;
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 
 	while (pages--)
@@ -743,8 +747,8 @@ static void bitmap_file_put(struct bitmap *bitmap)
 	unsigned long flags;
 
 	spin_lock_irqsave(&bitmap->lock, flags);
-	file = bitmap->file;
-	bitmap->file = NULL;
+	file = bitmap->storage.file;
+	bitmap->storage.file = NULL;
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 
 	if (file)
@@ -771,11 +775,11 @@ static void bitmap_file_kick(struct bitmap *bitmap)
 	if (bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET) == 0) {
 		bitmap_update_sb(bitmap);
 
-		if (bitmap->file) {
+		if (bitmap->storage.file) {
 			path = kmalloc(PAGE_SIZE, GFP_KERNEL);
 			if (path)
-				ptr = d_path(&bitmap->file->f_path, path,
-					     PAGE_SIZE);
+				ptr = d_path(&bitmap->storage.file->f_path,
+					     path, PAGE_SIZE);
 
 			printk(KERN_ALERT
 			      "%s: kicking failed bitmap file %s from array!\n",
@@ -803,19 +807,19 @@ enum bitmap_page_attr {
 static inline void set_page_attr(struct bitmap *bitmap, int pnum,
 				 enum bitmap_page_attr attr)
 {
-	__set_bit((pnum<<2) + attr, bitmap->filemap_attr);
+	__set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
 }
 
 static inline void clear_page_attr(struct bitmap *bitmap, int pnum,
 				   enum bitmap_page_attr attr)
 {
-	__clear_bit((pnum<<2) + attr, bitmap->filemap_attr);
+	__clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
 }
 
 static inline unsigned long test_page_attr(struct bitmap *bitmap, int pnum,
 					   enum bitmap_page_attr attr)
 {
-	return test_bit((pnum<<2) + attr, bitmap->filemap_attr);
+	return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
 }
 
 /*
@@ -832,10 +836,10 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 	void *kaddr;
 	unsigned long chunk = block >> bitmap->chunkshift;
 
-	page = filemap_get_page(bitmap, chunk);
+	page = filemap_get_page(&bitmap->storage, chunk);
 	if (!page)
 		return;
-	bit = file_page_offset(bitmap, chunk);
+	bit = file_page_offset(&bitmap->storage, chunk);
 
 	/* set the bit */
 	kaddr = kmap_atomic(page);
@@ -856,10 +860,10 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
 	void *paddr;
 	unsigned long chunk = block >> bitmap->chunkshift;
 
-	page = filemap_get_page(bitmap, chunk);
+	page = filemap_get_page(&bitmap->storage, chunk);
 	if (!page)
 		return;
-	bit = file_page_offset(bitmap, chunk);
+	bit = file_page_offset(&bitmap->storage, chunk);
 	paddr = kmap_atomic(page);
 	if (bitmap->flags & BITMAP_HOSTENDIAN)
 		clear_bit(bit, paddr);
@@ -881,14 +885,14 @@ void bitmap_unplug(struct bitmap *bitmap)
 	int dirty, need_write;
 	int wait = 0;
 
-	if (!bitmap || !bitmap->filemap)
+	if (!bitmap || !bitmap->storage.filemap)
 		return;
 
 	/* look at each page to see if there are any set bits that need to be
 	 * flushed out to disk */
-	for (i = 0; i < bitmap->file_pages; i++) {
+	for (i = 0; i < bitmap->storage.file_pages; i++) {
 		spin_lock_irqsave(&bitmap->lock, flags);
-		if (!bitmap->filemap) {
+		if (!bitmap->storage.filemap) {
 			spin_unlock_irqrestore(&bitmap->lock, flags);
 			return;
 		}
@@ -903,10 +907,10 @@ void bitmap_unplug(struct bitmap *bitmap)
 		spin_unlock_irqrestore(&bitmap->lock, flags);
 
 		if (dirty || need_write)
-			write_page(bitmap, bitmap->filemap[i], 0);
+			write_page(bitmap, bitmap->storage.filemap[i], 0);
 	}
 	if (wait) { /* if any writes were performed, we need to wait on them */
-		if (bitmap->file)
+		if (bitmap->storage.file)
 			wait_event(bitmap->write_wait,
 				   atomic_read(&bitmap->pending_writes)==0);
 		else
@@ -940,14 +944,15 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 	int outofdate;
 	int ret = -ENOSPC;
 	void *paddr;
+	struct bitmap_storage *store = &bitmap->storage;
 
 	chunks = bitmap->chunks;
-	file = bitmap->file;
+	file = store->file;
 
 	if (!file && !bitmap->mddev->bitmap_info.offset) {
 		/* No permanent bitmap - fill with '1s'. */
-		bitmap->filemap = NULL;
-		bitmap->file_pages = 0;
+		store->filemap = NULL;
+		store->file_pages = 0;
 		for (i = 0; i < chunks ; i++) {
 			/* if the disk bit is set, set the memory bit */
 			int needed = ((sector_t)(i+1) << (bitmap->chunkshift)
@@ -980,39 +985,40 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 
 	ret = -ENOMEM;
 
-	bitmap->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
-	if (!bitmap->filemap)
+	store->filemap = kmalloc(sizeof(struct page *)
+					  * num_pages, GFP_KERNEL);
+	if (!store->filemap)
 		goto err;
 
 	pnum = 0;
 	offset = 0;
-	if (bitmap->sb_page) {
-		bitmap->filemap[0] = bitmap->sb_page;
+	if (store->sb_page) {
+		store->filemap[0] = store->sb_page;
 		pnum = 1;
 		offset = sizeof(bitmap_super_t);
 	}
 	for ( ; pnum < num_pages; pnum++) {
-		bitmap->filemap[pnum] = alloc_page(GFP_KERNEL);
-		if (!bitmap->filemap[pnum]) {
-			bitmap->file_pages = pnum;
+		store->filemap[pnum] = alloc_page(GFP_KERNEL);
+		if (!store->filemap[pnum]) {
+			store->file_pages = pnum;
 			goto err;
 		}
 	}
-	bitmap->file_pages = pnum;
+	store->file_pages = pnum;
 
 	/* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */
-	bitmap->filemap_attr = kzalloc(
+	store->filemap_attr = kzalloc(
 		roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)),
 		GFP_KERNEL);
-	if (!bitmap->filemap_attr)
+	if (!store->filemap_attr)
 		goto err;
 
 	oldindex = ~0L;
 
 	for (i = 0; i < chunks; i++) {
 		int b;
-		index = file_page_index(bitmap, i);
-		bit = file_page_offset(bitmap, i);
+		index = file_page_index(&bitmap->storage, i);
+		bit = file_page_offset(&bitmap->storage, i);
 		if (index != oldindex) { /* this is a new page, read it in */
 			int count;
 			/* unmap the old page, we're done with it */
@@ -1020,7 +1026,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 				count = bytes - index * PAGE_SIZE;
 			else
 				count = PAGE_SIZE;
-			page = bitmap->filemap[index];
+			page = store->filemap[index];
 			if (file)
 				ret = read_page(file, index, bitmap,
 						count, page);
@@ -1036,7 +1042,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 
 			oldindex = index;
 
-			bitmap->last_page_size = count;
+			store->last_page_size = count;
 
 			if (outofdate) {
 				/*
@@ -1074,7 +1080,8 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 
 	printk(KERN_INFO "%s: bitmap initialized from disk: "
 	       "read %lu/%lu pages, set %lu of %lu bits\n",
-	       bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, chunks);
+	       bmname(bitmap), store->file_pages,
+	       num_pages, bit_cnt, chunks);
 
 	return 0;
 
@@ -1091,14 +1098,14 @@ void bitmap_write_all(struct bitmap *bitmap)
 	 */
 	int i;
 
-	if (!bitmap || !bitmap->filemap)
+	if (!bitmap || !bitmap->storage.filemap)
 		return;
-	if (bitmap->file)
+	if (bitmap->storage.file)
 		/* Only one copy, so nothing needed */
 		return;
 
 	spin_lock_irq(&bitmap->lock);
-	for (i = 0; i < bitmap->file_pages; i++)
+	for (i = 0; i < bitmap->storage.file_pages; i++)
 		set_page_attr(bitmap, i,
 			      BITMAP_PAGE_NEEDWRITE);
 	bitmap->allclean = 0;
@@ -1165,7 +1172,7 @@ void bitmap_daemon_work(struct mddev *mddev)
 	 * we will write it.
 	 */
 	spin_lock_irqsave(&bitmap->lock, flags);
-	for (j = 0; j < bitmap->file_pages; j++)
+	for (j = 0; j < bitmap->storage.file_pages; j++)
 		if (test_page_attr(bitmap, j,
 				   BITMAP_PAGE_PENDING)) {
 			set_page_attr(bitmap, j,
@@ -1180,8 +1187,8 @@ void bitmap_daemon_work(struct mddev *mddev)
 		 * other changes */
 		bitmap_super_t *sb;
 		bitmap->need_sync = 0;
-		if (bitmap->filemap) {
-			sb = kmap_atomic(bitmap->sb_page);
+		if (bitmap->storage.filemap) {
+			sb = kmap_atomic(bitmap->storage.sb_page);
 			sb->events_cleared =
 				cpu_to_le64(bitmap->events_cleared);
 			kunmap_atomic(sb);
@@ -1233,7 +1240,7 @@ void bitmap_daemon_work(struct mddev *mddev)
 	 * the first blocking holds the superblock and it has been updated.
 	 * We mustn't write any other blocks before the superblock.
 	 */
-	for (j = 0; j < bitmap->file_pages; j++) {
+	for (j = 0; j < bitmap->storage.file_pages; j++) {
 
 		if (test_page_attr(bitmap, j,
 				   BITMAP_PAGE_DIRTY))
@@ -1244,9 +1251,9 @@ void bitmap_daemon_work(struct mddev *mddev)
 			clear_page_attr(bitmap, j,
 					BITMAP_PAGE_NEEDWRITE);
 			spin_unlock_irqrestore(&bitmap->lock, flags);
-			write_page(bitmap, bitmap->filemap[j], 0);
+			write_page(bitmap, bitmap->storage.filemap[j], 0);
 			spin_lock_irqsave(&bitmap->lock, flags);
-			if (!bitmap->filemap)
+			if (!bitmap->storage.filemap)
 				break;
 		}
 	}
@@ -1700,7 +1707,7 @@ int bitmap_create(struct mddev *mddev)
 	} else
 		bitmap->sysfs_can_clear = NULL;
 
-	bitmap->file = file;
+	bitmap->storage.file = file;
 	if (file) {
 		get_file(file);
 		/* As future accesses to this file will use bmap,
@@ -1832,9 +1839,9 @@ void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
 		   << (PAGE_SHIFT - 10),
 		   chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize,
 		   chunk_kb ? "KB" : "B");
-	if (bitmap->file) {
+	if (bitmap->storage.file) {
 		seq_printf(seq, ", file: ");
-		seq_path(seq, &bitmap->file->f_path, " \t\n");
+		seq_path(seq, &bitmap->storage.file->f_path, " \t\n");
 	}
 
 	seq_printf(seq, "\n");
@@ -1958,8 +1965,9 @@ space_store(struct mddev *mddev, const char *buf, size_t len)
 		return -EINVAL;
 
 	if (mddev->bitmap &&
-	    sectors  < ((mddev->bitmap->file_pages - 1) * PAGE_SIZE
-			+ mddev->bitmap->last_page_size + 511) >> 9)
+	    sectors  < ((mddev->bitmap->storage.file_pages - 1)
+			      * PAGE_SIZE
+			+ mddev->bitmap->storage.last_page_size + 511) >> 9)
 		return -EFBIG; /* Bitmap is too big for this small space */
 
 	/* could make sure it isn't too big, but that isn't really
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 79e17983473..162ab095b86 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -188,12 +188,17 @@ struct bitmap {
 	/* bitmap spinlock */
 	spinlock_t lock;
 
-	struct file *file; /* backing disk file */
-	struct page *sb_page; /* cached copy of the bitmap file superblock */
-	struct page **filemap; /* list of cache pages for the file */
-	unsigned long *filemap_attr; /* attributes associated w/ filemap pages */
-	unsigned long file_pages; /* number of pages in the file */
-	int last_page_size; /* bytes in the last page */
+	struct bitmap_storage {
+		struct file *file;		/* backing disk file */
+		struct page *sb_page;		/* cached copy of the bitmap
+						 * file superblock */
+		struct page **filemap;		/* list of cache pages for
+						 * the file */
+		unsigned long *filemap_attr;	/* attributes associated
+						 * w/ filemap pages */
+		unsigned long file_pages;	/* number of pages in the file*/
+		int last_page_size;		/* bytes in the last page */
+	} storage;
 
 	unsigned long flags;
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 607771bb7e9..9e2336fbbd3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1989,7 +1989,7 @@ super_1_allow_new_offset(struct md_rdev *rdev,
 	bitmap = rdev->mddev->bitmap;
 	if (bitmap && !rdev->mddev->bitmap_info.file &&
 	    rdev->sb_start + rdev->mddev->bitmap_info.offset +
-	    bitmap->file_pages * (PAGE_SIZE>>9) > new_offset)
+	    bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
 		return 0;
 	if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
 		return 0;
@@ -5649,7 +5649,7 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg)
 		goto out;
 
 	/* bitmap disabled, zero the first byte and copy out */
-	if (!mddev->bitmap || !mddev->bitmap->file) {
+	if (!mddev->bitmap || !mddev->bitmap->storage.file) {
 		file->pathname[0] = '\0';
 		goto copy_out;
 	}
@@ -5658,7 +5658,8 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg)
 	if (!buf)
 		goto out;
 
-	ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname));
+	ptr = d_path(&mddev->bitmap->storage.file->f_path,
+		     buf, sizeof(file->pathname));
 	if (IS_ERR(ptr))
 		goto out;
 
@@ -6299,7 +6300,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 			/* remove the bitmap */
 			if (!mddev->bitmap)
 				return -ENOENT;
-			if (mddev->bitmap->file)
+			if (mddev->bitmap->storage.file)
 				return -EINVAL;
 			mddev->pers->quiesce(mddev, 1);
 			bitmap_destroy(mddev);

From 9b1215c102d4b12f6c815d7fdd35d0628db35b28 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:11 +1000
Subject: [PATCH 31/56] md/bitmap: store bytes in file rather than just in last
 page.

This number is more generally useful, and bytes-in-last-page is
easily extracted from it.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 16 +++++++++-------
 drivers/md/bitmap.h |  2 +-
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index c98f2fee1bd..7a3be0d4103 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -203,9 +203,13 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 
 		bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
 
-		if (page->index == store->file_pages-1)
-			size = roundup(store->last_page_size,
+		if (page->index == store->file_pages-1) {
+			int last_page_size = store->bytes & (PAGE_SIZE-1);
+			if (last_page_size == 0)
+				last_page_size = PAGE_SIZE;
+			size = roundup(last_page_size,
 				       bdev_logical_block_size(bdev));
+		}
 		/* Just make sure we aren't corrupting data or
 		 * metadata
 		 */
@@ -973,6 +977,8 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 	if (!bitmap->mddev->bitmap_info.external)
 		bytes += sizeof(bitmap_super_t);
 
+	store->bytes = bytes;
+
 	num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
 
 	if (file && i_size_read(file->f_mapping->host) < bytes) {
@@ -1042,8 +1048,6 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 
 			oldindex = index;
 
-			store->last_page_size = count;
-
 			if (outofdate) {
 				/*
 				 * if bitmap is out of date, dirty the
@@ -1965,9 +1969,7 @@ space_store(struct mddev *mddev, const char *buf, size_t len)
 		return -EINVAL;
 
 	if (mddev->bitmap &&
-	    sectors  < ((mddev->bitmap->storage.file_pages - 1)
-			      * PAGE_SIZE
-			+ mddev->bitmap->storage.last_page_size + 511) >> 9)
+	    sectors < (mddev->bitmap->storage.bytes + 511) >> 9)
 		return -EFBIG; /* Bitmap is too big for this small space */
 
 	/* could make sure it isn't too big, but that isn't really
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 162ab095b86..26689260e17 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -197,7 +197,7 @@ struct bitmap {
 		unsigned long *filemap_attr;	/* attributes associated
 						 * w/ filemap pages */
 		unsigned long file_pages;	/* number of pages in the file*/
-		int last_page_size;		/* bytes in the last page */
+		unsigned long bytes;		/* total bytes in the bitmap */
 	} storage;
 
 	unsigned long flags;

From d1244cb062750bdb2298ca2565239d3d8cbd91a8 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:12 +1000
Subject: [PATCH 32/56] md/bitmap: separate bitmap file allocation to its own
 function.

This will allow allocation before swapping in a new bitmap.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 115 ++++++++++++++++++++++++++------------------
 1 file changed, 68 insertions(+), 47 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 7a3be0d4103..4ac60ed66c4 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -716,6 +716,58 @@ static inline struct page *filemap_get_page(struct bitmap_storage *store,
 			      - file_page_index(store, 0)];
 }
 
+static int bitmap_storage_alloc(struct bitmap_storage *store,
+				unsigned long chunks, int with_super)
+{
+	int pnum;
+	unsigned long num_pages;
+	unsigned long bytes;
+
+	bytes = DIV_ROUND_UP(chunks, 8);
+	if (with_super)
+		bytes += sizeof(bitmap_super_t);
+
+	num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
+
+	store->filemap = kmalloc(sizeof(struct page *)
+				 * num_pages, GFP_KERNEL);
+	if (!store->filemap)
+		return -ENOMEM;
+
+	if (with_super && !store->sb_page) {
+		store->sb_page = alloc_page(GFP_KERNEL);
+		if (store->sb_page == NULL)
+			return -ENOMEM;
+		store->sb_page->index = 0;
+	}
+	pnum = 0;
+	if (store->sb_page) {
+		store->filemap[0] = store->sb_page;
+		pnum = 1;
+	}
+	for ( ; pnum < num_pages; pnum++) {
+		store->filemap[pnum] = alloc_page(GFP_KERNEL);
+		if (!store->filemap[pnum]) {
+			store->file_pages = pnum;
+			return -ENOMEM;
+		}
+		store->filemap[pnum]->index = pnum;
+	}
+	store->file_pages = pnum;
+
+	/* We need 4 bits per page, rounded up to a multiple
+	 * of sizeof(unsigned long) */
+	store->filemap_attr = kzalloc(
+		roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)),
+		GFP_KERNEL);
+	if (!store->filemap_attr)
+		return -ENOMEM;
+
+	store->bytes = bytes;
+
+	return 0;
+}
+
 static void bitmap_file_unmap(struct bitmap *bitmap)
 {
 	struct page **map, *sb_page;
@@ -940,11 +992,10 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
 static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 {
 	unsigned long i, chunks, index, oldindex, bit;
-	int pnum;
 	struct page *page = NULL;
-	unsigned long num_pages, bit_cnt = 0;
+	unsigned long bit_cnt = 0;
 	struct file *file;
-	unsigned long bytes, offset;
+	unsigned long offset;
 	int outofdate;
 	int ret = -ENOSPC;
 	void *paddr;
@@ -973,53 +1024,23 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		printk(KERN_INFO "%s: bitmap file is out of date, doing full "
 			"recovery\n", bmname(bitmap));
 
-	bytes = DIV_ROUND_UP(bitmap->chunks, 8);
-	if (!bitmap->mddev->bitmap_info.external)
-		bytes += sizeof(bitmap_super_t);
-
-	store->bytes = bytes;
-
-	num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
-
-	if (file && i_size_read(file->f_mapping->host) < bytes) {
+	if (file && i_size_read(file->f_mapping->host) < store->bytes) {
 		printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
-			bmname(bitmap),
-			(unsigned long) i_size_read(file->f_mapping->host),
-			bytes);
+		       bmname(bitmap),
+		       (unsigned long) i_size_read(file->f_mapping->host),
+		       store->bytes);
 		goto err;
 	}
 
-	ret = -ENOMEM;
-
-	store->filemap = kmalloc(sizeof(struct page *)
-					  * num_pages, GFP_KERNEL);
-	if (!store->filemap)
-		goto err;
-
-	pnum = 0;
-	offset = 0;
-	if (store->sb_page) {
-		store->filemap[0] = store->sb_page;
-		pnum = 1;
-		offset = sizeof(bitmap_super_t);
-	}
-	for ( ; pnum < num_pages; pnum++) {
-		store->filemap[pnum] = alloc_page(GFP_KERNEL);
-		if (!store->filemap[pnum]) {
-			store->file_pages = pnum;
-			goto err;
-		}
-	}
-	store->file_pages = pnum;
-
-	/* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */
-	store->filemap_attr = kzalloc(
-		roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)),
-		GFP_KERNEL);
-	if (!store->filemap_attr)
+	ret = bitmap_storage_alloc(&bitmap->storage, bitmap->chunks,
+				   !bitmap->mddev->bitmap_info.external);
+	if (ret)
 		goto err;
 
 	oldindex = ~0L;
+	offset = 0;
+	if (!bitmap->mddev->bitmap_info.external)
+		offset = sizeof(bitmap_super_t);
 
 	for (i = 0; i < chunks; i++) {
 		int b;
@@ -1028,8 +1049,8 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		if (index != oldindex) { /* this is a new page, read it in */
 			int count;
 			/* unmap the old page, we're done with it */
-			if (index == num_pages-1)
-				count = bytes - index * PAGE_SIZE;
+			if (index == store->file_pages-1)
+				count = store->bytes - index * PAGE_SIZE;
 			else
 				count = PAGE_SIZE;
 			page = store->filemap[index];
@@ -1083,9 +1104,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 	}
 
 	printk(KERN_INFO "%s: bitmap initialized from disk: "
-	       "read %lu/%lu pages, set %lu of %lu bits\n",
+	       "read %lu pages, set %lu of %lu bits\n",
 	       bmname(bitmap), store->file_pages,
-	       num_pages, bit_cnt, chunks);
+	       bit_cnt, chunks);
 
 	return 0;
 

From bc9891a8853842a19c33dda0ba02fbacf7da067f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:12 +1000
Subject: [PATCH 33/56] md/bitmap: move storage allocation from bitmap_load to
 bitmap_create.

We should allocate memory for the storage-bitmap at create-time, not
load time.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 4ac60ed66c4..1a93ed1fb8d 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1032,11 +1032,6 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		goto err;
 	}
 
-	ret = bitmap_storage_alloc(&bitmap->storage, bitmap->chunks,
-				   !bitmap->mddev->bitmap_info.external);
-	if (ret)
-		goto err;
-
 	oldindex = ~0L;
 	offset = 0;
 	if (!bitmap->mddev->bitmap_info.external)
@@ -1782,6 +1777,12 @@ int bitmap_create(struct mddev *mddev)
 	if (!bitmap->bp)
 		goto error;
 
+	if (file || mddev->bitmap_info.offset) {
+		err = bitmap_storage_alloc(&bitmap->storage, bitmap->chunks,
+					   !mddev->bitmap_info.external);
+		if (err)
+			goto error;
+	}
 	printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
 		pages, bmname(bitmap));
 

From edbb79df6731bb1e99c15f5a519a864d488f4808 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:13 +1000
Subject: [PATCH 34/56] md/bitmap: remove bitmap_mask_state

This function isn't really needed.  It sets or clears a flag in both
bitmap->flags and sb->state.
However both times it is called, bitmap_update_sb is called soon
afterwards which copies bitmap->flags to sb->state.
So just make changes to bitmap->flags, and open-code those rather than
hiding in a function.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 37 +++----------------------------------
 1 file changed, 3 insertions(+), 34 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 1a93ed1fb8d..2d25f3aebd4 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -640,38 +640,6 @@ out_no_sb:
 	return err;
 }
 
-enum bitmap_mask_op {
-	MASK_SET,
-	MASK_UNSET
-};
-
-/* record the state of the bitmap in the superblock.  Return the old value */
-static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
-			     enum bitmap_mask_op op)
-{
-	bitmap_super_t *sb;
-	int old;
-
-	if (!bitmap->storage.sb_page) /* can't set the state */
-		return 0;
-	sb = kmap_atomic(bitmap->storage.sb_page);
-	old = le32_to_cpu(sb->state) & bits;
-	switch (op) {
-	case MASK_SET:
-		sb->state |= cpu_to_le32(bits);
-		bitmap->flags |= bits;
-		break;
-	case MASK_UNSET:
-		sb->state &= cpu_to_le32(~bits);
-		bitmap->flags &= ~bits;
-		break;
-	default:
-		BUG();
-	}
-	kunmap_atomic(sb);
-	return old;
-}
-
 /*
  * general bitmap file operations
  */
@@ -828,7 +796,8 @@ static void bitmap_file_kick(struct bitmap *bitmap)
 {
 	char *path, *ptr = NULL;
 
-	if (bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET) == 0) {
+	if (!(bitmap->flags & BITMAP_STALE)) {
+		bitmap->flags |= BITMAP_STALE;
 		bitmap_update_sb(bitmap);
 
 		if (bitmap->storage.file) {
@@ -1830,7 +1799,7 @@ int bitmap_load(struct mddev *mddev)
 
 	if (err)
 		goto out;
-	bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET);
+	bitmap->flags &= ~BITMAP_STALE;
 
 	/* Kick recovery in case any bits were set */
 	set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);

From 84e923453e19d6427c6aa884d9561f01e1425d09 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:14 +1000
Subject: [PATCH 35/56] md/bitmap: remove single-bit manipulation on sb->state

Just do single-bit manipulations on bitmap->flags and copy whole
value between that and sb->state.

This will allow next patch which changes how bit manipulations are
performed on bitmap->flags.

This does result in BITMAP_STALE not being set in sb by
bitmap_read_sb, however as the setting is determined by other
information in the 'sb' we do not lose information this way.
Normally, bitmap_load will be called shortly which will clear
BITMAP_STALE anyway.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 2d25f3aebd4..09d360ff77d 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -522,7 +522,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
 	memcpy(sb->uuid, bitmap->mddev->uuid, 16);
 
 	bitmap->flags |= BITMAP_STALE;
-	sb->state |= cpu_to_le32(BITMAP_STALE);
+	sb->state = cpu_to_le32(bitmap->flags);
 	bitmap->events_cleared = bitmap->mddev->events;
 	sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
 
@@ -617,7 +617,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 			       "-- forcing full recovery\n",
 			       bmname(bitmap), events,
 			       (unsigned long long) bitmap->mddev->events);
-			sb->state |= cpu_to_le32(BITMAP_STALE);
+			bitmap->flags |= BITMAP_STALE;
 		}
 	}
 

From b405fe91e50c60c80e72d798025aea4917096421 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:15 +1000
Subject: [PATCH 36/56] md/bitmap: use set_bit, test_bit, etc for operation on
 bitmap->flags.

We currently use '&' and '|' which isn't the norm in the kernel
and doesn't allow easy atomicity.
So change to bit numbers and {set,clear,test}_bit.
This allows us to remove a spinlock/unlock (which was dubious anyway)
and some other simplifications.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 46 +++++++++++++++++++++------------------------
 drivers/md/bitmap.h |  6 +++---
 2 files changed, 24 insertions(+), 28 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 09d360ff77d..f3a9dffc3fc 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -271,7 +271,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
 	if (bitmap->storage.file == NULL) {
 		switch (write_sb_page(bitmap, page, wait)) {
 		case -EINVAL:
-			bitmap->flags |= BITMAP_WRITE_ERROR;
+			set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
 		}
 	} else {
 
@@ -289,20 +289,16 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
 			wait_event(bitmap->write_wait,
 				   atomic_read(&bitmap->pending_writes)==0);
 	}
-	if (bitmap->flags & BITMAP_WRITE_ERROR)
+	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
 		bitmap_file_kick(bitmap);
 }
 
 static void end_bitmap_write(struct buffer_head *bh, int uptodate)
 {
 	struct bitmap *bitmap = bh->b_private;
-	unsigned long flags;
 
-	if (!uptodate) {
-		spin_lock_irqsave(&bitmap->lock, flags);
-		bitmap->flags |= BITMAP_WRITE_ERROR;
-		spin_unlock_irqrestore(&bitmap->lock, flags);
-	}
+	if (!uptodate)
+		set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
 	if (atomic_dec_and_test(&bitmap->pending_writes))
 		wake_up(&bitmap->write_wait);
 }
@@ -389,7 +385,7 @@ static int read_page(struct file *file, unsigned long index,
 
 	wait_event(bitmap->write_wait,
 		   atomic_read(&bitmap->pending_writes)==0);
-	if (bitmap->flags & BITMAP_WRITE_ERROR)
+	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
 		ret = -EIO;
 out:
 	if (ret)
@@ -521,7 +517,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
 
 	memcpy(sb->uuid, bitmap->mddev->uuid, 16);
 
-	bitmap->flags |= BITMAP_STALE;
+	set_bit(BITMAP_STALE, &bitmap->flags);
 	sb->state = cpu_to_le32(bitmap->flags);
 	bitmap->events_cleared = bitmap->mddev->events;
 	sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
@@ -545,7 +541,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 		chunksize = 128 * 1024 * 1024;
 		daemon_sleep = 5 * HZ;
 		write_behind = 0;
-		bitmap->flags = BITMAP_STALE;
+		set_bit(BITMAP_STALE, &bitmap->flags);
 		err = 0;
 		goto out_no_sb;
 	}
@@ -617,20 +613,20 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 			       "-- forcing full recovery\n",
 			       bmname(bitmap), events,
 			       (unsigned long long) bitmap->mddev->events);
-			bitmap->flags |= BITMAP_STALE;
+			set_bit(BITMAP_STALE, &bitmap->flags);
 		}
 	}
 
 	/* assign fields using values from superblock */
 	bitmap->flags |= le32_to_cpu(sb->state);
 	if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
-		bitmap->flags |= BITMAP_HOSTENDIAN;
+		set_bit(BITMAP_HOSTENDIAN, &bitmap->flags);
 	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
 	err = 0;
 out:
 	kunmap_atomic(sb);
 out_no_sb:
-	if (bitmap->flags & BITMAP_STALE)
+	if (test_bit(BITMAP_STALE, &bitmap->flags))
 		bitmap->events_cleared = bitmap->mddev->events;
 	bitmap->mddev->bitmap_info.chunksize = chunksize;
 	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
@@ -796,8 +792,7 @@ static void bitmap_file_kick(struct bitmap *bitmap)
 {
 	char *path, *ptr = NULL;
 
-	if (!(bitmap->flags & BITMAP_STALE)) {
-		bitmap->flags |= BITMAP_STALE;
+	if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) {
 		bitmap_update_sb(bitmap);
 
 		if (bitmap->storage.file) {
@@ -868,7 +863,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 
 	/* set the bit */
 	kaddr = kmap_atomic(page);
-	if (bitmap->flags & BITMAP_HOSTENDIAN)
+	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
 		set_bit(bit, kaddr);
 	else
 		__set_bit_le(bit, kaddr);
@@ -890,7 +885,7 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
 		return;
 	bit = file_page_offset(&bitmap->storage, chunk);
 	paddr = kmap_atomic(page);
-	if (bitmap->flags & BITMAP_HOSTENDIAN)
+	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
 		clear_bit(bit, paddr);
 	else
 		__clear_bit_le(bit, paddr);
@@ -941,7 +936,7 @@ void bitmap_unplug(struct bitmap *bitmap)
 		else
 			md_super_wait(bitmap->mddev);
 	}
-	if (bitmap->flags & BITMAP_WRITE_ERROR)
+	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
 		bitmap_file_kick(bitmap);
 }
 EXPORT_SYMBOL(bitmap_unplug);
@@ -988,7 +983,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		return 0;
 	}
 
-	outofdate = bitmap->flags & BITMAP_STALE;
+	outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
 	if (outofdate)
 		printk(KERN_INFO "%s: bitmap file is out of date, doing full "
 			"recovery\n", bmname(bitmap));
@@ -1045,12 +1040,13 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 				write_page(bitmap, page, 1);
 
 				ret = -EIO;
-				if (bitmap->flags & BITMAP_WRITE_ERROR)
+				if (test_bit(BITMAP_WRITE_ERROR,
+					     &bitmap->flags))
 					goto err;
 			}
 		}
 		paddr = kmap_atomic(page);
-		if (bitmap->flags & BITMAP_HOSTENDIAN)
+		if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
 			b = test_bit(bit, paddr);
 		else
 			b = test_bit_le(bit, paddr);
@@ -1758,7 +1754,7 @@ int bitmap_create(struct mddev *mddev)
 	mddev->bitmap = bitmap;
 
 
-	return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0;
+	return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
 
  error:
 	bitmap_free(bitmap);
@@ -1799,7 +1795,7 @@ int bitmap_load(struct mddev *mddev)
 
 	if (err)
 		goto out;
-	bitmap->flags &= ~BITMAP_STALE;
+	clear_bit(BITMAP_STALE, &bitmap->flags);
 
 	/* Kick recovery in case any bits were set */
 	set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
@@ -1809,7 +1805,7 @@ int bitmap_load(struct mddev *mddev)
 
 	bitmap_update_sb(bitmap);
 
-	if (bitmap->flags & BITMAP_WRITE_ERROR)
+	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
 		err = -EIO;
 out:
 	return err;
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 26689260e17..e2d999a62ba 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -111,9 +111,9 @@ typedef __u16 bitmap_counter_t;
 
 /* use these for bitmap->flags and bitmap->sb->state bit-fields */
 enum bitmap_state {
-	BITMAP_STALE  = 0x002,  /* the bitmap file is out of date or had -EIO */
-	BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */
-	BITMAP_HOSTENDIAN = 0x8000,
+	BITMAP_STALE	   = 1,  /* the bitmap file is out of date or had -EIO */
+	BITMAP_WRITE_ERROR = 2, /* A write error has occurred */
+	BITMAP_HOSTENDIAN  =15,
 };
 
 /* the superblock at the front of the bitmap file -- little endian */

From 7466712347c58c367636cc875801c8703bc729e7 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:19 +1000
Subject: [PATCH 37/56] md/bitmap: convert some spin_lock_irqsave to
 spin_lock_irq

All of these sites can only be called from process context with
irqs enabled, so using irqsave/irqrestore just adds noise.
Remove it.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index f3a9dffc3fc..a1862d9e6d8 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -737,10 +737,9 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
 	struct page **map, *sb_page;
 	unsigned long *attr;
 	int pages;
-	unsigned long flags;
 	struct bitmap_storage *store = &bitmap->storage;
 
-	spin_lock_irqsave(&bitmap->lock, flags);
+	spin_lock_irq(&bitmap->lock);
 	map = store->filemap;
 	store->filemap = NULL;
 	attr = store->filemap_attr;
@@ -749,7 +748,7 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
 	store->file_pages = 0;
 	sb_page = store->sb_page;
 	store->sb_page = NULL;
-	spin_unlock_irqrestore(&bitmap->lock, flags);
+	spin_unlock_irq(&bitmap->lock);
 
 	while (pages--)
 		if (map[pages] != sb_page) /* 0 is sb_page, release it below */
@@ -764,12 +763,11 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
 static void bitmap_file_put(struct bitmap *bitmap)
 {
 	struct file *file;
-	unsigned long flags;
 
-	spin_lock_irqsave(&bitmap->lock, flags);
+	spin_lock_irq(&bitmap->lock);
 	file = bitmap->storage.file;
 	bitmap->storage.file = NULL;
-	spin_unlock_irqrestore(&bitmap->lock, flags);
+	spin_unlock_irq(&bitmap->lock);
 
 	if (file)
 		wait_event(bitmap->write_wait,
@@ -901,7 +899,7 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
  * sync the dirty pages of the bitmap file to disk */
 void bitmap_unplug(struct bitmap *bitmap)
 {
-	unsigned long i, flags;
+	unsigned long i;
 	int dirty, need_write;
 	int wait = 0;
 
@@ -911,9 +909,9 @@ void bitmap_unplug(struct bitmap *bitmap)
 	/* look at each page to see if there are any set bits that need to be
 	 * flushed out to disk */
 	for (i = 0; i < bitmap->storage.file_pages; i++) {
-		spin_lock_irqsave(&bitmap->lock, flags);
+		spin_lock_irq(&bitmap->lock);
 		if (!bitmap->storage.filemap) {
-			spin_unlock_irqrestore(&bitmap->lock, flags);
+			spin_unlock_irq(&bitmap->lock);
 			return;
 		}
 		dirty = test_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
@@ -924,7 +922,7 @@ void bitmap_unplug(struct bitmap *bitmap)
 			clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
 		if (dirty)
 			wait = 1;
-		spin_unlock_irqrestore(&bitmap->lock, flags);
+		spin_unlock_irq(&bitmap->lock);
 
 		if (dirty || need_write)
 			write_page(bitmap, bitmap->storage.filemap[i], 0);
@@ -1129,7 +1127,6 @@ void bitmap_daemon_work(struct mddev *mddev)
 	struct bitmap *bitmap;
 	unsigned long j;
 	unsigned long nextpage;
-	unsigned long flags;
 	sector_t blocks;
 
 	/* Use a mutex to guard daemon_work against
@@ -1156,7 +1153,7 @@ void bitmap_daemon_work(struct mddev *mddev)
 	 * So set NEEDWRITE now, then after we make any last-minute changes
 	 * we will write it.
 	 */
-	spin_lock_irqsave(&bitmap->lock, flags);
+	spin_lock_irq(&bitmap->lock);
 	for (j = 0; j < bitmap->storage.file_pages; j++)
 		if (test_page_attr(bitmap, j,
 				   BITMAP_PAGE_PENDING)) {
@@ -1235,14 +1232,14 @@ void bitmap_daemon_work(struct mddev *mddev)
 				   BITMAP_PAGE_NEEDWRITE)) {
 			clear_page_attr(bitmap, j,
 					BITMAP_PAGE_NEEDWRITE);
-			spin_unlock_irqrestore(&bitmap->lock, flags);
+			spin_unlock_irq(&bitmap->lock);
 			write_page(bitmap, bitmap->storage.filemap[j], 0);
-			spin_lock_irqsave(&bitmap->lock, flags);
+			spin_lock_irq(&bitmap->lock);
 			if (!bitmap->storage.filemap)
 				break;
 		}
 	}
-	spin_unlock_irqrestore(&bitmap->lock, flags);
+	spin_unlock_irq(&bitmap->lock);
 
  done:
 	if (bitmap->allclean == 0)
@@ -1815,12 +1812,11 @@ EXPORT_SYMBOL_GPL(bitmap_load);
 void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
 {
 	unsigned long chunk_kb;
-	unsigned long flags;
 
 	if (!bitmap)
 		return;
 
-	spin_lock_irqsave(&bitmap->lock, flags);
+	spin_lock_irq(&bitmap->lock);
 	chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10;
 	seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
 		   "%lu%s chunk",
@@ -1836,7 +1832,7 @@ void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
 	}
 
 	seq_printf(seq, "\n");
-	spin_unlock_irqrestore(&bitmap->lock, flags);
+	spin_unlock_irq(&bitmap->lock);
 }
 
 static ssize_t

From 62f82faaceb483284ea938b3a092b9f0a4d2d029 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:21 +1000
Subject: [PATCH 38/56] md/bitmap: remove async freeing of bitmap file.

There is no real value in freeing things the moment there is an error.
It is just as good to free the bitmap file and pages when the bitmap
is explicitly removed (and replaced?) or at shutdown.

With this gone, the bitmap will only disappear when the array is
quiescent, so we can remove some locking.

As the 'filemap' doesn't disappear now, include extra checks before
trying to write any of it out.
Also remove the check for "has it disappeared" in
bitmap_daemon_write().


Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index a1862d9e6d8..7e015c9ea73 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -739,7 +739,6 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
 	int pages;
 	struct bitmap_storage *store = &bitmap->storage;
 
-	spin_lock_irq(&bitmap->lock);
 	map = store->filemap;
 	store->filemap = NULL;
 	attr = store->filemap_attr;
@@ -748,7 +747,6 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
 	store->file_pages = 0;
 	sb_page = store->sb_page;
 	store->sb_page = NULL;
-	spin_unlock_irq(&bitmap->lock);
 
 	while (pages--)
 		if (map[pages] != sb_page) /* 0 is sb_page, release it below */
@@ -764,10 +762,8 @@ static void bitmap_file_put(struct bitmap *bitmap)
 {
 	struct file *file;
 
-	spin_lock_irq(&bitmap->lock);
 	file = bitmap->storage.file;
 	bitmap->storage.file = NULL;
-	spin_unlock_irq(&bitmap->lock);
 
 	if (file)
 		wait_event(bitmap->write_wait,
@@ -809,10 +805,6 @@ static void bitmap_file_kick(struct bitmap *bitmap)
 			       "%s: disabling internal bitmap due to errors\n",
 			       bmname(bitmap));
 	}
-
-	bitmap_file_put(bitmap);
-
-	return;
 }
 
 enum bitmap_page_attr {
@@ -903,7 +895,8 @@ void bitmap_unplug(struct bitmap *bitmap)
 	int dirty, need_write;
 	int wait = 0;
 
-	if (!bitmap || !bitmap->storage.filemap)
+	if (!bitmap || !bitmap->storage.filemap ||
+	    test_bit(BITMAP_STALE, &bitmap->flags))
 		return;
 
 	/* look at each page to see if there are any set bits that need to be
@@ -1222,7 +1215,10 @@ void bitmap_daemon_work(struct mddev *mddev)
 	 * the first blocking holds the superblock and it has been updated.
 	 * We mustn't write any other blocks before the superblock.
 	 */
-	for (j = 0; j < bitmap->storage.file_pages; j++) {
+	for (j = 0;
+	     j < bitmap->storage.file_pages
+		     && !test_bit(BITMAP_STALE, &bitmap->flags);
+	     j++) {
 
 		if (test_page_attr(bitmap, j,
 				   BITMAP_PAGE_DIRTY))
@@ -1235,8 +1231,6 @@ void bitmap_daemon_work(struct mddev *mddev)
 			spin_unlock_irq(&bitmap->lock);
 			write_page(bitmap, bitmap->storage.filemap[j], 0);
 			spin_lock_irq(&bitmap->lock);
-			if (!bitmap->storage.filemap)
-				break;
 		}
 	}
 	spin_unlock_irq(&bitmap->lock);

From fae7d326cd0905624c173337fdfcfe9b86b9e6e2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:21 +1000
Subject: [PATCH 39/56] md/bitmap: merge bitmap_file_unmap and bitmap_file_put.

There functions really do one thing together: release the
'bitmap_storage'.  So make them just one function.

Since we removed the locking (previous patch), we don't need to zero
any fields before freeing them, so it all becomes a bit simpler.


Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 34 ++++++++++------------------------
 1 file changed, 10 insertions(+), 24 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 7e015c9ea73..d2ab7d3df3e 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -732,43 +732,25 @@ static int bitmap_storage_alloc(struct bitmap_storage *store,
 	return 0;
 }
 
-static void bitmap_file_unmap(struct bitmap *bitmap)
+static void bitmap_file_unmap(struct bitmap_storage *store)
 {
 	struct page **map, *sb_page;
-	unsigned long *attr;
 	int pages;
-	struct bitmap_storage *store = &bitmap->storage;
+	struct file *file;
 
+	file = store->file;
 	map = store->filemap;
-	store->filemap = NULL;
-	attr = store->filemap_attr;
-	store->filemap_attr = NULL;
 	pages = store->file_pages;
-	store->file_pages = 0;
 	sb_page = store->sb_page;
-	store->sb_page = NULL;
 
 	while (pages--)
 		if (map[pages] != sb_page) /* 0 is sb_page, release it below */
 			free_buffers(map[pages]);
 	kfree(map);
-	kfree(attr);
+	kfree(store->filemap_attr);
 
 	if (sb_page)
 		free_buffers(sb_page);
-}
-
-static void bitmap_file_put(struct bitmap *bitmap)
-{
-	struct file *file;
-
-	file = bitmap->storage.file;
-	bitmap->storage.file = NULL;
-
-	if (file)
-		wait_event(bitmap->write_wait,
-			   atomic_read(&bitmap->pending_writes)==0);
-	bitmap_file_unmap(bitmap);
 
 	if (file) {
 		struct inode *inode = file->f_path.dentry->d_inode;
@@ -1610,8 +1592,12 @@ static void bitmap_free(struct bitmap *bitmap)
 	if (!bitmap) /* there was no bitmap */
 		return;
 
-	/* release the bitmap file and kill the daemon */
-	bitmap_file_put(bitmap);
+	/* Shouldn't be needed - but just in case.... */
+	wait_event(bitmap->write_wait,
+		   atomic_read(&bitmap->pending_writes) == 0);
+
+	/* release the bitmap file  */
+	bitmap_file_unmap(&bitmap->storage);
 
 	bp = bitmap->bp;
 	pages = bitmap->pages;

From bdfd1140731a67c74fd74814f222c8078171adcd Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:22 +1000
Subject: [PATCH 40/56] md/bitmap: make _page_attr bitops atomic.

Using e.g. set_bit instead of __set_bit and using test_and_clear_bit
allow us to remove some locking and contract other locked ranges.

It is rare that we set or clear a lot of these bits, so gain should
outweigh any cost.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 55 +++++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 32 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index d2ab7d3df3e..2e2f53cd5b2 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -799,21 +799,27 @@ enum bitmap_page_attr {
 static inline void set_page_attr(struct bitmap *bitmap, int pnum,
 				 enum bitmap_page_attr attr)
 {
-	__set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
+	set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
 }
 
 static inline void clear_page_attr(struct bitmap *bitmap, int pnum,
 				   enum bitmap_page_attr attr)
 {
-	__clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
+	clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
 }
 
-static inline unsigned long test_page_attr(struct bitmap *bitmap, int pnum,
-					   enum bitmap_page_attr attr)
+static inline int test_page_attr(struct bitmap *bitmap, int pnum,
+				 enum bitmap_page_attr attr)
 {
 	return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
 }
 
+static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum,
+					   enum bitmap_page_attr attr)
+{
+	return test_and_clear_bit((pnum<<2) + attr,
+				  bitmap->storage.filemap_attr);
+}
 /*
  * bitmap_file_set_bit -- called before performing a write to the md device
  * to set (and eventually sync) a particular bit in the bitmap file
@@ -884,23 +890,17 @@ void bitmap_unplug(struct bitmap *bitmap)
 	/* look at each page to see if there are any set bits that need to be
 	 * flushed out to disk */
 	for (i = 0; i < bitmap->storage.file_pages; i++) {
-		spin_lock_irq(&bitmap->lock);
-		if (!bitmap->storage.filemap) {
-			spin_unlock_irq(&bitmap->lock);
+		if (!bitmap->storage.filemap)
 			return;
-		}
-		dirty = test_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
-		need_write = test_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
-		clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
-		clear_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
-		if (dirty || need_write)
+		dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
+		need_write = test_and_clear_page_attr(bitmap, i,
+						      BITMAP_PAGE_NEEDWRITE);
+		if (dirty || need_write) {
 			clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
+			write_page(bitmap, bitmap->storage.filemap[i], 0);
+		}
 		if (dirty)
 			wait = 1;
-		spin_unlock_irq(&bitmap->lock);
-
-		if (dirty || need_write)
-			write_page(bitmap, bitmap->storage.filemap[i], 0);
 	}
 	if (wait) { /* if any writes were performed, we need to wait on them */
 		if (bitmap->storage.file)
@@ -1062,12 +1062,10 @@ void bitmap_write_all(struct bitmap *bitmap)
 		/* Only one copy, so nothing needed */
 		return;
 
-	spin_lock_irq(&bitmap->lock);
 	for (i = 0; i < bitmap->storage.file_pages; i++)
 		set_page_attr(bitmap, i,
 			      BITMAP_PAGE_NEEDWRITE);
 	bitmap->allclean = 0;
-	spin_unlock_irq(&bitmap->lock);
 }
 
 static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
@@ -1128,15 +1126,11 @@ void bitmap_daemon_work(struct mddev *mddev)
 	 * So set NEEDWRITE now, then after we make any last-minute changes
 	 * we will write it.
 	 */
-	spin_lock_irq(&bitmap->lock);
 	for (j = 0; j < bitmap->storage.file_pages; j++)
-		if (test_page_attr(bitmap, j,
-				   BITMAP_PAGE_PENDING)) {
+		if (test_and_clear_page_attr(bitmap, j,
+					     BITMAP_PAGE_PENDING))
 			set_page_attr(bitmap, j,
 				      BITMAP_PAGE_NEEDWRITE);
-			clear_page_attr(bitmap, j,
-					BITMAP_PAGE_PENDING);
-		}
 
 	if (bitmap->need_sync &&
 	    mddev->bitmap_info.external == 0) {
@@ -1156,6 +1150,7 @@ void bitmap_daemon_work(struct mddev *mddev)
 	/* Now look at the bitmap counters and if any are '2' or '1',
 	 * decrement and handle accordingly.
 	 */
+	spin_lock_irq(&bitmap->lock);
 	nextpage = 0;
 	for (j = 0; j < bitmap->chunks; j++) {
 		bitmap_counter_t *bmc;
@@ -1188,6 +1183,7 @@ void bitmap_daemon_work(struct mddev *mddev)
 			bitmap->allclean = 0;
 		}
 	}
+	spin_unlock_irq(&bitmap->lock);
 
 	/* Now start writeout on any page in NEEDWRITE that isn't DIRTY.
 	 * DIRTY pages need to be written by bitmap_unplug so it can wait
@@ -1206,16 +1202,11 @@ void bitmap_daemon_work(struct mddev *mddev)
 				   BITMAP_PAGE_DIRTY))
 			/* bitmap_unplug will handle the rest */
 			break;
-		if (test_page_attr(bitmap, j,
-				   BITMAP_PAGE_NEEDWRITE)) {
-			clear_page_attr(bitmap, j,
-					BITMAP_PAGE_NEEDWRITE);
-			spin_unlock_irq(&bitmap->lock);
+		if (test_and_clear_page_attr(bitmap, j,
+					     BITMAP_PAGE_NEEDWRITE)) {
 			write_page(bitmap, bitmap->storage.filemap[j], 0);
-			spin_lock_irq(&bitmap->lock);
 		}
 	}
-	spin_unlock_irq(&bitmap->lock);
 
  done:
 	if (bitmap->allclean == 0)

From 63c68268b2fc63f7f612dfebecb5c9cbbd3ebeaf Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:23 +1000
Subject: [PATCH 41/56] md/bitmap: make bitmap bitops atomic.

This allows us to remove spinlock protection which is
more heavy-weight than simple atomics.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 2e2f53cd5b2..7c3c1c58ff1 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -844,7 +844,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
 		set_bit(bit, kaddr);
 	else
-		__set_bit_le(bit, kaddr);
+		test_and_set_bit_le(bit, kaddr);
 	kunmap_atomic(kaddr);
 	pr_debug("set file bit %lu page %lu\n", bit, page->index);
 	/* record page number so it gets flushed to disk when unplug occurs */
@@ -866,7 +866,7 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
 	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
 		clear_bit(bit, paddr);
 	else
-		__clear_bit_le(bit, paddr);
+		test_and_clear_bit_le(bit, paddr);
 	kunmap_atomic(paddr);
 	if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) {
 		set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING);
@@ -1536,9 +1536,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
 	for (chunk = s; chunk <= e; chunk++) {
 		sector_t sec = (sector_t)chunk << bitmap->chunkshift;
 		bitmap_set_memory_bits(bitmap, sec, 1);
-		spin_lock_irq(&bitmap->lock);
 		bitmap_file_set_bit(bitmap, sec);
-		spin_unlock_irq(&bitmap->lock);
 		if (sec < bitmap->mddev->recovery_cp)
 			/* We are asserting that the array is dirty,
 			 * so move the recovery_cp address back so

From 40cffcc0e8f9f6e295630cb8b8d58a13baa6c7f9 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:24 +1000
Subject: [PATCH 42/56] md/bitmap: create a 'struct bitmap_counts' substructure
 of 'struct bitmap'

The new "struct bitmap_counts" contains all the fields that are
related to counting the number of active writes in each bitmap chunk.

Having this separate will make it easier to change the chunksize
or overall size of a bitmap atomically.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 138 +++++++++++++++++++++++---------------------
 drivers/md/bitmap.h |  23 ++++----
 2 files changed, 84 insertions(+), 77 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 7c3c1c58ff1..c100a47bc15 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -45,7 +45,7 @@ static inline char *bmname(struct bitmap *bitmap)
  * if we find our page, we increment the page's refcount so that it stays
  * allocated while we're using it
  */
-static int bitmap_checkpage(struct bitmap *bitmap,
+static int bitmap_checkpage(struct bitmap_counts *bitmap,
 			    unsigned long page, int create)
 __releases(bitmap->lock)
 __acquires(bitmap->lock)
@@ -76,8 +76,7 @@ __acquires(bitmap->lock)
 	spin_lock_irq(&bitmap->lock);
 
 	if (mappage == NULL) {
-		pr_debug("%s: bitmap map page allocation failed, hijacking\n",
-			 bmname(bitmap));
+		pr_debug("md/bitmap: map page allocation failed, hijacking\n");
 		/* failed - set the hijacked flag so that we can use the
 		 * pointer as a counter */
 		if (!bitmap->bp[page].map)
@@ -100,7 +99,7 @@ __acquires(bitmap->lock)
 /* if page is completely empty, put it back on the free list, or dealloc it */
 /* if page was hijacked, unmark the flag so it might get alloced next time */
 /* Note: lock should be held when calling this */
-static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
+static void bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page)
 {
 	char *ptr;
 
@@ -832,7 +831,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 	unsigned long bit;
 	struct page *page;
 	void *kaddr;
-	unsigned long chunk = block >> bitmap->chunkshift;
+	unsigned long chunk = block >> bitmap->counts.chunkshift;
 
 	page = filemap_get_page(&bitmap->storage, chunk);
 	if (!page)
@@ -856,7 +855,7 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
 	unsigned long bit;
 	struct page *page;
 	void *paddr;
-	unsigned long chunk = block >> bitmap->chunkshift;
+	unsigned long chunk = block >> bitmap->counts.chunkshift;
 
 	page = filemap_get_page(&bitmap->storage, chunk);
 	if (!page)
@@ -938,7 +937,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 	void *paddr;
 	struct bitmap_storage *store = &bitmap->storage;
 
-	chunks = bitmap->chunks;
+	chunks = bitmap->counts.chunks;
 	file = store->file;
 
 	if (!file && !bitmap->mddev->bitmap_info.offset) {
@@ -947,10 +946,10 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		store->file_pages = 0;
 		for (i = 0; i < chunks ; i++) {
 			/* if the disk bit is set, set the memory bit */
-			int needed = ((sector_t)(i+1) << (bitmap->chunkshift)
+			int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift)
 				      >= start);
 			bitmap_set_memory_bits(bitmap,
-					       (sector_t)i << bitmap->chunkshift,
+					       (sector_t)i << bitmap->counts.chunkshift,
 					       needed);
 		}
 		return 0;
@@ -1026,10 +1025,10 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		kunmap_atomic(paddr);
 		if (b) {
 			/* if the disk bit is set, set the memory bit */
-			int needed = ((sector_t)(i+1) << bitmap->chunkshift
+			int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift
 				      >= start);
 			bitmap_set_memory_bits(bitmap,
-					       (sector_t)i << bitmap->chunkshift,
+					       (sector_t)i << bitmap->counts.chunkshift,
 					       needed);
 			bit_cnt++;
 		}
@@ -1068,7 +1067,8 @@ void bitmap_write_all(struct bitmap *bitmap)
 	bitmap->allclean = 0;
 }
 
-static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
+static void bitmap_count_page(struct bitmap_counts *bitmap,
+			      sector_t offset, int inc)
 {
 	sector_t chunk = offset >> bitmap->chunkshift;
 	unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
@@ -1076,7 +1076,7 @@ static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
 	bitmap_checkfree(bitmap, page);
 }
 
-static void bitmap_set_pending(struct bitmap *bitmap, sector_t offset)
+static void bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset)
 {
 	sector_t chunk = offset >> bitmap->chunkshift;
 	unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
@@ -1086,7 +1086,7 @@ static void bitmap_set_pending(struct bitmap *bitmap, sector_t offset)
 		bp->pending = 1;
 }
 
-static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
+static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap,
 					    sector_t offset, sector_t *blocks,
 					    int create);
 
@@ -1101,6 +1101,7 @@ void bitmap_daemon_work(struct mddev *mddev)
 	unsigned long j;
 	unsigned long nextpage;
 	sector_t blocks;
+	struct bitmap_counts *counts;
 
 	/* Use a mutex to guard daemon_work against
 	 * bitmap_destroy.
@@ -1150,21 +1151,22 @@ void bitmap_daemon_work(struct mddev *mddev)
 	/* Now look at the bitmap counters and if any are '2' or '1',
 	 * decrement and handle accordingly.
 	 */
-	spin_lock_irq(&bitmap->lock);
+	counts = &bitmap->counts;
+	spin_lock_irq(&counts->lock);
 	nextpage = 0;
-	for (j = 0; j < bitmap->chunks; j++) {
+	for (j = 0; j < counts->chunks; j++) {
 		bitmap_counter_t *bmc;
-		sector_t  block = (sector_t)j << bitmap->chunkshift;
+		sector_t  block = (sector_t)j << counts->chunkshift;
 
 		if (j == nextpage) {
 			nextpage += PAGE_COUNTER_RATIO;
-			if (!bitmap->bp[j >> PAGE_COUNTER_SHIFT].pending) {
+			if (!counts->bp[j >> PAGE_COUNTER_SHIFT].pending) {
 				j |= PAGE_COUNTER_MASK;
 				continue;
 			}
-			bitmap->bp[j >> PAGE_COUNTER_SHIFT].pending = 0;
+			counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0;
 		}
-		bmc = bitmap_get_counter(bitmap,
+		bmc = bitmap_get_counter(counts,
 					 block,
 					 &blocks, 0);
 
@@ -1175,15 +1177,15 @@ void bitmap_daemon_work(struct mddev *mddev)
 		if (*bmc == 1 && !bitmap->need_sync) {
 			/* We can clear the bit */
 			*bmc = 0;
-			bitmap_count_page(bitmap, block, -1);
+			bitmap_count_page(counts, block, -1);
 			bitmap_file_clear_bit(bitmap, block);
 		} else if (*bmc && *bmc <= 2) {
 			*bmc = 1;
-			bitmap_set_pending(bitmap, block);
+			bitmap_set_pending(counts, block);
 			bitmap->allclean = 0;
 		}
 	}
-	spin_unlock_irq(&bitmap->lock);
+	spin_unlock_irq(&counts->lock);
 
 	/* Now start writeout on any page in NEEDWRITE that isn't DIRTY.
 	 * DIRTY pages need to be written by bitmap_unplug so it can wait
@@ -1215,7 +1217,7 @@ void bitmap_daemon_work(struct mddev *mddev)
 	mutex_unlock(&mddev->bitmap_info.mutex);
 }
 
-static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
+static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap,
 					    sector_t offset, sector_t *blocks,
 					    int create)
 __releases(bitmap->lock)
@@ -1277,10 +1279,10 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
 		sector_t blocks;
 		bitmap_counter_t *bmc;
 
-		spin_lock_irq(&bitmap->lock);
-		bmc = bitmap_get_counter(bitmap, offset, &blocks, 1);
+		spin_lock_irq(&bitmap->counts.lock);
+		bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 1);
 		if (!bmc) {
-			spin_unlock_irq(&bitmap->lock);
+			spin_unlock_irq(&bitmap->counts.lock);
 			return 0;
 		}
 
@@ -1292,7 +1294,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
 			 */
 			prepare_to_wait(&bitmap->overflow_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
-			spin_unlock_irq(&bitmap->lock);
+			spin_unlock_irq(&bitmap->counts.lock);
 			io_schedule();
 			finish_wait(&bitmap->overflow_wait, &__wait);
 			continue;
@@ -1301,7 +1303,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
 		switch (*bmc) {
 		case 0:
 			bitmap_file_set_bit(bitmap, offset);
-			bitmap_count_page(bitmap, offset, 1);
+			bitmap_count_page(&bitmap->counts, offset, 1);
 			/* fall through */
 		case 1:
 			*bmc = 2;
@@ -1309,7 +1311,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
 
 		(*bmc)++;
 
-		spin_unlock_irq(&bitmap->lock);
+		spin_unlock_irq(&bitmap->counts.lock);
 
 		offset += blocks;
 		if (sectors > blocks)
@@ -1339,10 +1341,10 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 		unsigned long flags;
 		bitmap_counter_t *bmc;
 
-		spin_lock_irqsave(&bitmap->lock, flags);
-		bmc = bitmap_get_counter(bitmap, offset, &blocks, 0);
+		spin_lock_irqsave(&bitmap->counts.lock, flags);
+		bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 0);
 		if (!bmc) {
-			spin_unlock_irqrestore(&bitmap->lock, flags);
+			spin_unlock_irqrestore(&bitmap->counts.lock, flags);
 			return;
 		}
 
@@ -1361,10 +1363,10 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 
 		(*bmc)--;
 		if (*bmc <= 2) {
-			bitmap_set_pending(bitmap, offset);
+			bitmap_set_pending(&bitmap->counts, offset);
 			bitmap->allclean = 0;
 		}
-		spin_unlock_irqrestore(&bitmap->lock, flags);
+		spin_unlock_irqrestore(&bitmap->counts.lock, flags);
 		offset += blocks;
 		if (sectors > blocks)
 			sectors -= blocks;
@@ -1383,8 +1385,8 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t
 		*blocks = 1024;
 		return 1; /* always resync if no bitmap */
 	}
-	spin_lock_irq(&bitmap->lock);
-	bmc = bitmap_get_counter(bitmap, offset, blocks, 0);
+	spin_lock_irq(&bitmap->counts.lock);
+	bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
 	rv = 0;
 	if (bmc) {
 		/* locked */
@@ -1398,7 +1400,7 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t
 			}
 		}
 	}
-	spin_unlock_irq(&bitmap->lock);
+	spin_unlock_irq(&bitmap->counts.lock);
 	return rv;
 }
 
@@ -1435,8 +1437,8 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i
 		*blocks = 1024;
 		return;
 	}
-	spin_lock_irqsave(&bitmap->lock, flags);
-	bmc = bitmap_get_counter(bitmap, offset, blocks, 0);
+	spin_lock_irqsave(&bitmap->counts.lock, flags);
+	bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
 	if (bmc == NULL)
 		goto unlock;
 	/* locked */
@@ -1447,13 +1449,13 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i
 			*bmc |= NEEDED_MASK;
 		else {
 			if (*bmc <= 2) {
-				bitmap_set_pending(bitmap, offset);
+				bitmap_set_pending(&bitmap->counts, offset);
 				bitmap->allclean = 0;
 			}
 		}
 	}
  unlock:
-	spin_unlock_irqrestore(&bitmap->lock, flags);
+	spin_unlock_irqrestore(&bitmap->counts.lock, flags);
 }
 EXPORT_SYMBOL(bitmap_end_sync);
 
@@ -1493,7 +1495,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
 
 	bitmap->mddev->curr_resync_completed = sector;
 	set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
-	sector &= ~((1ULL << bitmap->chunkshift) - 1);
+	sector &= ~((1ULL << bitmap->counts.chunkshift) - 1);
 	s = 0;
 	while (s < sector && s < bitmap->mddev->resync_max_sectors) {
 		bitmap_end_sync(bitmap, s, &blocks, 0);
@@ -1513,19 +1515,19 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
 
 	sector_t secs;
 	bitmap_counter_t *bmc;
-	spin_lock_irq(&bitmap->lock);
-	bmc = bitmap_get_counter(bitmap, offset, &secs, 1);
+	spin_lock_irq(&bitmap->counts.lock);
+	bmc = bitmap_get_counter(&bitmap->counts, offset, &secs, 1);
 	if (!bmc) {
-		spin_unlock_irq(&bitmap->lock);
+		spin_unlock_irq(&bitmap->counts.lock);
 		return;
 	}
 	if (!*bmc) {
 		*bmc = 2 | (needed ? NEEDED_MASK : 0);
-		bitmap_count_page(bitmap, offset, 1);
-		bitmap_set_pending(bitmap, offset);
+		bitmap_count_page(&bitmap->counts, offset, 1);
+		bitmap_set_pending(&bitmap->counts, offset);
 		bitmap->allclean = 0;
 	}
-	spin_unlock_irq(&bitmap->lock);
+	spin_unlock_irq(&bitmap->counts.lock);
 }
 
 /* dirty the memory and file bits for bitmap chunks "s" to "e" */
@@ -1534,7 +1536,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
 	unsigned long chunk;
 
 	for (chunk = s; chunk <= e; chunk++) {
-		sector_t sec = (sector_t)chunk << bitmap->chunkshift;
+		sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift;
 		bitmap_set_memory_bits(bitmap, sec, 1);
 		bitmap_file_set_bit(bitmap, sec);
 		if (sec < bitmap->mddev->recovery_cp)
@@ -1588,8 +1590,8 @@ static void bitmap_free(struct bitmap *bitmap)
 	/* release the bitmap file  */
 	bitmap_file_unmap(&bitmap->storage);
 
-	bp = bitmap->bp;
-	pages = bitmap->pages;
+	bp = bitmap->counts.bp;
+	pages = bitmap->counts.pages;
 
 	/* free all allocated memory */
 
@@ -1642,7 +1644,7 @@ int bitmap_create(struct mddev *mddev)
 	if (!bitmap)
 		return -ENOMEM;
 
-	spin_lock_init(&bitmap->lock);
+	spin_lock_init(&bitmap->counts.lock);
 	atomic_set(&bitmap->pending_writes, 0);
 	init_waitqueue_head(&bitmap->write_wait);
 	init_waitqueue_head(&bitmap->overflow_wait);
@@ -1689,27 +1691,28 @@ int bitmap_create(struct mddev *mddev)
 		goto error;
 
 	bitmap->daemon_lastrun = jiffies;
-	bitmap->chunkshift = (ffz(~mddev->bitmap_info.chunksize)
+	bitmap->counts.chunkshift = (ffz(~mddev->bitmap_info.chunksize)
 			      - BITMAP_BLOCK_SHIFT);
 
-	chunks = (blocks + (1 << bitmap->chunkshift) - 1) >>
-			bitmap->chunkshift;
+	chunks = (blocks + (1 << bitmap->counts.chunkshift) - 1) >>
+			bitmap->counts.chunkshift;
 	pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
 
 	BUG_ON(!pages);
 
-	bitmap->chunks = chunks;
-	bitmap->pages = pages;
-	bitmap->missing_pages = pages;
+	bitmap->counts.chunks = chunks;
+	bitmap->counts.pages = pages;
+	bitmap->counts.missing_pages = pages;
 
-	bitmap->bp = kzalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL);
+	bitmap->counts.bp = kzalloc(pages * sizeof(*bitmap->counts.bp),
+				    GFP_KERNEL);
 
 	err = -ENOMEM;
-	if (!bitmap->bp)
+	if (!bitmap->counts.bp)
 		goto error;
 
 	if (file || mddev->bitmap_info.offset) {
-		err = bitmap_storage_alloc(&bitmap->storage, bitmap->chunks,
+		err = bitmap_storage_alloc(&bitmap->storage, bitmap->counts.chunks,
 					   !mddev->bitmap_info.external);
 		if (err)
 			goto error;
@@ -1781,17 +1784,19 @@ EXPORT_SYMBOL_GPL(bitmap_load);
 void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
 {
 	unsigned long chunk_kb;
+	struct bitmap_counts *counts;
 
 	if (!bitmap)
 		return;
 
-	spin_lock_irq(&bitmap->lock);
+	counts = &bitmap->counts;
+
 	chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10;
 	seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
 		   "%lu%s chunk",
-		   bitmap->pages - bitmap->missing_pages,
-		   bitmap->pages,
-		   (bitmap->pages - bitmap->missing_pages)
+		   counts->pages - counts->missing_pages,
+		   counts->pages,
+		   (counts->pages - counts->missing_pages)
 		   << (PAGE_SHIFT - 10),
 		   chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize,
 		   chunk_kb ? "KB" : "B");
@@ -1801,7 +1806,6 @@ void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
 	}
 
 	seq_printf(seq, "\n");
-	spin_unlock_irq(&bitmap->lock);
 }
 
 static ssize_t
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index e2d999a62ba..6bde180e987 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -172,22 +172,25 @@ struct bitmap_page {
 
 /* the main bitmap structure - one per mddev */
 struct bitmap {
-	struct bitmap_page *bp;
-	unsigned long pages; /* total number of pages in the bitmap */
-	unsigned long missing_pages; /* number of pages not yet allocated */
+
+	struct bitmap_counts {
+		spinlock_t lock;
+		struct bitmap_page *bp;
+		unsigned long pages;		/* total number of pages
+						 * in the bitmap */
+		unsigned long missing_pages;	/* number of pages
+						 * not yet allocated */
+		unsigned long chunkshift;	/* chunksize = 2^chunkshift
+						 * (for bitops) */
+		unsigned long chunks;		/* Total number of data
+						 * chunks for the array */
+	} counts;
 
 	struct mddev *mddev; /* the md device that the bitmap is for */
 
-	/* bitmap chunksize -- how much data does each bit represent? */
-	unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */
-	unsigned long chunks; /* total number of data chunks for the array */
-
 	__u64	events_cleared;
 	int need_sync;
 
-	/* bitmap spinlock */
-	spinlock_t lock;
-
 	struct bitmap_storage {
 		struct file *file;		/* backing disk file */
 		struct page *sb_page;		/* cached copy of the bitmap

From 15702d7fb6e7a6baf5a04286a227b0ad2fe4a03f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:25 +1000
Subject: [PATCH 43/56] md/bitmap: use DIV_ROUND_UP instead of open-code

Also take the opportunity to simplify CHUNK_BLOCK_RATIO.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index c100a47bc15..bc552bbad83 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1694,9 +1694,8 @@ int bitmap_create(struct mddev *mddev)
 	bitmap->counts.chunkshift = (ffz(~mddev->bitmap_info.chunksize)
 			      - BITMAP_BLOCK_SHIFT);
 
-	chunks = (blocks + (1 << bitmap->counts.chunkshift) - 1) >>
-			bitmap->counts.chunkshift;
-	pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
+	chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << bitmap->counts.chunkshift);
+	pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO);
 
 	BUG_ON(!pages);
 

From d60b479d177a5735b6b4db6ee5280ef6653f50e7 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:25 +1000
Subject: [PATCH 44/56] md/bitmap: add bitmap_resize function to allow bitmap
 resizing.

This function will allocate the new data structures and copy
bits across from old to new, allowing for the possibility that the
chunksize has changed.

Use the same function for performing the initial allocation
of the structures.  This improves test coverage.

When bitmap_resize is used to resize an existing bitmap, it
only copies '1' bits in, not '0' bits.
So when allocating the bitmap, ensure everything is initialised
to ZERO.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 199 +++++++++++++++++++++++++++++++++++++-------
 drivers/md/bitmap.h |   3 +
 2 files changed, 172 insertions(+), 30 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index bc552bbad83..a35561f8f57 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -698,7 +698,7 @@ static int bitmap_storage_alloc(struct bitmap_storage *store,
 		return -ENOMEM;
 
 	if (with_super && !store->sb_page) {
-		store->sb_page = alloc_page(GFP_KERNEL);
+		store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
 		if (store->sb_page == NULL)
 			return -ENOMEM;
 		store->sb_page->index = 0;
@@ -709,7 +709,7 @@ static int bitmap_storage_alloc(struct bitmap_storage *store,
 		pnum = 1;
 	}
 	for ( ; pnum < num_pages; pnum++) {
-		store->filemap[pnum] = alloc_page(GFP_KERNEL);
+		store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO);
 		if (!store->filemap[pnum]) {
 			store->file_pages = pnum;
 			return -ENOMEM;
@@ -1630,8 +1630,6 @@ int bitmap_create(struct mddev *mddev)
 {
 	struct bitmap *bitmap;
 	sector_t blocks = mddev->resync_max_sectors;
-	unsigned long chunks;
-	unsigned long pages;
 	struct file *file = mddev->bitmap_info.file;
 	int err;
 	struct sysfs_dirent *bm = NULL;
@@ -1691,37 +1689,14 @@ int bitmap_create(struct mddev *mddev)
 		goto error;
 
 	bitmap->daemon_lastrun = jiffies;
-	bitmap->counts.chunkshift = (ffz(~mddev->bitmap_info.chunksize)
-			      - BITMAP_BLOCK_SHIFT);
-
-	chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << bitmap->counts.chunkshift);
-	pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO);
-
-	BUG_ON(!pages);
-
-	bitmap->counts.chunks = chunks;
-	bitmap->counts.pages = pages;
-	bitmap->counts.missing_pages = pages;
-
-	bitmap->counts.bp = kzalloc(pages * sizeof(*bitmap->counts.bp),
-				    GFP_KERNEL);
-
-	err = -ENOMEM;
-	if (!bitmap->counts.bp)
+	err = bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1);
+	if (err)
 		goto error;
 
-	if (file || mddev->bitmap_info.offset) {
-		err = bitmap_storage_alloc(&bitmap->storage, bitmap->counts.chunks,
-					   !mddev->bitmap_info.external);
-		if (err)
-			goto error;
-	}
 	printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
-		pages, bmname(bitmap));
+	       bitmap->counts.pages, bmname(bitmap));
 
 	mddev->bitmap = bitmap;
-
-
 	return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
 
  error:
@@ -1807,6 +1782,170 @@ void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
 	seq_printf(seq, "\n");
 }
 
+int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
+		  int chunksize, int init)
+{
+	/* If chunk_size is 0, choose an appropriate chunk size.
+	 * Then possibly allocate new storage space.
+	 * Then quiesce, copy bits, replace bitmap, and re-start
+	 *
+	 * This function is called both to set up the initial bitmap
+	 * and to resize the bitmap while the array is active.
+	 * If this happens as a result of the array being resized,
+	 * chunksize will be zero, and we need to choose a suitable
+	 * chunksize, otherwise we use what we are given.
+	 */
+	struct bitmap_storage store;
+	struct bitmap_counts old_counts;
+	unsigned long chunks;
+	sector_t block;
+	sector_t old_blocks, new_blocks;
+	int chunkshift;
+	int ret = 0;
+	long pages;
+	struct bitmap_page *new_bp;
+
+	if (chunksize == 0) {
+		/* If there is enough space, leave the chunk size unchanged,
+		 * else increase by factor of two until there is enough space.
+		 */
+		long bytes;
+		long space = bitmap->mddev->bitmap_info.space;
+
+		if (space == 0) {
+			/* We don't know how much space there is, so limit
+			 * to current size - in sectors.
+			 */
+			bytes = DIV_ROUND_UP(bitmap->counts.chunks, 8);
+			if (!bitmap->mddev->bitmap_info.external)
+				bytes += sizeof(bitmap_super_t);
+			space = DIV_ROUND_UP(bytes, 512);
+			bitmap->mddev->bitmap_info.space = space;
+		}
+		chunkshift = bitmap->counts.chunkshift;
+		chunkshift--;
+		do {
+			/* 'chunkshift' is shift from block size to chunk size */
+			chunkshift++;
+			chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);
+			bytes = DIV_ROUND_UP(chunks, 8);
+			if (!bitmap->mddev->bitmap_info.external)
+				bytes += sizeof(bitmap_super_t);
+		} while (bytes > (space << 9));
+	} else
+		chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT;
+
+	chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);
+	memset(&store, 0, sizeof(store));
+	if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file)
+		ret = bitmap_storage_alloc(&store, chunks,
+					   !bitmap->mddev->bitmap_info.external);
+	if (ret)
+		goto err;
+
+	pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO);
+
+	new_bp = kzalloc(pages * sizeof(*new_bp), GFP_KERNEL);
+	ret = -ENOMEM;
+	if (!new_bp) {
+		bitmap_file_unmap(&store);
+		goto err;
+	}
+
+	if (!init)
+		bitmap->mddev->pers->quiesce(bitmap->mddev, 1);
+
+	store.file = bitmap->storage.file;
+	bitmap->storage.file = NULL;
+
+	if (store.sb_page && bitmap->storage.sb_page)
+		memcpy(page_address(store.sb_page),
+		       page_address(bitmap->storage.sb_page),
+		       sizeof(bitmap_super_t));
+	bitmap_file_unmap(&bitmap->storage);
+	bitmap->storage = store;
+
+	old_counts = bitmap->counts;
+	bitmap->counts.bp = new_bp;
+	bitmap->counts.pages = pages;
+	bitmap->counts.missing_pages = pages;
+	bitmap->counts.chunkshift = chunkshift;
+	bitmap->counts.chunks = chunks;
+	bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift +
+						     BITMAP_BLOCK_SHIFT);
+
+	blocks = min(old_counts.chunks << old_counts.chunkshift,
+		     chunks << chunkshift);
+
+	spin_lock_irq(&bitmap->counts.lock);
+	for (block = 0; block < blocks; ) {
+		bitmap_counter_t *bmc_old, *bmc_new;
+		int set;
+
+		bmc_old = bitmap_get_counter(&old_counts, block,
+					     &old_blocks, 0);
+		set = bmc_old && NEEDED(*bmc_old);
+
+		if (set) {
+			bmc_new = bitmap_get_counter(&bitmap->counts, block,
+						     &new_blocks, 1);
+			if (*bmc_new == 0) {
+				/* need to set on-disk bits too. */
+				sector_t end = block + new_blocks;
+				sector_t start = block >> chunkshift;
+				start <<= chunkshift;
+				while (start < end) {
+					bitmap_file_set_bit(bitmap, block);
+					start += 1 << chunkshift;
+				}
+				*bmc_new = 2;
+				bitmap_count_page(&bitmap->counts,
+						  block, 1);
+				bitmap_set_pending(&bitmap->counts,
+						   block);
+			}
+			*bmc_new |= NEEDED_MASK;
+			if (new_blocks < old_blocks)
+				old_blocks = new_blocks;
+		}
+		block += old_blocks;
+	}
+
+	if (!init) {
+		int i;
+		while (block < (chunks << chunkshift)) {
+			bitmap_counter_t *bmc;
+			bmc = bitmap_get_counter(&bitmap->counts, block,
+						 &new_blocks, 1);
+			if (bmc) {
+				/* new space.  It needs to be resynced, so
+				 * we set NEEDED_MASK.
+				 */
+				if (*bmc == 0) {
+					*bmc = NEEDED_MASK | 2;
+					bitmap_count_page(&bitmap->counts,
+							  block, 1);
+					bitmap_set_pending(&bitmap->counts,
+							   block);
+				}
+			}
+			block += new_blocks;
+		}
+		for (i = 0; i < bitmap->storage.file_pages; i++)
+			set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
+	}
+	spin_unlock_irq(&bitmap->counts.lock);
+
+	if (!init) {
+		bitmap_unplug(bitmap);
+		bitmap->mddev->pers->quiesce(bitmap->mddev, 0);
+	}
+	ret = 0;
+err:
+	return ret;
+}
+EXPORT_SYMBOL_GPL(bitmap_resize);
+
 static ssize_t
 location_show(struct mddev *mddev, char *page)
 {
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 6bde180e987..04dcde3871b 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -255,6 +255,9 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
 
 void bitmap_unplug(struct bitmap *bitmap);
 void bitmap_daemon_work(struct mddev *mddev);
+
+int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
+		  int chunksize, int init);
 #endif
 
 #endif

From b81a040481233e5171cb3fbc62f44636d5ea18b2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:26 +1000
Subject: [PATCH 45/56] md/bitmap: make sure reshape request are reflected in
 superblock.

As a reshape may change the sync_size and/or chunk_size, we need
to update these whenever we write out the bitmap superblock.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index a35561f8f57..8f681da0027 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -420,6 +420,9 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	/* Just in case these have been changed via sysfs: */
 	sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
 	sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
+	/* This might have been changed by a reshape */
+	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
+	sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize);
 	kunmap_atomic(sb);
 	write_page(bitmap, bitmap->storage.sb_page, 1);
 }

From a4a6125a074e1b08ee8ae34f700c5bca19eb9d18 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:27 +1000
Subject: [PATCH 46/56] md: allow array to be resized while bitmap is present.

Now that bitmaps can be resized, we can allow an array to be resized
while the bitmap is present.

This only covers resizing that involves changing the effective size
of member devices, not resizing that changes the number of devices.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c     |  6 +-----
 drivers/md/raid1.c  | 11 +++++++++--
 drivers/md/raid10.c | 10 ++++++++--
 drivers/md/raid5.c  | 14 ++++++++++----
 4 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9e2336fbbd3..86adf4ac46c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6153,11 +6153,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
 	 */
 	if (mddev->sync_thread)
 		return -EBUSY;
-	if (mddev->bitmap)
-		/* Sorry, cannot grow a bitmap yet, just remove it,
-		 * grow, and re-add.
-		 */
-		return -EBUSY;
+
 	rdev_for_each(rdev, mddev) {
 		sector_t avail = rdev->sectors;
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 22cfc6660b1..8e717bd518e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2752,9 +2752,16 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
 	 * any io in the removed space completes, but it hardly seems
 	 * worth it.
 	 */
-	md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
-	if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
+	sector_t newsize = raid1_size(mddev, sectors, 0);
+	if (mddev->external_size &&
+	    mddev->array_sectors > newsize)
 		return -EINVAL;
+	if (mddev->bitmap) {
+		int ret = bitmap_resize(mddev->bitmap, newsize, 0, 0);
+		if (ret)
+			return ret;
+	}
+	md_set_array_sectors(mddev, newsize);
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	revalidate_disk(mddev->gendisk);
 	if (sectors > mddev->dev_sectors &&
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index fb9062b5022..8fe3aa46998 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3678,9 +3678,15 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
 
 	oldsize = raid10_size(mddev, 0, 0);
 	size = raid10_size(mddev, sectors, 0);
-	md_set_array_sectors(mddev, size);
-	if (mddev->array_sectors > size)
+	if (mddev->external_size &&
+	    mddev->array_sectors > size)
 		return -EINVAL;
+	if (mddev->bitmap) {
+		int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
+		if (ret)
+			return ret;
+	}
+	md_set_array_sectors(mddev, size);
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	revalidate_disk(mddev->gendisk);
 	if (sectors > mddev->dev_sectors &&
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7bfd59b313d..eab6168bb7f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5503,12 +5503,18 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
 	 * any io in the removed space completes, but it hardly seems
 	 * worth it.
 	 */
+	sector_t newsize;
 	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
-	md_set_array_sectors(mddev, raid5_size(mddev, sectors,
-					       mddev->raid_disks));
-	if (mddev->array_sectors >
-	    raid5_size(mddev, sectors, mddev->raid_disks))
+	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
+	if (mddev->external_size &&
+	    mddev->array_sectors > newsize)
 		return -EINVAL;
+	if (mddev->bitmap) {
+		int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0);
+		if (ret)
+			return ret;
+	}
+	md_set_array_sectors(mddev, newsize);
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	revalidate_disk(mddev->gendisk);
 	if (sectors > mddev->dev_sectors &&

From bb63a7019df91933de6854a87ddc5488b49edb85 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:28 +1000
Subject: [PATCH 47/56] md/raid10: resize bitmap when required during reshape.

If a reshape changes the size of the array, then we can now
update the bitmap to suit - so do so.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8fe3aa46998..ae2a5a4c6bc 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3786,8 +3786,6 @@ static int raid10_check_reshape(struct mddev *mddev)
 			/* not factor of array size */
 			return -EINVAL;
 
-	if (mddev->bitmap)
-		return -EBUSY;
 	if (!enough(conf, -1))
 		return -EINVAL;
 
@@ -3882,6 +3880,7 @@ static int raid10_start_reshape(struct mddev *mddev)
 	struct r10conf *conf = mddev->private;
 	struct md_rdev *rdev;
 	int spares = 0;
+	int ret;
 
 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		return -EBUSY;
@@ -3943,6 +3942,14 @@ static int raid10_start_reshape(struct mddev *mddev)
 		conf->reshape_progress = 0;
 	spin_unlock_irq(&conf->device_lock);
 
+	if (mddev->delta_disks && mddev->bitmap) {
+		ret = bitmap_resize(mddev->bitmap,
+				    raid10_size(mddev, 0,
+						conf->geo.raid_disks),
+				    0, 0);
+		if (ret)
+			goto abort;
+	}
 	if (mddev->delta_disks > 0) {
 		rdev_for_each(rdev, mddev)
 			if (rdev->raid_disk < 0 &&
@@ -3982,22 +3989,26 @@ static int raid10_start_reshape(struct mddev *mddev)
 	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
 						"reshape");
 	if (!mddev->sync_thread) {
-		mddev->recovery = 0;
-		spin_lock_irq(&conf->device_lock);
-		conf->geo = conf->prev;
-		mddev->raid_disks = conf->geo.raid_disks;
-		rdev_for_each(rdev, mddev)
-			rdev->new_data_offset = rdev->data_offset;
-		smp_wmb();
-		conf->reshape_progress = MaxSector;
-		mddev->reshape_position = MaxSector;
-		spin_unlock_irq(&conf->device_lock);
-		return -EAGAIN;
+		ret = -EAGAIN;
+		goto abort;
 	}
 	conf->reshape_checkpoint = jiffies;
 	md_wakeup_thread(mddev->sync_thread);
 	md_new_event(mddev);
 	return 0;
+
+abort:
+	mddev->recovery = 0;
+	spin_lock_irq(&conf->device_lock);
+	conf->geo = conf->prev;
+	mddev->raid_disks = conf->geo.raid_disks;
+	rdev_for_each(rdev, mddev)
+		rdev->new_data_offset = rdev->data_offset;
+	smp_wmb();
+	conf->reshape_progress = MaxSector;
+	mddev->reshape_position = MaxSector;
+	spin_unlock_irq(&conf->device_lock);
+	return ret;
 }
 
 /* Calculate the last device-address that could contain

From 30b67645faadcdc32072f999796b99bd0a20f2d5 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:28 +1000
Subject: [PATCH 48/56] md/raid5: Allow reshape while a bitmap is present.

We always should have allowed this.  A raid5 reshape doesn't change
the size of the bitmap, so not need to restrict it.

Also add a test to make sure we don't try to start a reshape on a
failed array.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index eab6168bb7f..8fba57e8f78 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5559,9 +5559,6 @@ static int check_reshape(struct mddev *mddev)
 	    mddev->new_layout == mddev->layout &&
 	    mddev->new_chunk_sectors == mddev->chunk_sectors)
 		return 0; /* nothing to do */
-	if (mddev->bitmap)
-		/* Cannot grow a bitmap yet */
-		return -EBUSY;
 	if (has_failed(conf))
 		return -EINVAL;
 	if (mddev->delta_disks < 0) {
@@ -5596,6 +5593,9 @@ static int raid5_start_reshape(struct mddev *mddev)
 	if (!check_stripe_cache(mddev))
 		return -ENOSPC;
 
+	if (has_failed(conf))
+		return -EINVAL;
+
 	rdev_for_each(rdev, mddev) {
 		if (!test_bit(In_sync, &rdev->flags)
 		    && !test_bit(Faulty, &rdev->flags))

From 47525e59e40ffb8cbc944c0055e9c4902cd3ee99 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 22 May 2012 13:55:29 +1000
Subject: [PATCH 49/56] DM RAID: Set recovery flags on resume

Properly initialize MD recovery flags when resuming device-mapper devices.

When a device-mapper device is suspended, all I/O must stop.  This is done by
calling 'md_stop_writes' and 'mddev_suspend'.  These calls in-turn manipulate
the recovery flags - including setting 'MD_RECOVERY_FROZEN'.  The DM device
may have been suspended while recovery was not yet complete, so the process
needs to pick-up where it left off.  Since 'mddev_resume' does not unset
'MD_RECOVERY_FROZEN' and set 'MD_RECOVERY_NEEDED', we must do it ourselves.
'MD_RECOVERY_NEEDED' can safely be set in 'mddev_resume', but 'MD_RECOVERY_FROZEN'
must be set outside of 'mddev_resume' due to how MD handles RAID reshaping.
(e.g.  It is possible for a user to delay reshaping a RAID5->RAID6 by purposefully
setting 'MD_RECOVERY_FROZEN'.  Clearing it in 'mddev_resume' would override the
desired behavior.)

Because 'mddev_resume' already unconditionally calls 'md_wakeup_thread(mddev->thread)'
there is no need to make this call from 'raid_resume' since it calls 'mddev_resume'.

Also clean up where  level_store calls mddev_resume() - it current
duplicates some of the funcitons of that call. - NB

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/dm-raid.c | 4 ++--
 drivers/md/md.c      | 5 ++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 73a068da10d..ea2d90c78f7 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1252,9 +1252,9 @@ static void raid_resume(struct dm_target *ti)
 	if (!rs->bitmap_loaded) {
 		bitmap_load(&rs->md);
 		rs->bitmap_loaded = 1;
-	} else
-		md_wakeup_thread(rs->md.thread);
+	}
 
+	clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
 	mddev_resume(&rs->md);
 }
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 86adf4ac46c..4c9836885d3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -402,6 +402,7 @@ void mddev_resume(struct mddev *mddev)
 	wake_up(&mddev->sb_wait);
 	mddev->pers->quiesce(mddev, 0);
 
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
 	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
 }
@@ -3673,10 +3674,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 		del_timer_sync(&mddev->safemode_timer);
 	}
 	pers->run(mddev);
-	mddev_resume(mddev);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-	md_wakeup_thread(mddev->thread);
+	mddev_resume(mddev);
 	sysfs_notify(&mddev->kobj, NULL, "level");
 	md_new_event(mddev);
 	return rv;

From 81f382f9e0b25ef56b1c0283c900b86b91a5e4c7 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 22 May 2012 13:55:30 +1000
Subject: [PATCH 50/56] DM RAID: Record and handle missing devices

Missing dm-raid devices should be recorded in the superblock

When specifying the devices that compose a DM RAID array, it is possible to denote
failed or missing devices with '-'s.  When this occurs, we must record this in the
superblock.  We do this by checking if the array position's data device is missing
and then forcing MD to record the superblock by setting 'MD_CHANGE_DEVS' in
'raid_resume'.  If we do not cause the superblock to be rewritten by the resume
function, it is possible for a stale superblock to be written by an out-going
in-active table (during 'raid_dtr').

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/dm-raid.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index ea2d90c78f7..f1797c4f09c 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -614,16 +614,18 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
 
 static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
 {
-	struct md_rdev *r;
+	int i;
 	uint64_t failed_devices;
 	struct dm_raid_superblock *sb;
+	struct raid_set *rs = container_of(mddev, struct raid_set, md);
 
 	sb = page_address(rdev->sb_page);
 	failed_devices = le64_to_cpu(sb->failed_devices);
 
-	rdev_for_each(r, mddev)
-		if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
-			failed_devices |= (1ULL << r->raid_disk);
+	for (i = 0; i < mddev->raid_disks; i++)
+		if (!rs->dev[i].data_dev ||
+		    test_bit(Faulty, &(rs->dev[i].rdev.flags)))
+			failed_devices |= (1ULL << i);
 
 	memset(sb, 0, sizeof(*sb));
 
@@ -1249,6 +1251,7 @@ static void raid_resume(struct dm_target *ti)
 {
 	struct raid_set *rs = ti->private;
 
+	set_bit(MD_CHANGE_DEVS, &rs->md.flags);
 	if (!rs->bitmap_loaded) {
 		bitmap_load(&rs->md);
 		rs->bitmap_loaded = 1;

From c32fb9e7ecee25a5b6a45bf968dfef76f323e185 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 22 May 2012 13:55:31 +1000
Subject: [PATCH 51/56] DM RAID: Use md_error() in place of simply setting
 Faulty bit

When encountering an error while reading the superblock, call md_error.

We are currently setting the 'Faulty' bit on one of the array devices when an
error is encountered while reading the superblock of a dm-raid array.  We should
be calling md_error(), as it handles the error more completely.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/dm-raid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index f1797c4f09c..017c34d78d6 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -603,7 +603,7 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
 	if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
 		DMERR("Failed to read superblock of device at position %d",
 		      rdev->raid_disk);
-		set_bit(Faulty, &rdev->flags);
+		md_error(rdev->mddev, rdev);
 		return -EINVAL;
 	}
 

From 4f0a5e012cf41321d611e7cad63e1017d143d138 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 22 May 2012 13:55:31 +1000
Subject: [PATCH 52/56] MD RAID1: Further conditionalize 'fullsync'

A RAID1 device does not necessarily need a fullsync if the bitmap can be used instead.

Similar to commit d6b212f4b19da5301e6b6eca562e5c7a2a6e8c8d in raid5.c, if a raid1
device can be brought back (i.e. from a transient failure) it shouldn't need a
complete resync.  Provided the bitmap is not to old, it will have recorded the areas
of the disk that need recovery.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8e717bd518e..835de7168cd 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2600,7 +2600,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 		if (!disk->rdev ||
 		    !test_bit(In_sync, &disk->rdev->flags)) {
 			disk->head_position = 0;
-			if (disk->rdev)
+			if (disk->rdev &&
+			    (disk->rdev->saved_raid_disk < 0))
 				conf->fullsync = 1;
 		} else if (conf->last_used < 0)
 			/*

From 0c098220e2320c1f0c9339d0ff05c5e04672133a Mon Sep 17 00:00:00 2001
From: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Date: Tue, 22 May 2012 13:55:32 +1000
Subject: [PATCH 53/56] md: check the return of mddev_find()

Check the return of mddev_find(), since it may fail due to out of
memeory or out of usable minor number.

The reason I chose -ENODEV instead of -ENOMEM or something else is
md_alloc() function chose that ;)

Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4c9836885d3..1c2f9048e1a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6618,6 +6618,9 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 	struct mddev *mddev = mddev_find(bdev->bd_dev);
 	int err;
 
+	if (!mddev)
+		return -ENODEV;
+
 	if (mddev->gendisk != bdev->bd_disk) {
 		/* we are racing with mddev_put which is discarding this
 		 * bd_disk.

From da7613b8b042ab1d3ea55459ed97301e4a2097bb Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:33 +1000
Subject: [PATCH 54/56] md/raid5: improve removal of extra devices after
 reshape.

After a reshape which reduced the number of devices we need
to disconnect the extra devices.
The code for this doesn't currently handle 'replacement' devices.
It is very unlikely that such devices will be present, but it is
safest to handle them anyway.

So simplify the handling.  Just clear In_sync and leave it
to remove_and_add_spaces (which will be called soon) to do
the real works.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8fba57e8f78..d26767246d2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5755,11 +5755,11 @@ static void raid5_finish_reshape(struct mddev *mddev)
 			     d < conf->raid_disks - mddev->delta_disks;
 			     d++) {
 				struct md_rdev *rdev = conf->disks[d].rdev;
-				if (rdev &&
-				    raid5_remove_disk(mddev, rdev) == 0) {
-					sysfs_unlink_rdev(mddev, rdev);
-					rdev->raid_disk = -1;
-				}
+				if (rdev)
+					clear_bit(In_sync, &rdev->flags);
+				rdev = conf->disks[d].replacement;
+				if (rdev)
+					clear_bit(In_sync, &rdev->flags);
 			}
 		}
 		mddev->layout = conf->algorithm;

From 63aced61023a054825ffd6e33ac1ab99036dadd2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:33 +1000
Subject: [PATCH 55/56] md/raid10: Remove extras after reshape to smaller
 number of devices.

When a reshape which reduced the number of devices finishes
we must remove the extra devices.

So ensure  that raid10_remove_disk won't try to keep them, and
have raid10_finish_reshape clear the 'in_sync' flag.  Then
remove_and_add_spares will be able to remove them.

Reported-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ae2a5a4c6bc..987db37cb87 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1731,6 +1731,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	if (!test_bit(Faulty, &rdev->flags) &&
 	    mddev->recovery_disabled != p->recovery_disabled &&
 	    (!p->replacement || p->replacement == rdev) &&
+	    number < conf->geo.raid_disks &&
 	    enough(conf, -1)) {
 		err = -EBUSY;
 		goto abort;
@@ -4485,6 +4486,18 @@ static void raid10_finish_reshape(struct mddev *mddev)
 		mddev->resync_max_sectors = size;
 		set_capacity(mddev->gendisk, mddev->array_sectors);
 		revalidate_disk(mddev->gendisk);
+	} else {
+		int d;
+		for (d = conf->geo.raid_disks ;
+		     d < conf->geo.raid_disks - mddev->delta_disks;
+		     d++) {
+			struct md_rdev *rdev = conf->mirrors[d].rdev;
+			if (rdev)
+				clear_bit(In_sync, &rdev->flags);
+			rdev = conf->mirrors[d].replacement;
+			if (rdev)
+				clear_bit(In_sync, &rdev->flags);
+		}
 	}
 	mddev->layout = mddev->new_layout;
 	mddev->chunk_sectors = 1 << conf->geo.chunk_shift;

From 1dff2b87a34a1ac1d1898ea109bf97ed396aca53 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 22 May 2012 13:55:34 +1000
Subject: [PATCH 56/56] md/bitmap: record the space available for the bitmap in
 the superblock.

Now that bitmaps can grow and shrink it is best if we record
how much space is available.  This means that when
we reduce the size of the bitmap we won't "lose" the space
for late when we might want to increase the size of the bitmap
again.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 7 +++++++
 drivers/md/bitmap.h | 4 +++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 8f681da0027..15dbe03117e 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -423,6 +423,8 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	/* This might have been changed by a reshape */
 	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
 	sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize);
+	sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
+					   bitmap_info.space);
 	kunmap_atomic(sb);
 	write_page(bitmap, bitmap->storage.sb_page, 1);
 }
@@ -536,6 +538,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	bitmap_super_t *sb;
 	unsigned long chunksize, daemon_sleep, write_behind;
 	unsigned long long events;
+	unsigned long sectors_reserved = 0;
 	int err = -EINVAL;
 	struct page *sb_page;
 
@@ -573,6 +576,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	chunksize = le32_to_cpu(sb->chunksize);
 	daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
 	write_behind = le32_to_cpu(sb->write_behind);
+	sectors_reserved = le32_to_cpu(sb->sectors_reserved);
 
 	/* verify that the bitmap-specific fields are valid */
 	if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
@@ -633,6 +637,9 @@ out_no_sb:
 	bitmap->mddev->bitmap_info.chunksize = chunksize;
 	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
 	bitmap->mddev->bitmap_info.max_write_behind = write_behind;
+	if (bitmap->mddev->bitmap_info.space == 0 ||
+	    bitmap->mddev->bitmap_info.space > sectors_reserved)
+		bitmap->mddev->bitmap_info.space = sectors_reserved;
 	if (err)
 		bitmap_print_sb(bitmap);
 	return err;
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 04dcde3871b..df4aeb6ac6f 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -128,8 +128,10 @@ typedef struct bitmap_super_s {
 	__le32 chunksize;    /* 52  the bitmap chunk size in bytes */
 	__le32 daemon_sleep; /* 56  seconds between disk flushes */
 	__le32 write_behind; /* 60  number of outstanding write-behind writes */
+	__le32 sectors_reserved; /* 64 number of 512-byte sectors that are
+				  * reserved for the bitmap. */
 
-	__u8  pad[256 - 64]; /* set to zero */
+	__u8  pad[256 - 68]; /* set to zero */
 } bitmap_super_t;
 
 /* notes: