From bc78c57388e7f447f58e30d60b1505ddaaaf3a7d Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Thu, 11 Oct 2012 13:08:02 +1100 Subject: [PATCH 01/29] md/linear: rcu_dereference outside read-lock section According to the comment in linear_stop function rcu_dereference in linear_start and linear_stop functions occurs under reconfig_mutex. The patch represents this agreement in code and prevents lockdep complaint. Found by Linux Driver Verification project (linuxtesting.org) Signed-off-by: Denis Efremov Signed-off-by: NeilBrown --- drivers/md/linear.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index fa211d80fc0..92c64d162a9 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -244,7 +244,9 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) if (!newconf) return -ENOMEM; - oldconf = rcu_dereference(mddev->private); + oldconf = rcu_dereference_protected(mddev->private, + lockdep_is_held( + &mddev->reconfig_mutex)); mddev->raid_disks++; rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); @@ -256,7 +258,10 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) static int linear_stop (struct mddev *mddev) { - struct linear_conf *conf = mddev->private; + struct linear_conf *conf = + rcu_dereference_protected(mddev->private, + lockdep_is_held( + &mddev->reconfig_mutex)); /* * We do not require rcu protection here since From f1cad2b68ed12c0f82d3f56e150691f62b6f5edf Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:08:44 +1100 Subject: [PATCH 02/29] md: linear supports TRIM This makes md linear support TRIM. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/linear.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 92c64d162a9..21014836bdb 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -138,6 +138,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) struct linear_conf *conf; struct md_rdev *rdev; int i, cnt; + bool discard_supported = false; conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info), GFP_KERNEL); @@ -171,6 +172,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) conf->array_sectors += rdev->sectors; cnt++; + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; } if (cnt != raid_disks) { printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", @@ -178,6 +181,11 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) goto out; } + if (!discard_supported) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + /* * Here we calculate the device offsets. */ @@ -331,6 +339,14 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) bio->bi_sector = bio->bi_sector - start_sector + tmp_dev->rdev->data_offset; rcu_read_unlock(); + + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { + /* Just ignore it */ + bio_endio(bio, 0); + return; + } + generic_make_request(bio); } From c83057a1f4f987327c49448b046d9625c612ed8e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:25:44 +1100 Subject: [PATCH 03/29] md: raid 0 supports TRIM This makes md raid 0 support TRIM. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid0.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index a9e4fa95dfa..24b359717a7 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -88,6 +88,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) char b[BDEVNAME_SIZE]; char b2[BDEVNAME_SIZE]; struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); + bool discard_supported = false; if (!conf) return -ENOMEM; @@ -195,6 +196,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) if (!smallest || (rdev1->sectors < smallest->sectors)) smallest = rdev1; cnt++; + + if (blk_queue_discard(bdev_get_queue(rdev1->bdev))) + discard_supported = true; } if (cnt != mddev->raid_disks) { printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " @@ -272,6 +276,11 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) blk_queue_io_opt(mddev->queue, (mddev->chunk_sectors << 9) * mddev->raid_disks); + if (!discard_supported) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + pr_debug("md/raid0:%s: done.\n", mdname(mddev)); *private_conf = conf; @@ -423,6 +432,7 @@ static int raid0_run(struct mddev *mddev) return -EINVAL; blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); /* if private is not null, we are here after takeover */ if (mddev->private == NULL) { @@ -510,7 +520,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) sector_t sector = bio->bi_sector; struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ - if (bio->bi_vcnt != 1 || + if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || bio->bi_idx != 0) goto bad_map; /* This is a one page bio that upper layers @@ -536,6 +546,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) bio->bi_sector = sector_offset + zone->dev_start + tmp_dev->data_offset; + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { + /* Just ignore it */ + bio_endio(bio, 0); + return; + } + generic_make_request(bio); return; From 2ff8cc2c6d4e323de71a42affeb3041fa17d5b10 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:28:54 +1100 Subject: [PATCH 04/29] md: raid 1 supports TRIM This makes md raid 1 support TRIM. If one disk supports discard and another not, or one has discard_zero_data and another not, there could be inconsistent between data from such disks. But this should not matter, discarded data is useless. This will add extra copy in rebuild though. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid1.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 611b5f79761..05c557e8f86 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -781,7 +781,12 @@ static void flush_pending_writes(struct r1conf *conf) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio, 0); + else + generic_make_request(bio); bio = next; } } else @@ -994,6 +999,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); + const unsigned long do_discard = (bio->bi_rw + & (REQ_DISCARD | REQ_SECURE)); struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; struct raid1_plug_cb *plug = NULL; @@ -1295,7 +1302,7 @@ read_again: conf->mirrors[i].rdev->data_offset); mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | do_flush_fua | do_sync; + mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard; mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); @@ -1549,6 +1556,8 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) clear_bit(Unmerged, &rdev->flags); } md_integrity_add_rdev(rdev, mddev); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); return err; } @@ -2783,6 +2792,7 @@ static int run(struct mddev *mddev) int i; struct md_rdev *rdev; int ret; + bool discard_supported = false; if (mddev->level != 1) { printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", @@ -2812,6 +2822,8 @@ static int run(struct mddev *mddev) continue; disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; } mddev->degraded = 0; @@ -2846,6 +2858,13 @@ static int run(struct mddev *mddev) mddev->queue->backing_dev_info.congested_fn = raid1_congested; mddev->queue->backing_dev_info.congested_data = mddev; blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec); + + if (discard_supported) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); } ret = md_integrity_register(mddev); From 532a2a3fba8df076d65fdf17518eeb327b37a313 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:30:52 +1100 Subject: [PATCH 05/29] md: raid 10 supports TRIM This makes md raid 10 support TRIM. If one disk supports discard and another not, or one has discard_zero_data and another not, there could be inconsistent between data from such disks. But this should not matter, discarded data is useless. This will add extra copy in rebuild though. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid10.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1c2eb38f3c5..f92e0ed59be 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -911,7 +911,12 @@ static void flush_pending_writes(struct r10conf *conf) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio, 0); + else + generic_make_request(bio); bio = next; } } else @@ -1061,6 +1066,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_fua = (bio->bi_rw & REQ_FUA); + const unsigned long do_discard = (bio->bi_rw + & (REQ_DISCARD | REQ_SECURE)); unsigned long flags; struct md_rdev *blocked_rdev; int sectors_handled; @@ -1081,7 +1088,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) || conf->prev.near_copies < conf->prev.raid_disks))) { struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ - if (bio->bi_vcnt != 1 || + if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || bio->bi_idx != 0) goto bad_map; /* This is a one page bio that upper layers @@ -1410,7 +1417,7 @@ retry_write: conf->mirrors[d].rdev)); mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua; + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -1439,7 +1446,7 @@ retry_write: conf->mirrors[d].replacement)); mbio->bi_bdev = conf->mirrors[d].replacement->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua; + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -1723,6 +1730,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) clear_bit(Unmerged, &rdev->flags); } md_integrity_add_rdev(rdev, mddev); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + print_conf(conf); return err; } @@ -3480,6 +3490,7 @@ static int run(struct mddev *mddev) sector_t size; sector_t min_offset_diff = 0; int first = 1; + bool discard_supported = false; if (mddev->private == NULL) { conf = setup_conf(mddev); @@ -3496,6 +3507,8 @@ static int run(struct mddev *mddev) chunk_size = mddev->chunk_sectors << 9; if (mddev->queue) { + blk_queue_max_discard_sectors(mddev->queue, + mddev->chunk_sectors); blk_queue_io_min(mddev->queue, chunk_size); if (conf->geo.raid_disks % conf->geo.near_copies) blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); @@ -3541,8 +3554,16 @@ static int run(struct mddev *mddev) rdev->data_offset << 9); disk->head_position = 0; + + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; } + if (discard_supported) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + /* need to check that every block has at least one working mirror */ if (!enough(conf, -1)) { printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", From 57c67df48866d57b50d72eb198ffcc0cf7a6232d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 13:32:13 +1100 Subject: [PATCH 06/29] md/raid10: submit IO from originating thread instead of md thread. queuing writes to the md thread means that all requests go through the one processor which may not be able to keep up with very high request rates. So use the plugging infrastructure to submit all requests on unplug. If a 'schedule' is needed, we fall back on the old approach of handing the requests to the thread for it to handle. This is nearly identical to a recent patch which provided similar functionality to RAID1. Signed-off-by: NeilBrown --- drivers/md/raid10.c | 57 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f92e0ed59be..05dc96a950d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1055,6 +1055,44 @@ static sector_t choose_data_offset(struct r10bio *r10_bio, return rdev->new_data_offset; } +struct raid10_plug_cb { + struct blk_plug_cb cb; + struct bio_list pending; + int pending_cnt; +}; + +static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb, + cb); + struct mddev *mddev = plug->cb.data; + struct r10conf *conf = mddev->private; + struct bio *bio; + + if (from_schedule) { + spin_lock_irq(&conf->device_lock); + bio_list_merge(&conf->pending_bio_list, &plug->pending); + conf->pending_count += plug->pending_cnt; + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(mddev->thread); + kfree(plug); + return; + } + + /* we aren't scheduling, so we can do the write-out directly. */ + bio = bio_list_get(&plug->pending); + bitmap_unplug(mddev->bitmap); + wake_up(&conf->wait_barrier); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + kfree(plug); +} + static void make_request(struct mddev *mddev, struct bio * bio) { struct r10conf *conf = mddev->private; @@ -1070,6 +1108,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) & (REQ_DISCARD | REQ_SECURE)); unsigned long flags; struct md_rdev *blocked_rdev; + struct blk_plug_cb *cb; + struct raid10_plug_cb *plug = NULL; int sectors_handled; int max_sectors; int sectors; @@ -1421,11 +1461,22 @@ retry_write: mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); + + cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, cb); + else + plug = NULL; spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) + if (!plug) md_wakeup_thread(mddev->thread); if (!r10_bio->devs[i].repl_bio) From 4ed8731d8e6bd2a88a30697fbf4f7e6e979a6c46 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:34:00 +1100 Subject: [PATCH 07/29] MD: change the parameter of md thread Change the thread parameter, so the thread can carry extra info. Next patch will use it. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 9 +++++---- drivers/md/md.h | 7 ++++--- drivers/md/multipath.c | 3 ++- drivers/md/raid1.c | 3 ++- drivers/md/raid10.c | 3 ++- drivers/md/raid5.c | 3 ++- 6 files changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 7a2b0793f66..8e842b326eb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6641,7 +6641,7 @@ static int md_thread(void * arg) clear_bit(THREAD_WAKEUP, &thread->flags); if (!kthread_should_stop()) - thread->run(thread->mddev); + thread->run(thread); } return 0; @@ -6656,8 +6656,8 @@ void md_wakeup_thread(struct md_thread *thread) } } -struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev *mddev, - const char *name) +struct md_thread *md_register_thread(void (*run) (struct md_thread *), + struct mddev *mddev, const char *name) { struct md_thread *thread; @@ -7206,8 +7206,9 @@ EXPORT_SYMBOL_GPL(md_allow_write); #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) -void md_do_sync(struct mddev *mddev) +void md_do_sync(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct mddev *mddev2; unsigned int currspeed = 0, window; diff --git a/drivers/md/md.h b/drivers/md/md.h index f385b038589..93ada214b50 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -540,12 +540,13 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) struct md_thread { - void (*run) (struct mddev *mddev); + void (*run) (struct md_thread *thread); struct mddev *mddev; wait_queue_head_t wqueue; unsigned long flags; struct task_struct *tsk; unsigned long timeout; + void *private; }; #define THREAD_WAKEUP 0 @@ -584,7 +585,7 @@ static inline void safe_put_page(struct page *p) extern int register_md_personality(struct md_personality *p); extern int unregister_md_personality(struct md_personality *p); extern struct md_thread *md_register_thread( - void (*run)(struct mddev *mddev), + void (*run)(struct md_thread *thread), struct mddev *mddev, const char *name); extern void md_unregister_thread(struct md_thread **threadp); @@ -603,7 +604,7 @@ extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, extern void md_super_wait(struct mddev *mddev); extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int rw, bool metadata_op); -extern void md_do_sync(struct mddev *mddev); +extern void md_do_sync(struct md_thread *thread); extern void md_new_event(struct mddev *mddev); extern int md_allow_write(struct mddev *mddev); extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 61a1833ebaf..1642eae75a3 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -335,8 +335,9 @@ abort: * 3. Performs writes following reads for array syncronising. */ -static void multipathd (struct mddev *mddev) +static void multipathd(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct multipath_bh *mp_bh; struct bio *bio; unsigned long flags; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 05c557e8f86..55ccf473053 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2294,8 +2294,9 @@ read_more: } } -static void raid1d(struct mddev *mddev) +static void raid1d(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct r1bio *r1_bio; unsigned long flags; struct r1conf *conf = mddev->private; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 05dc96a950d..54860604d09 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2732,8 +2732,9 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) } } -static void raid10d(struct mddev *mddev) +static void raid10d(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct r10bio *r10_bio; unsigned long flags; struct r10conf *conf = mddev->private; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index adda94df5eb..81c02d63440 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4625,8 +4625,9 @@ static int handle_active_stripes(struct r5conf *conf) * During the scan, completed stripes are saved for us by the interrupt * handler, so that they will not have to wait for our next wakeup. */ -static void raid5d(struct mddev *mddev) +static void raid5d(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct r5conf *conf = mddev->private; int handled; struct blk_plug plug; From 1ca69c4bc4b1ef889861db39f325901eadbf9497 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 13:37:33 +1100 Subject: [PATCH 08/29] md: avoid taking the mutex on some ioctls. Some ioctls don't need to take the mutex and doing so can cause a delay as it is held during super-block update. So move those ioctls out of the mutex and rely on rcu locking to ensure we don't access stale data. Signed-off-by: NeilBrown --- drivers/md/md.c | 85 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 62 insertions(+), 23 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 8e842b326eb..feab588adb5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -674,7 +674,18 @@ static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) return NULL; } -static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev) +static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) +{ + struct md_rdev *rdev; + + rdev_for_each_rcu(rdev, mddev) + if (rdev->desc_nr == nr) + return rdev; + + return NULL; +} + +static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) { struct md_rdev *rdev; @@ -685,6 +696,17 @@ static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev) return NULL; } +static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev) +{ + struct md_rdev *rdev; + + rdev_for_each_rcu(rdev, mddev) + if (rdev->bdev->bd_dev == dev) + return rdev; + + return NULL; +} + static struct md_personality *find_pers(int level, char *clevel) { struct md_personality *pers; @@ -5509,8 +5531,9 @@ static int get_array_info(struct mddev * mddev, void __user * arg) int nr,working,insync,failed,spare; struct md_rdev *rdev; - nr=working=insync=failed=spare=0; - rdev_for_each(rdev, mddev) { + nr = working = insync = failed = spare = 0; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { nr++; if (test_bit(Faulty, &rdev->flags)) failed++; @@ -5522,6 +5545,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg) spare++; } } + rcu_read_unlock(); info.major_version = mddev->major_version; info.minor_version = mddev->minor_version; @@ -5605,7 +5629,8 @@ static int get_disk_info(struct mddev * mddev, void __user * arg) if (copy_from_user(&info, arg, sizeof(info))) return -EFAULT; - rdev = find_rdev_nr(mddev, info.number); + rcu_read_lock(); + rdev = find_rdev_nr_rcu(mddev, info.number); if (rdev) { info.major = MAJOR(rdev->bdev->bd_dev); info.minor = MINOR(rdev->bdev->bd_dev); @@ -5624,6 +5649,7 @@ static int get_disk_info(struct mddev * mddev, void __user * arg) info.raid_disk = -1; info.state = (1<pers == NULL) return -ENODEV; - rdev = find_rdev(mddev, dev); + rcu_read_lock(); + rdev = find_rdev_rcu(mddev, dev); if (!rdev) - return -ENODEV; - - md_error(mddev, rdev); - if (!test_bit(Faulty, &rdev->flags)) - return -EBUSY; - return 0; + err = -ENODEV; + else { + md_error(mddev, rdev); + if (!test_bit(Faulty, &rdev->flags)) + err = -EBUSY; + } + rcu_read_unlock(); + return err; } /* @@ -6315,6 +6345,27 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto abort; } + /* Some actions do not requires the mutex */ + switch (cmd) { + case GET_ARRAY_INFO: + if (!mddev->raid_disks && !mddev->external) + err = -ENODEV; + else + err = get_array_info(mddev, argp); + goto abort; + + case GET_DISK_INFO: + if (!mddev->raid_disks && !mddev->external) + err = -ENODEV; + else + err = get_disk_info(mddev, argp); + goto abort; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, new_decode_dev(arg)); + goto abort; + } + err = mddev_lock(mddev); if (err) { printk(KERN_INFO @@ -6387,18 +6438,10 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, */ switch (cmd) { - case GET_ARRAY_INFO: - err = get_array_info(mddev, argp); - goto done_unlock; - case GET_BITMAP_FILE: err = get_bitmap_file(mddev, argp); goto done_unlock; - case GET_DISK_INFO: - err = get_disk_info(mddev, argp); - goto done_unlock; - case RESTART_ARRAY_RW: err = restart_array(mddev); goto done_unlock; @@ -6480,10 +6523,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = hot_add_disk(mddev, new_decode_dev(arg)); goto done_unlock; - case SET_DISK_FAULTY: - err = set_disk_faulty(mddev, new_decode_dev(arg)); - goto done_unlock; - case RUN_ARRAY: err = do_md_run(mddev); goto done_unlock; From 2863b9eb44787adecba4f977d71d7fd876805b1c Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 11 Oct 2012 13:38:58 +1100 Subject: [PATCH 09/29] MD RAID10: Prep for DM RAID10 device replacement capability MD RAID10: Fix a couple potential kernel panics if RAID10 is used by dm-raid When device-mapper uses the RAID10 personality through dm-raid.c, there is no 'gendisk' structure in mddev and some sysfs information is also not populated. This patch avoids touching those non-existent structures. Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- drivers/md/md.c | 10 ++++++++-- drivers/md/raid10.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index feab588adb5..85e6786653e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2044,8 +2044,14 @@ EXPORT_SYMBOL(md_integrity_register); /* Disable data integrity if non-capable/non-matching disk is being added */ void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) { - struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); - struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk); + struct blk_integrity *bi_rdev; + struct blk_integrity *bi_mddev; + + if (!mddev->gendisk) + return; + + bi_rdev = bdev_get_integrity(rdev->bdev); + bi_mddev = blk_get_integrity(mddev->gendisk); if (!bi_mddev) /* nothing to do */ return; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 54860604d09..fb5bd607e15 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1694,7 +1694,7 @@ static int raid10_spare_active(struct mddev *mddev) && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { count++; - sysfs_notify_dirent(tmp->rdev->sysfs_state); + sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); } } spin_lock_irqsave(&conf->device_lock, flags); From eb6491236f283eb6ebc5421fcdb14b86701a7e36 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 11 Oct 2012 13:40:09 +1100 Subject: [PATCH 10/29] DM RAID: Move 'rebuild' checking code to its own function DM RAID: Move chunk of code to it's own function The code that checks whether device replacements/rebuilds are possible given a specific RAID type is moved to it's own function. It will further expand when the code to check RAID10 is added. A separate function makes it easier to read. Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- drivers/md/dm-raid.c | 75 +++++++++++++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 25 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 982e3e390c4..10635e965fe 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -337,6 +337,52 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size) return 0; } +/* + * validate_rebuild_devices + * @rs + * + * Determine if the devices specified for rebuild can result in a valid + * usable array that is capable of rebuilding the given devices. + * + * Returns: 0 on success, -EINVAL on failure. + */ +static int validate_rebuild_devices(struct raid_set *rs) +{ + unsigned i, rebuild_cnt = 0; + + if (!(rs->print_flags & DMPF_REBUILD)) + return 0; + + for (i = 0; i < rs->md.raid_disks; i++) + if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) + rebuild_cnt++; + + switch (rs->raid_type->level) { + case 1: + if (rebuild_cnt >= rs->md.raid_disks) + goto too_many; + break; + case 4: + case 5: + case 6: + if (rebuild_cnt > rs->raid_type->parity_devs) + goto too_many; + break; + case 10: + default: + DMERR("The rebuild parameter is not supported for %s", + rs->raid_type->name); + rs->ti->error = "Rebuild not supported for this RAID type"; + return -EINVAL; + } + + return 0; + +too_many: + rs->ti->error = "Too many rebuild devices specified"; + return -EINVAL; +} + /* * Possible arguments are... * [optional_args] @@ -365,7 +411,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv, { char *raid10_format = "near"; unsigned raid10_copies = 2; - unsigned i, rebuild_cnt = 0; + unsigned i; unsigned long value, region_size = 0; sector_t sectors_per_dev = rs->ti->len; sector_t max_io_len; @@ -461,30 +507,6 @@ static int parse_raid_params(struct raid_set *rs, char **argv, /* Parameters that take a numeric value are checked here */ if (!strcasecmp(key, "rebuild")) { - rebuild_cnt++; - - switch (rs->raid_type->level) { - case 1: - if (rebuild_cnt >= rs->md.raid_disks) { - rs->ti->error = "Too many rebuild devices specified"; - return -EINVAL; - } - break; - case 4: - case 5: - case 6: - if (rebuild_cnt > rs->raid_type->parity_devs) { - rs->ti->error = "Too many rebuild devices specified for given RAID type"; - return -EINVAL; - } - break; - case 10: - default: - DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name); - rs->ti->error = "Rebuild not supported for this RAID type"; - return -EINVAL; - } - if (value > rs->md.raid_disks) { rs->ti->error = "Invalid rebuild index given"; return -EINVAL; @@ -608,6 +630,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv, } rs->md.dev_sectors = sectors_per_dev; + if (validate_rebuild_devices(rs)) + return -EINVAL; + /* Assume there are no metadata devices until the drives are parsed */ rs->md.persistent = 0; rs->md.external = 1; From 4ec1e369af83f7ecdfbd48a905e44fc9910115ba Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 11 Oct 2012 13:40:24 +1100 Subject: [PATCH 11/29] DM RAID: Add rebuild capability for RAID10 DM RAID: Add code to validate replacement slots for RAID10 arrays RAID10 can handle 'copies - 1' failures for each mirror group. This code ensures the user has provided a valid array - one whose devices specified for rebuild do not exceed the amount of redundancy available. Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- Documentation/device-mapper/dm-raid.txt | 9 +++++++ drivers/md/dm-raid.c | 34 ++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index 1c184495716..728c38c242d 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -132,3 +132,12 @@ Here we can see the RAID type is raid4, there are 5 devices - all of which are 'A'live, and the array is 2/490221568 complete with recovery. Faulty or missing devices are marked 'D'. Devices that are out-of-sync are marked 'a'. + + +Version History +--------------- +1.0.0 Initial version. Support for RAID 4/5/6 +1.1.0 Added support for RAID 1 +1.2.0 Handle creation of arrays that contain failed devices. +1.3.0 Added support for RAID 10 +1.3.1 Allow device replacement/rebuild for RAID 10 diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 10635e965fe..4e79ebaab3c 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -349,6 +349,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size) static int validate_rebuild_devices(struct raid_set *rs) { unsigned i, rebuild_cnt = 0; + unsigned rebuilds_per_group, copies, d; if (!(rs->print_flags & DMPF_REBUILD)) return 0; @@ -369,6 +370,37 @@ static int validate_rebuild_devices(struct raid_set *rs) goto too_many; break; case 10: + copies = raid10_md_layout_to_copies(rs->md.layout); + if (rebuild_cnt < copies) + break; + + /* + * It is possible to have a higher rebuild count for RAID10, + * as long as the failed devices occur in different mirror + * groups (i.e. different stripes). + * + * Right now, we only allow for "near" copies. When other + * formats are added, we will have to check those too. + * + * When checking "near" format, make sure no adjacent devices + * have failed beyond what can be handled. In addition to the + * simple case where the number of devices is a multiple of the + * number of copies, we must also handle cases where the number + * of devices is not a multiple of the number of copies. + * E.g. dev1 dev2 dev3 dev4 dev5 + * A A B B C + * C D D E E + */ + rebuilds_per_group = 0; + for (i = 0; i < rs->md.raid_disks * copies; i++) { + d = i % rs->md.raid_disks; + if (!test_bit(In_sync, &rs->dev[d].rdev.flags) && + (++rebuilds_per_group >= copies)) + goto too_many; + if (!((i + 1) % copies)) + rebuilds_per_group = 0; + } + break; default: DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name); @@ -1385,7 +1417,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 3, 0}, + .version = {1, 3, 1}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, From 7386199c471f70526bbcc629f072a5a8effe218f Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 11 Oct 2012 13:40:36 +1100 Subject: [PATCH 12/29] DM RAID: Fix comparison of index and quantity for "rebuild" parameter DM RAID: Fix comparison of index and quantity for "rebuild" parameter The "rebuild" parameter takes an index argument that starts counting from zero. The conditional used to validate the index was using '>' rather than '>=', leaving the door open for an index value that would be 1 too large. Reported-by: Neil Brown Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- drivers/md/dm-raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 4e79ebaab3c..89a06a36133 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -539,7 +539,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv, /* Parameters that take a numeric value are checked here */ if (!strcasecmp(key, "rebuild")) { - if (value > rs->md.raid_disks) { + if (value >= rs->md.raid_disks) { rs->ti->error = "Invalid rebuild index given"; return -EINVAL; } From 761becff016b82a6a7a1b2ef224248da5f46bae9 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 11 Oct 2012 13:42:19 +1100 Subject: [PATCH 13/29] DM RAID: Fix for "sync" directive ineffectiveness There are two table arguments that can be given to a DM RAID target that control whether the array is forced to (re)synchronize or skip initialization: "sync" and "nosync". When "sync" is given, we set mddev->recovery_cp to 0 in order to cause the device to resynchronize. This is insufficient if there is a bitmap in use, because the array will simply look at the bitmap and see that there is no recovery necessary. The fix is to skip over the loading of the superblocks when "sync" is given, causing new superblocks to be written that will force the array to go through initialization (i.e. synchronization). Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- drivers/md/dm-raid.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 89a06a36133..45d94a7e7f6 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1017,6 +1017,19 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) freshest = NULL; rdev_for_each_safe(rdev, tmp, mddev) { + /* + * Skipping super_load due to DMPF_SYNC will cause + * the array to undergo initialization again as + * though it were new. This is the intended effect + * of the "sync" directive. + * + * When reshaping capability is added, we must ensure + * that the "sync" directive is disallowed during the + * reshape. + */ + if (rs->print_flags & DMPF_SYNC) + continue; + if (!rdev->meta_bdev) continue; From af7cf25dd15b74a78bff8ed5dc969b20dae9554d Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 11 Oct 2012 13:42:32 +1100 Subject: [PATCH 14/29] add further __init annotations to crypto/xor.c Allow particularly do_xor_speed() to be discarded post-init. Signed-off-by: Jan Beulich Signed-off-by: NeilBrown --- crypto/xor.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crypto/xor.c b/crypto/xor.c index 65c7b416b4a..35d6b3adf23 100644 --- a/crypto/xor.c +++ b/crypto/xor.c @@ -56,11 +56,11 @@ xor_blocks(unsigned int src_count, unsigned int bytes, void *dest, void **srcs) EXPORT_SYMBOL(xor_blocks); /* Set of all registered templates. */ -static struct xor_block_template *template_list; +static struct xor_block_template *__initdata template_list; #define BENCH_SIZE (PAGE_SIZE) -static void +static void __init do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) { int speed; From fd177481b440c3f7b5ee9b821a76b29fdf2a6712 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 11 Oct 2012 13:43:21 +1100 Subject: [PATCH 15/29] raid: replace list_for_each_continue_rcu with new interface This patch replaces list_for_each_continue_rcu() with list_for_each_entry_continue_rcu() to save a few lines of code and allow removing list_for_each_continue_rcu(). Reviewed-by: Paul E. McKenney Signed-off-by: Michael Wang Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 94e7f6ba2e1..df73375c160 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -163,20 +163,17 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde * As devices are only added or removed when raid_disk is < 0 and * nr_pending is 0 and In_sync is clear, the entries we return will * still be in the same position on the list when we re-enter - * list_for_each_continue_rcu. + * list_for_each_entry_continue_rcu. */ - struct list_head *pos; rcu_read_lock(); if (rdev == NULL) /* start at the beginning */ - pos = &mddev->disks; + rdev = list_entry_rcu(&mddev->disks, struct md_rdev, same_set); else { /* release the previous rdev and start from there. */ rdev_dec_pending(rdev, mddev); - pos = &rdev->same_set; } - list_for_each_continue_rcu(pos, &mddev->disks) { - rdev = list_entry(pos, struct md_rdev, same_set); + list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) { if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) { /* this is a usable devices */ From 7ad4d4a68a1a19f21c7b39cb3f51bf17fba6e3d0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 13:44:30 +1100 Subject: [PATCH 16/29] md/raid1: Don't release reference to device while handling read error. When we get a read error, we arrange for raid1d to handle it. Currently we release the reference on the device. This can result in conf->mirrors[read_disk].rdev being NULL in fix_read_error, if the device happens to get removed before the read error is handled. So instead keep the reference until the read error has been fully handled. Reported-by: hank Signed-off-by: NeilBrown --- drivers/md/raid1.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 55ccf473053..e913356b257 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -333,9 +333,10 @@ static void raid1_end_read_request(struct bio *bio, int error) spin_unlock_irqrestore(&conf->device_lock, flags); } - if (uptodate) + if (uptodate) { raid_end_bio_io(r1_bio); - else { + rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); + } else { /* * oops, read error: */ @@ -349,9 +350,8 @@ static void raid1_end_read_request(struct bio *bio, int error) (unsigned long long)r1_bio->sector); set_bit(R1BIO_ReadError, &r1_bio->state); reschedule_retry(r1_bio); + /* don't drop the reference on read_disk yet */ } - - rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); } static void close_write(struct r1bio *r1_bio) @@ -2229,6 +2229,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) unfreeze_array(conf); } else md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); + rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); bio = r1_bio->bios[r1_bio->read_disk]; bdevname(bio->bi_bdev, b); From 582e2e056a5c3410174c23f5134e6b00e0db9101 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Thu, 11 Oct 2012 13:45:36 +1100 Subject: [PATCH 17/29] md/bitmap:Don't use IS_ERR to judge alloc_page(). Signed-off-by: Jianpeng Ma Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index df73375c160..7155945f8eb 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -470,14 +470,10 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) { bitmap_super_t *sb; unsigned long chunksize, daemon_sleep, write_behind; - int err = -EINVAL; bitmap->storage.sb_page = alloc_page(GFP_KERNEL); - if (IS_ERR(bitmap->storage.sb_page)) { - err = PTR_ERR(bitmap->storage.sb_page); - bitmap->storage.sb_page = NULL; - return err; - } + if (bitmap->storage.sb_page == NULL) + return -ENOMEM; bitmap->storage.sb_page->index = 0; sb = kmap_atomic(bitmap->storage.sb_page); From 620125f2bf8ff0c4969b79653b54d7bcc9d40637 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:49:05 +1100 Subject: [PATCH 18/29] MD: raid5 trim support Discard for raid4/5/6 has limitation. If discard request size is small, we do discard for one disk, but we need calculate parity and write parity disk. To correctly calculate parity, zero_after_discard must be guaranteed. Even it's true, we need do discard for one disk but write another disks, which makes the parity disks wear out fast. This doesn't make sense. So an efficient discard for raid4/5/6 should discard all data disks and parity disks, which requires the write pattern to be (A, A+chunk_size, A+chunk_size*2...). If A's size is smaller than chunk_size, such pattern is almost impossible in practice. So in this patch, I only handle the case that A's size equals to chunk_size. That is discard request should be aligned to stripe size and its size is multiple of stripe size. Since we can only handle request with specific alignment and size (or part of the request fitting stripes), we can't guarantee zero_after_discard even zero_after_discard is true in low level drives. The block layer doesn't send down correctly aligned requests even correct discard alignment is set, so I must filter out. For raid4/5/6 parity calculation, if data is 0, parity is 0. So if zero_after_discard is true for all disks, data is consistent after discard. Otherwise, data might be lost. Let's consider a scenario: discard a stripe, write data to one disk and write parity disk. The stripe could be still inconsistent till then depending on using data from other data disks or parity disks to calculate new parity. If the disk is broken, we can't restore it. So in this patch, we only enable discard support if all disks have zero_after_discard. If discard fails in one disk, we face the similar inconsistent issue above. The patch will make discard follow the same path as normal write request. If discard fails, a resync will be scheduled to make the data consistent. This isn't good to have extra writes, but data consistency is important. If a subsequent read/write request hits raid5 cache of a discarded stripe, the discarded dev page should have zero filled, so the data is consistent. This patch will always zero dev page for discarded request stripe. This isn't optimal because discard request doesn't need such payload. Next patch will avoid it. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.c | 168 ++++++++++++++++++++++++++++++++++++++++++++- drivers/md/raid5.h | 1 + 2 files changed, 166 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 81c02d63440..74dcf19cfe6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rw = WRITE_FUA; else rw = WRITE; + if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags)) + rw |= REQ_DISCARD; } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) rw = READ; else if (test_and_clear_bit(R5_WantReplace, @@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) set_bit(R5_WantFUA, &dev->flags); if (wbi->bi_rw & REQ_SYNC) set_bit(R5_SyncIO, &dev->flags); - tx = async_copy_data(1, wbi, dev->page, - dev->sector, tx); + if (wbi->bi_rw & REQ_DISCARD) { + memset(page_address(dev->page), 0, + STRIPE_SECTORS << 9); + set_bit(R5_Discard, &dev->flags); + } else + tx = async_copy_data(1, wbi, dev->page, + dev->sector, tx); wbi = r5_next_bio(wbi, dev->sector); } } @@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + for (i = 0; i < sh->disks; i++) { + if (pd_idx == i) + continue; + if (!test_bit(R5_Discard, &sh->dev[i].flags)) + break; + } + if (i >= sh->disks) { + atomic_inc(&sh->count); + memset(page_address(sh->dev[pd_idx].page), 0, + STRIPE_SECTORS << 9); + set_bit(R5_Discard, &sh->dev[pd_idx].flags); + ops_complete_reconstruct(sh); + return; + } /* check if prexor is active which means only process blocks * that are part of a read-modify-write (written) */ @@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, { struct async_submit_ctl submit; struct page **blocks = percpu->scribble; - int count; + int count, i; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + for (i = 0; i < sh->disks; i++) { + if (sh->pd_idx == i || sh->qd_idx == i) + continue; + if (!test_bit(R5_Discard, &sh->dev[i].flags)) + break; + } + if (i >= sh->disks) { + atomic_inc(&sh->count); + memset(page_address(sh->dev[sh->pd_idx].page), 0, + STRIPE_SECTORS << 9); + memset(page_address(sh->dev[sh->qd_idx].page), 0, + STRIPE_SECTORS << 9); + set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); + set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); + ops_complete_reconstruct(sh); + return; + } + count = set_syndrome_sources(blocks, sh); atomic_inc(&sh->count); @@ -4067,6 +4106,88 @@ static void release_stripe_plug(struct mddev *mddev, release_stripe(sh); } +static void make_discard_request(struct mddev *mddev, struct bio *bi) +{ + struct r5conf *conf = mddev->private; + sector_t logical_sector, last_sector; + struct stripe_head *sh; + int remaining; + int stripe_sectors; + + if (mddev->reshape_position != MaxSector) + /* Skip discard while reshape is happening */ + return; + + logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); + last_sector = bi->bi_sector + (bi->bi_size>>9); + + bi->bi_next = NULL; + bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ + + stripe_sectors = conf->chunk_sectors * + (conf->raid_disks - conf->max_degraded); + logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, + stripe_sectors); + sector_div(last_sector, stripe_sectors); + + logical_sector *= conf->chunk_sectors; + last_sector *= conf->chunk_sectors; + + for (; logical_sector < last_sector; + logical_sector += STRIPE_SECTORS) { + DEFINE_WAIT(w); + int d; + again: + sh = get_active_stripe(conf, logical_sector, 0, 0, 0); + prepare_to_wait(&conf->wait_for_overlap, &w, + TASK_UNINTERRUPTIBLE); + spin_lock_irq(&sh->stripe_lock); + for (d = 0; d < conf->raid_disks; d++) { + if (d == sh->pd_idx || d == sh->qd_idx) + continue; + if (sh->dev[d].towrite || sh->dev[d].toread) { + set_bit(R5_Overlap, &sh->dev[d].flags); + spin_unlock_irq(&sh->stripe_lock); + release_stripe(sh); + schedule(); + goto again; + } + } + finish_wait(&conf->wait_for_overlap, &w); + for (d = 0; d < conf->raid_disks; d++) { + if (d == sh->pd_idx || d == sh->qd_idx) + continue; + sh->dev[d].towrite = bi; + set_bit(R5_OVERWRITE, &sh->dev[d].flags); + raid5_inc_bi_active_stripes(bi); + } + spin_unlock_irq(&sh->stripe_lock); + if (conf->mddev->bitmap) { + for (d = 0; + d < conf->raid_disks - conf->max_degraded; + d++) + bitmap_startwrite(mddev->bitmap, + sh->sector, + STRIPE_SECTORS, + 0); + sh->bm_seq = conf->seq_flush + 1; + set_bit(STRIPE_BIT_DELAY, &sh->state); + } + + set_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe_plug(mddev, sh); + } + + remaining = raid5_dec_bi_active_stripes(bi); + if (remaining == 0) { + md_write_end(mddev); + bio_endio(bi, 0); + } +} + static void make_request(struct mddev *mddev, struct bio * bi) { struct r5conf *conf = mddev->private; @@ -4089,6 +4210,11 @@ static void make_request(struct mddev *mddev, struct bio * bi) chunk_aligned_read(mddev,bi)) return; + if (unlikely(bi->bi_rw & REQ_DISCARD)) { + make_discard_request(mddev, bi); + return; + } + logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bi->bi_sector + (bi->bi_size>>9); bi->bi_next = NULL; @@ -5362,6 +5488,7 @@ static int run(struct mddev *mddev) if (mddev->queue) { int chunk_size; + bool discard_supported = true; /* read-ahead size must cover two whole stripes, which * is 2 * (datadisks) * chunksize where 'n' is the * number of raid devices @@ -5381,13 +5508,48 @@ static int run(struct mddev *mddev) blk_queue_io_min(mddev->queue, chunk_size); blk_queue_io_opt(mddev->queue, chunk_size * (conf->raid_disks - conf->max_degraded)); + /* + * We can only discard a whole stripe. It doesn't make sense to + * discard data disk but write parity disk + */ + stripe = stripe * PAGE_SIZE; + mddev->queue->limits.discard_alignment = stripe; + mddev->queue->limits.discard_granularity = stripe; + /* + * unaligned part of discard request will be ignored, so can't + * guarantee discard_zerors_data + */ + mddev->queue->limits.discard_zeroes_data = 0; rdev_for_each(rdev, mddev) { disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->new_data_offset << 9); + /* + * discard_zeroes_data is required, otherwise data + * could be lost. Consider a scenario: discard a stripe + * (the stripe could be inconsistent if + * discard_zeroes_data is 0); write one disk of the + * stripe (the stripe could be inconsistent again + * depending on which disks are used to calculate + * parity); the disk is broken; The stripe data of this + * disk is lost. + */ + if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || + !bdev_get_queue(rdev->bdev)-> + limits.discard_zeroes_data) + discard_supported = false; } + + if (discard_supported && + mddev->queue->limits.max_discard_sectors >= stripe && + mddev->queue->limits.discard_granularity >= stripe) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); } return 0; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index a9fc24901ed..18b2c4a8a1f 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -298,6 +298,7 @@ enum r5dev_flags { R5_WantReplace, /* We need to update the replacement, we have read * data in, and now is a good time to write it out. */ + R5_Discard, /* Discard the stripe */ }; /* From 9e44476851e91c86c98eb92b9bc27fb801f89072 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:49:49 +1100 Subject: [PATCH 19/29] MD: raid5 avoid unnecessary zero page for trim We want to avoid zero discarded dev page, because it's useless for discard. But if we don't zero it, another read/write hit such page in the cache and will get inconsistent data. To avoid zero the page, we don't set R5_UPTODATE flag after construction is done. In this way, discard write request is still issued and finished, but read will not hit the page. If the stripe gets accessed soon, we need reread the stripe, but since the chance is low, the reread isn't a big deal. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 74dcf19cfe6..758b7729640 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -547,7 +547,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rw = WRITE_FUA; else rw = WRITE; - if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags)) + if (test_bit(R5_Discard, &sh->dev[i].flags)) rw |= REQ_DISCARD; } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) rw = READ; @@ -1172,11 +1172,9 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) set_bit(R5_WantFUA, &dev->flags); if (wbi->bi_rw & REQ_SYNC) set_bit(R5_SyncIO, &dev->flags); - if (wbi->bi_rw & REQ_DISCARD) { - memset(page_address(dev->page), 0, - STRIPE_SECTORS << 9); + if (wbi->bi_rw & REQ_DISCARD) set_bit(R5_Discard, &dev->flags); - } else + else tx = async_copy_data(1, wbi, dev->page, dev->sector, tx); wbi = r5_next_bio(wbi, dev->sector); @@ -1194,7 +1192,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref) int pd_idx = sh->pd_idx; int qd_idx = sh->qd_idx; int i; - bool fua = false, sync = false; + bool fua = false, sync = false, discard = false; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1202,13 +1200,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref) for (i = disks; i--; ) { fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); + discard |= test_bit(R5_Discard, &sh->dev[i].flags); } for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (dev->written || i == pd_idx || i == qd_idx) { - set_bit(R5_UPTODATE, &dev->flags); + if (!discard) + set_bit(R5_UPTODATE, &dev->flags); if (fua) set_bit(R5_WantFUA, &dev->flags); if (sync) @@ -1252,8 +1252,6 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, } if (i >= sh->disks) { atomic_inc(&sh->count); - memset(page_address(sh->dev[pd_idx].page), 0, - STRIPE_SECTORS << 9); set_bit(R5_Discard, &sh->dev[pd_idx].flags); ops_complete_reconstruct(sh); return; @@ -1314,10 +1312,6 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, } if (i >= sh->disks) { atomic_inc(&sh->count); - memset(page_address(sh->dev[sh->pd_idx].page), 0, - STRIPE_SECTORS << 9); - memset(page_address(sh->dev[sh->qd_idx].page), 0, - STRIPE_SECTORS << 9); set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); ops_complete_reconstruct(sh); @@ -2775,7 +2769,8 @@ static void handle_stripe_clean_event(struct r5conf *conf, if (sh->dev[i].written) { dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && - test_bit(R5_UPTODATE, &dev->flags)) { + (test_bit(R5_UPTODATE, &dev->flags) || + test_and_clear_bit(R5_Discard, &dev->flags))) { /* We can return any write requests */ struct bio *wbi, *wbi2; pr_debug("Return write for disc %d\n", i); @@ -3493,10 +3488,12 @@ static void handle_stripe(struct stripe_head *sh) if (s.written && (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) && !test_bit(R5_LOCKED, &pdev->flags) - && test_bit(R5_UPTODATE, &pdev->flags)))) && + && (test_bit(R5_UPTODATE, &pdev->flags) || + test_bit(R5_Discard, &pdev->flags))))) && (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) && !test_bit(R5_LOCKED, &qdev->flags) - && test_bit(R5_UPTODATE, &qdev->flags))))) + && (test_bit(R5_UPTODATE, &qdev->flags) || + test_bit(R5_Discard, &qdev->flags)))))) handle_stripe_clean_event(conf, sh, disks, &s.return_bi); /* Now we might consider reading some blocks, either to check/generate @@ -3523,9 +3520,11 @@ static void handle_stripe(struct stripe_head *sh) /* All the 'written' buffers and the parity block are ready to * be written back to disk */ - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && + !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); BUG_ON(sh->qd_idx >= 0 && - !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); + !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && + !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (test_bit(R5_LOCKED, &dev->flags) && From 143c4d0573caebe0ae017097614349697e2280eb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 13:50:12 +1100 Subject: [PATCH 20/29] md/raid5: add some missing locking in handle_failed_stripe. We really should hold the stripe_lock while accessing 'toread' else we could race with add_stripe_bio and corrupt a list. Reported-by: "Jianpeng Ma" Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 758b7729640..36c0a158730 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2552,8 +2552,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && (!test_bit(R5_Insync, &sh->dev[i].flags) || test_bit(R5_ReadError, &sh->dev[i].flags))) { + spin_lock_irq(&sh->stripe_lock); bi = sh->dev[i].toread; sh->dev[i].toread = NULL; + spin_unlock_irq(&sh->stripe_lock); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); if (bi) s->to_read--; From b97390aec4756373168ad2976e1f117b610513ea Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 13:50:12 +1100 Subject: [PATCH 21/29] md/raid5: protect debug message against NULL derefernce. The pr_debug in add_stripe_bio could race with something changing *bip, so it is best to hold the lock until after the pr_debug. Reported-by: "Jianpeng Ma" Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 36c0a158730..d11012604e2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2436,11 +2436,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); } - spin_unlock_irq(&sh->stripe_lock); pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", (unsigned long long)(*bip)->bi_sector, (unsigned long long)sh->sector, dd_idx); + spin_unlock_irq(&sh->stripe_lock); if (conf->mddev->bitmap && firstwrite) { bitmap_startwrite(conf->mddev->bitmap, sh->sector, From a7854487cd7128a30a7f4f5259de9f67d5efb95f Mon Sep 17 00:00:00 2001 From: Alexander Lyakas Date: Thu, 11 Oct 2012 13:50:12 +1100 Subject: [PATCH 22/29] md: When RAID5 is dirty, force reconstruct-write instead of read-modify-write. Signed-off-by: Alex Lyakas Suggested-by: Yair Hershko Signed-off-by: NeilBrown --- drivers/md/raid5.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d11012604e2..9de8221f64e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2806,12 +2806,25 @@ static void handle_stripe_dirtying(struct r5conf *conf, int disks) { int rmw = 0, rcw = 0, i; - if (conf->max_degraded == 2) { - /* RAID6 requires 'rcw' in current implementation - * Calculate the real rcw later - for now fake it + sector_t recovery_cp = conf->mddev->recovery_cp; + + /* RAID6 requires 'rcw' in current implementation. + * Otherwise, check whether resync is now happening or should start. + * If yes, then the array is dirty (after unclean shutdown or + * initial creation), so parity in some stripes might be inconsistent. + * In this case, we need to always do reconstruct-write, to ensure + * that in case of drive failure or read-error correction, we + * generate correct data from the parity. + */ + if (conf->max_degraded == 2 || + (recovery_cp < MaxSector && sh->sector >= recovery_cp)) { + /* Calculate the real rcw later - for now make it * look like rcw is cheaper */ rcw = 1; rmw = 2; + pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", + conf->max_degraded, (unsigned long long)recovery_cp, + (unsigned long long)sh->sector); } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; From 1ed850f356a0a422013846b5291acff08815008b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 13:50:13 +1100 Subject: [PATCH 23/29] md/raid5: make sure to_read and to_write never go negative. to_read and to_write are part of the result of analysing a stripe before handling it. Their use is to avoid some loops and tests if the values are known to be zero. Thus it is not a problem if they are a little bit larger than they should be. So decrementing them in handle_failed_stripe serves little value, and due to races it could cause some loops to be skipped incorrectly. So remove those decrements. Reported-by: "Jianpeng Ma" Signed-off-by: NeilBrown --- drivers/md/raid5.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9de8221f64e..ab613efbbea 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2507,10 +2507,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi = sh->dev[i].towrite; sh->dev[i].towrite = NULL; spin_unlock_irq(&sh->stripe_lock); - if (bi) { - s->to_write--; + if (bi) bitmap_end = 1; - } if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); @@ -2558,7 +2556,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, spin_unlock_irq(&sh->stripe_lock); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); - if (bi) s->to_read--; while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = From 7f7583d420231b9d09897afd57a957011b606a5b Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Thu, 11 Oct 2012 14:17:59 +1100 Subject: [PATCH 24/29] Subject: [PATCH] md:change resync_mismatches to atomic64_t to avoid races Now that multiple threads can handle stripes, it is safer to use an atomic64_t for resync_mismatches, to avoid update races. Signed-off-by: Jianpeng Ma Signed-off-by: NeilBrown --- drivers/md/md.c | 7 ++++--- drivers/md/md.h | 2 +- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- drivers/md/raid5.c | 4 ++-- 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 85e6786653e..7564c44b804 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4269,7 +4269,8 @@ static ssize_t mismatch_cnt_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", - (unsigned long long) mddev->resync_mismatches); + (unsigned long long) + atomic64_read(&mddev->resync_mismatches)); } static struct md_sysfs_entry md_scan_mode = @@ -5235,7 +5236,7 @@ static void md_clean(struct mddev *mddev) mddev->new_layout = 0; mddev->new_chunk_sectors = 0; mddev->curr_resync = 0; - mddev->resync_mismatches = 0; + atomic64_set(&mddev->resync_mismatches, 0); mddev->suspend_lo = mddev->suspend_hi = 0; mddev->sync_speed_min = mddev->sync_speed_max = 0; mddev->recovery = 0; @@ -7357,7 +7358,7 @@ void md_do_sync(struct md_thread *thread) * which defaults to physical size, but can be virtual size */ max_sectors = mddev->resync_max_sectors; - mddev->resync_mismatches = 0; + atomic64_set(&mddev->resync_mismatches, 0); /* we don't use the checkpoint if there's a bitmap */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) j = mddev->resync_min; diff --git a/drivers/md/md.h b/drivers/md/md.h index 93ada214b50..af443ab868d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -282,7 +282,7 @@ struct mddev { sector_t resync_max_sectors; /* may be set by personality */ - sector_t resync_mismatches; /* count of sectors where + atomic64_t resync_mismatches; /* count of sectors where * parity/replica mismatch found */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e913356b257..8034fbd6190 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1876,7 +1876,7 @@ static int process_checks(struct r1bio *r1_bio) } else j = 0; if (j >= 0) - mddev->resync_mismatches += r1_bio->sectors; + atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { /* No need to write to this device. */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index fb5bd607e15..146749b277c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2011,7 +2011,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) break; if (j == vcnt) continue; - mddev->resync_mismatches += r10_bio->sectors; + atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) /* Don't fix anything. */ continue; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ab613efbbea..4cc6b0e398b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2973,7 +2973,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, */ set_bit(STRIPE_INSYNC, &sh->state); else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; + atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); @@ -3125,7 +3125,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, */ } } else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; + atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); From 48c26ddc9f85808632a63b3ae50b868c86a2435f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 14:19:39 +1100 Subject: [PATCH 25/29] md: writing to sync_action should clear the read-auto state. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some cases array are started in 'read-auto' state where in nothing gets written to any device until the array is written to. The purpose of this is to make accidental auto-assembly of the wrong arrays less of a risk, and to allow arrays to be started to read suspend-to-disk images without actually changing anything (as might happen if the array were dirty and a resync seemed necessary). Explicitly writing the 'sync_action' for a read-auto array currently doesn't clear the read-auto state, so the sync action doesn't happen, which can be confusing. So allow any successful write to sync_action to clear any read-auto state. Reported-by: Alexander Kühn Signed-off-by: NeilBrown --- drivers/md/md.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 7564c44b804..200d0862335 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4259,6 +4259,13 @@ action_store(struct mddev *mddev, const char *page, size_t len) set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); } + if (mddev->ro == 2) { + /* A write to sync_action is enough to justify + * canceling read-auto mode + */ + mddev->ro = 0; + md_wakeup_thread(mddev->sync_thread); + } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_action); From 91502f099dfc5a1e8812898e26ee280713e1d002 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 11 Oct 2012 14:20:58 +1100 Subject: [PATCH 26/29] md/raid10: use correct limit variable Clang complains that we are assigning a variable to itself. This should be using bad_sectors like the similar earlier check does. Bug has been present since 3.1-rc1. It is minor but could conceivably cause corruption or other bad behaviour. Cc: stable@vger.kernel.org Signed-off-by: Dan Carpenter Signed-off-by: NeilBrown --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 146749b277c..867d1b4e963 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3218,7 +3218,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, else { bad_sectors -= (sector - first_bad); if (max_sync > bad_sectors) - max_sync = max_sync; + max_sync = bad_sectors; continue; } } From db07d85ef6395d4768c6cdceeb2ea4878bd2a3f4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 14:22:17 +1100 Subject: [PATCH 27/29] md: make sure manual changes to recovery checkpoint are saved. If you make an array bigger but suppress resync of the new region with mdadm --grow /dev/mdX --size=max --assume-clean then stop the array before anything is written to it, the effect of the "--assume-clean" is lost and the array will resync the new space when restarted. So ensure that we update the metadata in the case. Reported-by: Sebastian Riemer Signed-off-by: NeilBrown --- drivers/md/md.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 200d0862335..8d93f867adb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3782,6 +3782,8 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len) return -EINVAL; mddev->recovery_cp = n; + if (mddev->pers) + set_bit(MD_CHANGE_CLEAN, &mddev->flags); return len; } static struct md_sysfs_entry md_resync_start = From e56108d65f8705170d238858616728359542aebb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 14:24:13 +1100 Subject: [PATCH 28/29] md/raid5: be careful not to resize_stripes too big. When a RAID5 is reshaping, conf->raid_disks is increased before mddev->delta_disks becomes zero. This can result in check_reshape calling resize_stripes with a number that is too large. This particularly happens when md_check_recovery calls ->check_reshape(). If we use ->previous_raid_disks, we don't risk this. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4cc6b0e398b..2737247d2a0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5871,7 +5871,8 @@ static int check_reshape(struct mddev *mddev) if (!check_stripe_cache(mddev)) return -ENOSPC; - return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); + return resize_stripes(conf, (conf->previous_raid_disks + + mddev->delta_disks)); } static int raid5_start_reshape(struct mddev *mddev) From 72f36d5972a166197036c1281963f6863c429bf2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 14:25:57 +1100 Subject: [PATCH 29/29] md: refine reporting of resync/reshape delays. If 'resync_max' is set to 0 (as is often done when starting a reshape, so the mdadm can remain in control during a sensitive period), and if the reshape request is initially delayed because another array using the same array is resyncing or reshaping etc, when user-space cannot easily tell when the delay changes from being due to a conflicting reshape, to being due to resync_max = 0. So introduce a new state: (curr_resync == 3) to reflect this, make sure it is visible both via /proc/mdstat and via the "sync_completed" sysfs attribute, and ensure that the event transition from one delay state to the other is properly notified. Signed-off-by: NeilBrown --- drivers/md/md.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 8d93f867adb..3c6eaab0b6c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4400,6 +4400,10 @@ sync_completed_show(struct mddev *mddev, char *page) if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); + if (mddev->curr_resync == 1 || + mddev->curr_resync == 2) + return sprintf(page, "delayed\n"); + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->resync_max_sectors; @@ -6807,7 +6811,11 @@ static void status_resync(struct seq_file *seq, struct mddev * mddev) int scale; unsigned int per_milli; - resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); + if (mddev->curr_resync <= 3) + resync = 0; + else + resync = mddev->curr_resync + - atomic_read(&mddev->recovery_active); if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) @@ -7033,7 +7041,7 @@ static int md_seq_show(struct seq_file *seq, void *v) if (mddev->curr_resync > 2) { status_resync(seq, mddev); seq_printf(seq, "\n "); - } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) + } else if (mddev->curr_resync >= 1) seq_printf(seq, "\tresync=DELAYED\n "); else if (mddev->recovery_cp < MaxSector) seq_printf(seq, "\tresync=PENDING\n "); @@ -7423,8 +7431,11 @@ void md_do_sync(struct md_thread *thread) "md: resuming %s of %s from checkpoint.\n", desc, mdname(mddev)); mddev->curr_resync = j; - } + } else + mddev->curr_resync = 3; /* no longer delayed */ mddev->curr_resync_completed = j; + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + md_new_event(mddev); blk_start_plug(&plug); while (j < max_sectors) { @@ -7477,7 +7488,8 @@ void md_do_sync(struct md_thread *thread) break; j += sectors; - if (j>1) mddev->curr_resync = j; + if (j > 2) + mddev->curr_resync = j; mddev->curr_mark_cnt = io_sectors; if (last_check == 0) /* this is the earliest that rebuild will be @@ -7599,8 +7611,6 @@ static int remove_and_add_spares(struct mddev *mddev) int spares = 0; int removed = 0; - mddev->curr_resync_completed = 0; - rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && @@ -7791,6 +7801,7 @@ void md_check_recovery(struct mddev *mddev) /* Set RUNNING before clearing NEEDED to avoid * any transients in the value of "sync_action". */ + mddev->curr_resync_completed = 0; set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); /* Clear some bits that don't mean anything, but * might be left set @@ -7804,7 +7815,7 @@ void md_check_recovery(struct mddev *mddev) /* no recovery is running. * remove any failed drives, then * add spares if possible. - * Spare are also removed and re-added, to allow + * Spares are also removed and re-added, to allow * the personality to fail the re-add. */