diff --git a/drivers/md/md.c b/drivers/md/md.c
index c842e34d850..177d2a7d7ce 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -218,6 +218,8 @@ static mddev_t * mddev_find(dev_t unit)
 	INIT_LIST_HEAD(&new->all_mddevs);
 	init_timer(&new->safemode_timer);
 	atomic_set(&new->active, 1);
+	bio_list_init(&new->write_list);
+	spin_lock_init(&new->write_lock);
 
 	new->queue = blk_alloc_queue(GFP_KERNEL);
 	if (!new->queue) {
@@ -1251,9 +1253,11 @@ static void md_update_sb(mddev_t * mddev)
 	int err, count = 100;
 	struct list_head *tmp;
 	mdk_rdev_t *rdev;
+	int sync_req;
 
-	mddev->sb_dirty = 0;
 repeat:
+	spin_lock(&mddev->write_lock);
+	sync_req = mddev->in_sync;
 	mddev->utime = get_seconds();
 	mddev->events ++;
 
@@ -1272,8 +1276,12 @@ repeat:
 	 * do not write anything to disk if using
 	 * nonpersistent superblocks
 	 */
-	if (!mddev->persistent)
+	if (!mddev->persistent) {
+		mddev->sb_dirty = 0;
+		spin_unlock(&mddev->write_lock);
 		return;
+	}
+	spin_unlock(&mddev->write_lock);
 
 	dprintk(KERN_INFO
 		"md: updating %s RAID superblock on device (in sync %d)\n",
@@ -1304,6 +1312,15 @@ repeat:
 			printk(KERN_ERR \
 				"md: excessive errors occurred during superblock update, exiting\n");
 	}
+	spin_lock(&mddev->write_lock);
+	if (mddev->in_sync != sync_req) {
+		/* have to write it out again */
+		spin_unlock(&mddev->write_lock);
+		goto repeat;
+	}
+	mddev->sb_dirty = 0;
+	spin_unlock(&mddev->write_lock);
+
 }
 
 /*
@@ -3178,19 +3195,31 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
 }
 
 
-void md_write_start(mddev_t *mddev)
+/* md_write_start(mddev, bi)
+ * If we need to update some array metadata (e.g. 'active' flag
+ * in superblock) before writing, queue bi for later writing
+ * and return 0, else return 1 and it will be written now
+ */
+int md_write_start(mddev_t *mddev, struct bio *bi)
 {
-	if (!atomic_read(&mddev->writes_pending)) {
-		mddev_lock_uninterruptible(mddev);
-		if (mddev->in_sync) {
-			mddev->in_sync = 0;
-			del_timer(&mddev->safemode_timer);
-			md_update_sb(mddev);
-		}
-		atomic_inc(&mddev->writes_pending);
-		mddev_unlock(mddev);
-	} else
-		atomic_inc(&mddev->writes_pending);
+	if (bio_data_dir(bi) != WRITE)
+		return 1;
+
+	atomic_inc(&mddev->writes_pending);
+	spin_lock(&mddev->write_lock);
+	if (mddev->in_sync == 0 && mddev->sb_dirty == 0) {
+		spin_unlock(&mddev->write_lock);
+		return 1;
+	}
+	bio_list_add(&mddev->write_list, bi);
+
+	if (mddev->in_sync) {
+		mddev->in_sync = 0;
+		mddev->sb_dirty = 1;
+	}
+	spin_unlock(&mddev->write_lock);
+	md_wakeup_thread(mddev->thread);
+	return 0;
 }
 
 void md_write_end(mddev_t *mddev)
@@ -3472,6 +3501,7 @@ void md_check_recovery(mddev_t *mddev)
 		mddev->sb_dirty ||
 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
+		mddev->write_list.head ||
 		(mddev->safemode == 1) ||
 		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
 		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
@@ -3480,7 +3510,9 @@ void md_check_recovery(mddev_t *mddev)
 
 	if (mddev_trylock(mddev)==0) {
 		int spares =0;
+		struct bio *blist;
 
+		spin_lock(&mddev->write_lock);
 		if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
 		    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
 			mddev->in_sync = 1;
@@ -3488,9 +3520,22 @@ void md_check_recovery(mddev_t *mddev)
 		}
 		if (mddev->safemode == 1)
 			mddev->safemode = 0;
+		blist = bio_list_get(&mddev->write_list);
+		spin_unlock(&mddev->write_lock);
 
 		if (mddev->sb_dirty)
 			md_update_sb(mddev);
+
+		while (blist) {
+			struct bio *b = blist;
+			blist = blist->bi_next;
+			b->bi_next = NULL;
+			generic_make_request(b);
+			/* we already counted this, so need to un-count */
+			md_write_end(mddev);
+		}
+
+
 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
 		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
 			/* resync/recovery still happening */
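The md.c changes above are the whole mechanism: md_write_start() parks a write on mddev->write_list whenever the on-disk metadata still claims the array is clean (or a superblock write is pending), md_update_sb() now clears sb_dirty under write_lock and loops if in_sync changed while it was writing, and md_check_recovery() detaches the parked list, flushes the superblock, then resubmits every bio. What follows is a minimal userspace model of that handshake, compilable with "cc model.c -lpthread"; struct array, write_start() and check_recovery() are illustrative stand-ins, not kernel API, and a pthread mutex plays the role of mddev->write_lock:

	#include <pthread.h>
	#include <stdio.h>

	struct req { struct req *next; int id; };

	struct array {
		pthread_mutex_t write_lock;	/* stands in for mddev->write_lock */
		struct req *write_list;		/* parked writes (mddev->write_list) */
		int in_sync;			/* on-disk metadata still says "clean" */
		int sb_dirty;			/* metadata needs rewriting */
	};

	/* Model of md_write_start(): 1 = write may proceed now,
	 * 0 = parked until the metadata change reaches disk. */
	static int write_start(struct array *a, struct req *r)
	{
		int parked = 0;

		pthread_mutex_lock(&a->write_lock);
		if (a->in_sync || a->sb_dirty) {
			r->next = a->write_list;	/* park it */
			a->write_list = r;
			if (a->in_sync) {
				a->in_sync = 0;		/* array becomes dirty... */
				a->sb_dirty = 1;	/* ...so the superblock must change first */
			}
			parked = 1;
		}
		pthread_mutex_unlock(&a->write_lock);
		return !parked;
	}

	/* Model of the md thread's side: detach the parked list under the
	 * lock, write the superblock, then release everything behind it. */
	static void check_recovery(struct array *a)
	{
		struct req *blist;

		pthread_mutex_lock(&a->write_lock);
		blist = a->write_list;
		a->write_list = NULL;
		pthread_mutex_unlock(&a->write_lock);

		if (a->sb_dirty) {
			printf("superblock rewritten: array now marked dirty\n");
			a->sb_dirty = 0;
		}
		for (; blist; blist = blist->next)
			printf("releasing parked write %d\n", blist->id);
	}

	int main(void)
	{
		struct array a = { PTHREAD_MUTEX_INITIALIZER, NULL, 1, 0 };
		struct req r1 = { NULL, 1 }, r2 = { NULL, 2 };

		printf("write 1 %s\n", write_start(&a, &r1) ? "proceeds" : "parked");
		check_recovery(&a);	/* md thread flushes metadata, resubmits */
		printf("write 2 %s\n", write_start(&a, &r2) ? "proceeds" : "parked");
		return 0;
	}

One simplification to note: the model pushes parked requests LIFO, while the kernel's bio_list (see the helpers after the last hunk) preserves submission order.
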
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b34ad56362d..3f1280bbaf3 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -530,6 +530,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
 	 */
+	if (md_write_start(mddev, bio)==0)
+		return 0;
 	spin_lock_irq(&conf->resync_lock);
 	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
 	conf->nr_pending++;
@@ -611,7 +613,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	rcu_read_unlock();
 
 	atomic_set(&r1_bio->remaining, 1);
-	md_write_start(mddev);
+
 	for (i = 0; i < disks; i++) {
 		struct bio *mbio;
 		if (!r1_bio->bios[i])
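The raid1 hunks show the calling convention every personality now follows (raid10, raid5 and raid6 below are identical in spirit): ask md_write_start() first, and if it returns 0 the bio already belongs to mddev->write_list, so make_request() must return without touching it again. Schematically, with types abbreviated and all real request handling elided (a skeleton, not compilable on its own):

	static int make_request(request_queue_t *q, struct bio *bio)
	{
		mddev_t *mddev = q->queuedata;

		/* May queue 'bio' on mddev->write_list and return 0; the bio
		 * is then owned by the md thread, which re-enters this
		 * function via generic_make_request() once the superblock
		 * has been updated. */
		if (md_write_start(mddev, bio) == 0)
			return 0;

		/* ...normal read/write handling; a write is already counted
		 * in mddev->writes_pending, so md_write_end() is still
		 * called on completion as before... */
		return 0;
	}

This is also why the old md_write_start(mddev) call deeper in each make_request() is simply deleted rather than converted: the accounting now happens up front.
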
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 9ae21504db8..bfc9f52f0ec 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -700,6 +700,9 @@ static int make_request(request_queue_t *q, struct bio * bio)
 		return 0;
 	}
 
+	if (md_write_start(mddev, bio) == 0)
+		return 0;
+
 	/*
 	 * Register the new request and wait if the reconstruction
 	 * thread has put up a bar for new requests.
@@ -774,7 +777,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	rcu_read_unlock();
 
 	atomic_set(&r10_bio->remaining, 1);
-	md_write_start(mddev);
+
 	for (i = 0; i < conf->copies; i++) {
 		struct bio *mbio;
 		int d = r10_bio->devs[i].devnum;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 63b1c59d36f..677ce49078d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1411,6 +1411,9 @@ static int make_request (request_queue_t *q, struct bio * bi)
 	sector_t logical_sector, last_sector;
 	struct stripe_head *sh;
 
+	if (md_write_start(mddev, bi)==0)
+		return 0;
+
 	if (bio_data_dir(bi)==WRITE) {
 		disk_stat_inc(mddev->gendisk, writes);
 		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
@@ -1423,8 +1426,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
 	last_sector = bi->bi_sector + (bi->bi_size>>9);
 	bi->bi_next = NULL;
 	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
-	if ( bio_data_dir(bi) == WRITE )
-		md_write_start(mddev);
+
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 		DEFINE_WAIT(w);
 
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 9d0e0e42a3b..fede16c4e8f 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -1570,6 +1570,9 @@ static int make_request (request_queue_t *q, struct bio * bi)
 	sector_t logical_sector, last_sector;
 	struct stripe_head *sh;
 
+	if (md_write_start(mddev, bi)==0)
+		return 0;
+
 	if (bio_data_dir(bi)==WRITE) {
 		disk_stat_inc(mddev->gendisk, writes);
 		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
@@ -1583,8 +1586,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
 	last_sector = bi->bi_sector + (bi->bi_size>>9);
 	bi->bi_next = NULL;
 	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
-	if ( bio_data_dir(bi) == WRITE )
-		md_write_start(mddev);
+
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 		DEFINE_WAIT(w);
 
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index a6a67d102bf..cfde8f497d6 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -69,7 +69,7 @@ extern mdk_thread_t * md_register_thread (void (*run) (mddev_t *mddev),
 extern void md_unregister_thread (mdk_thread_t *thread);
 extern void md_wakeup_thread(mdk_thread_t *thread);
 extern void md_check_recovery(mddev_t *mddev);
-extern void md_write_start(mddev_t *mddev);
+extern int md_write_start(mddev_t *mddev, struct bio *bi);
 extern void md_write_end(mddev_t *mddev);
 extern void md_handle_safemode(mddev_t *mddev);
 extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index c9a0d4013be..d92db54255a 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -15,6 +15,9 @@
 #ifndef _MD_K_H
 #define _MD_K_H
 
+/* and dm-bio-list.h is not under include/linux because.... ??? */
+#include "../../../drivers/md/dm-bio-list.h"
+
 #define MD_RESERVED  0UL
 #define LINEAR       1UL
 #define RAID0        2UL
@@ -252,6 +255,10 @@ struct mddev_s
 	atomic_t			recovery_active; /* blocks scheduled, but not written */
 	wait_queue_head_t		recovery_wait;
 	sector_t			recovery_cp;
+
+	spinlock_t			write_lock;
+	struct bio_list			write_list;
+
 	unsigned int			safemode;	/* if set, update "clean" superblock
 							 * when no writes pending.
 							 */
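For readers without the tree handy, the dm-bio-list.h helpers this patch borrows are tiny. Paraphrased here as a reading aid (consult drivers/md/dm-bio-list.h for the authoritative version): a bio_list is a head/tail pair chained through bi_next, so bio_list_add() appends in O(1) and bio_list_get() detaches the entire chain in one step, which is what lets md_check_recovery() drop write_lock before walking the parked bios:

	struct bio_list {
		struct bio *head;	/* oldest parked bio, NULL when empty */
		struct bio *tail;	/* newest, so append is O(1) */
	};

	static inline void bio_list_init(struct bio_list *bl)
	{
		bl->head = bl->tail = NULL;
	}

	static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
	{
		bio->bi_next = NULL;
		if (bl->tail)
			bl->tail->bi_next = bio;	/* append after current tail */
		else
			bl->head = bio;			/* list was empty */
		bl->tail = bio;
	}

	/* Hand the whole chain to the caller and leave the list empty. */
	static inline struct bio *bio_list_get(struct bio_list *bl)
	{
		struct bio *bio = bl->head;

		bl->head = bl->tail = NULL;
		return bio;
	}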