From: NeilBrown <neilb@suse.de>
Subject: Use FAILFAST on metadata writes where appropriate
Patch-mainline: 3.3?
References: FATE#311379

If a FAILFAST metadata write fails but md_error() does not mark the
device Faulty, it must be the last working device, so we re-write the
metadata without FAILFAST to improve the chance of success.

Signed-off-by: NeilBrown <neilb@suse.de>


Acked-by: Neil Brown <neilb@suse.de>
Signed-off-by: Neil Brown <neilb@suse.de>

---
 drivers/md/md.c |   64 ++++++++++++++++++++++++++++++++++++++++----------------
 drivers/md/md.h |    7 +++++-
 2 files changed, 52 insertions(+), 19 deletions(-)

--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -782,7 +782,13 @@ static void super_written(struct bio *bi
 		       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
 		WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
 		md_error(mddev, rdev);
-	}
+		if (!test_bit(Faulty, &rdev->flags)
+		    && (bio->bi_rw & REQ_FAILFAST_DEV)) {
+			set_bit(MD_NEED_REWRITE, &mddev->flags);
+			set_bit(LastDev, &rdev->flags);
+		}
+	} else
+		clear_bit(LastDev, &rdev->flags);
 
 	if (atomic_dec_and_test(&mddev->pending_writes))
 		wake_up(&mddev->sb_wait);
@@ -798,7 +804,13 @@ void md_super_write(mddev_t *mddev, mdk_
 	 * if zero is reached.
 	 * If an error occurred, call md_error
 	 */
-	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
+	struct bio *bio;
+	int ff = 0;
+
+	if (test_bit(Faulty, &rdev->flags))
+		return;
+
+	bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
 
 	bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
 	bio->bi_sector = sector;
@@ -806,11 +818,15 @@ void md_super_write(mddev_t *mddev, mdk_
 	bio->bi_private = rdev;
 	bio->bi_end_io = super_written;
 
+	if (test_bit(FailFast, &rdev->flags) &&
+	    !test_bit(LastDev, &rdev->flags))
+		ff = REQ_FAILFAST_DEV;
+
 	atomic_inc(&mddev->pending_writes);
-	submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio);
+	submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA | ff, bio);
 }
 
-void md_super_wait(mddev_t *mddev)
+int md_super_wait(mddev_t *mddev)
 {
 	/* wait for all superblock writes that were scheduled to complete */
 	DEFINE_WAIT(wq);
@@ -821,6 +837,9 @@ void md_super_wait(mddev_t *mddev)
 		schedule();
 	}
 	finish_wait(&mddev->sb_wait, &wq);
+	if (test_and_clear_bit(MD_NEED_REWRITE, &mddev->flags))
+		return -EAGAIN;
+	return 0;
 }
 
 static void bi_complete(struct bio *bio, int error)
@@ -1388,9 +1407,10 @@ super_90_rdev_size_change(mdk_rdev_t *rd
 	 */
 	if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
 		num_sectors = (2ULL << 32) - 2;
-	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+	do
+		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
-	md_super_wait(rdev->mddev);
+	while (md_super_wait(rdev->mddev) < 0);
 	return num_sectors;
 }
 
@@ -1761,9 +1781,10 @@ super_1_rdev_size_change(mdk_rdev_t *rde
 	sb->data_size = cpu_to_le64(num_sectors);
 	sb->super_offset = rdev->sb_start;
 	sb->sb_csum = calc_sb_1_csum(sb);
-	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
-		       rdev->sb_page);
-	md_super_wait(rdev->mddev);
+	do
+		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+			       rdev->sb_page);
+	while (md_super_wait(rdev->mddev) < 0);
 	return num_sectors;
 }
 
@@ -2305,6 +2326,7 @@ repeat:
 		"md: updating %s RAID superblock on device (in sync %d)\n",
 		mdname(mddev),mddev->in_sync);
 
+rewrite:
 	bitmap_update_sb(mddev->bitmap);
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		char b[BDEVNAME_SIZE];
@@ -2330,7 +2352,8 @@ repeat:
 			/* only need to write one superblock... */
 			break;
 	}
-	md_super_wait(mddev);
+	if (md_super_wait(mddev) < 0)
+		goto rewrite;
 	/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
 
 	spin_lock_irq(&mddev->write_lock);
@@ -4675,7 +4698,8 @@ int md_run(mddev_t *mddev)
 	
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	
-	if (mddev->flags)
+	if (test_bit(MD_CHANGE_DEVS, &mddev->flags) ||
+	    test_bit(MD_CHANGE_CLEAN, &mddev->flags))
 		md_update_sb(mddev, 0);
 
 	md_new_event(mddev);
@@ -4813,10 +4837,13 @@ static void __md_stop_writes(mddev_t *md
 
 	del_timer_sync(&mddev->safemode_timer);
 
-	bitmap_flush(mddev);
-	md_super_wait(mddev);
-
-	if (!mddev->in_sync || mddev->flags) {
+	do
+		bitmap_flush(mddev);
+	while (md_super_wait(mddev) < 0);
+
+	if (!mddev->in_sync ||
+	    test_bit(MD_CHANGE_DEVS, &mddev->flags) ||
+	    test_bit(MD_CHANGE_CLEAN, &mddev->flags)) {
 		/* mark array as shutdown cleanly */
 		mddev->in_sync = 1;
 		md_update_sb(mddev, 1);
@@ -7253,8 +7280,8 @@ void md_check_recovery(mddev_t *mddev)
 
 	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
 		return;
-	if ( ! (
-		(mddev->flags & ~ (1<<MD_CHANGE_PENDING)) ||
+	if ( ! (test_bit(MD_CHANGE_DEVS, &mddev->flags) ||
+		test_bit(MD_CHANGE_CLEAN, &mddev->flags) ||
 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
 		(mddev->external == 0 && mddev->safemode == 1) ||
@@ -7306,7 +7333,8 @@ void md_check_recovery(mddev_t *mddev)
 				sysfs_notify_dirent_safe(mddev->sysfs_state);
 		}
 
-		if (mddev->flags)
+		if (test_bit(MD_CHANGE_DEVS, &mddev->flags) ||
+		    test_bit(MD_CHANGE_CLEAN, &mddev->flags))
 			md_update_sb(mddev, 0);
 
 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -83,6 +83,10 @@ struct mdk_rdev_s
 					 * It is expects that no bad block log
 					 * is present.
 					 */
+#define LastDev		10		/* Seems to be the last working dev as
+					 * it didn't fail, so don't use FailFast
+					 * any more for metadata
+					 */
 	wait_queue_head_t blocked_wait;
 
 	int desc_nr;			/* descriptor index in the superblock */
@@ -131,6 +135,7 @@ struct mddev_s
 #define MD_CHANGE_CLEAN 1	/* transition to or from 'clean' */
 #define MD_CHANGE_PENDING 2	/* switch from 'clean' to 'active' in progress */
 #define MD_ARRAY_FIRST_USE 3    /* First use of array, needs initialization */
+#define MD_NEED_REWRITE 4	/* metadata write needs to be repeated */
 
 	int				suspended;
 	atomic_t			active_io;
@@ -493,7 +498,7 @@ extern int mddev_congested(mddev_t *mdde
 extern void md_flush_request(mddev_t *mddev, struct bio *bio);
 extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 			   sector_t sector, int size, struct page *page);
-extern void md_super_wait(mddev_t *mddev);
+extern int md_super_wait(mddev_t *mddev);
 extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, 
 			struct page *page, int rw, bool metadata_op);
 extern void md_do_sync(mddev_t *mddev);
