From: NeilBrown <neilb@suse.de>
Subject: Use FAILFAST on metadata writes where appropriate
Git-commit: 46533ff7fefb7e9e3539494f5873b00091caa8eb
Patch-mainline: v4.10
References: FATE#311379

If we get a failure writing metadata but the device doesn't fail,
it must be the last device so we re-write without FAILFAST to
improve chance of success.

Signed-off-by: NeilBrown <neilb@suse.de>


Acked-by: Neil Brown <neilb@suse.de>
Signed-off-by: Neil Brown <neilb@suse.de>

---
 drivers/md/md.c |   48 ++++++++++++++++++++++++++++++++++++------------
 drivers/md/md.h |    7 ++++++-
 2 files changed, 42 insertions(+), 13 deletions(-)

--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -725,7 +725,13 @@ static void super_written(struct bio *bi
 	if (bio->bi_error) {
 		printk("md: super_written gets error=%d\n", bio->bi_error);
 		md_error(mddev, rdev);
-	}
+		if (!test_bit(Faulty, &rdev->flags)
+		    && (bio->bi_rw & REQ_FAILFAST_DEV)) {
+			set_bit(MD_NEED_REWRITE, &mddev->flags);
+			set_bit(LastDev, &rdev->flags);
+		}
+	} else
+		clear_bit(LastDev, &rdev->flags);
 
 	if (atomic_dec_and_test(&mddev->pending_writes))
 		wake_up(&mddev->sb_wait);
@@ -741,7 +747,13 @@ void md_super_write(struct mddev *mddev,
 	 * if zero is reached.
 	 * If an error occurred, call md_error
 	 */
-	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
+	struct bio *bio;
+	int ff = 0;
+
+	if (test_bit(Faulty, &rdev->flags))
+		return;
+
+	bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
 
 	atomic_inc(&rdev->nr_pending);
 
@@ -749,14 +761,21 @@ void md_super_write(struct mddev *mddev,
 	bio->bi_private = rdev;
 	bio->bi_end_io = super_written;
 
+	if (test_bit(FailFast, &rdev->flags) &&
+	    !test_bit(LastDev, &rdev->flags))
+		ff = REQ_FAILFAST_DEV;
+
 	atomic_inc(&mddev->pending_writes);
-	submit_bio(WRITE_FLUSH_FUA, bio);
+	submit_bio(WRITE_FLUSH_FUA | ff, bio);
 }
 
-void md_super_wait(struct mddev *mddev)
+int md_super_wait(struct mddev *mddev)
 {
 	/* wait for all superblock writes that were scheduled to complete */
 	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+	if (test_and_clear_bit(MD_NEED_REWRITE, &mddev->flags))
+		return -EAGAIN;
+	return 0;
 }
 
 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
@@ -1330,9 +1349,10 @@ super_90_rdev_size_change(struct md_rdev
 	if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
 	    rdev->mddev->level >= 1)
 		num_sectors = (sector_t)(2ULL << 32) - 2;
-	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+	do
+		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
-	md_super_wait(rdev->mddev);
+	while (md_super_wait(rdev->mddev) < 0);
 	return num_sectors;
 }
 
@@ -1880,9 +1900,10 @@ super_1_rdev_size_change(struct md_rdev
 	sb->data_size = cpu_to_le64(num_sectors);
 	sb->super_offset = cpu_to_le64(rdev->sb_start);
 	sb->sb_csum = calc_sb_1_csum(sb);
-	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
-		       rdev->sb_page);
-	md_super_wait(rdev->mddev);
+	do
+		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+			       rdev->sb_page);
+	while (md_super_wait(rdev->mddev) < 0);
 	return num_sectors;
 
 }
@@ -2412,6 +2433,7 @@ repeat:
 	pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
 		 mdname(mddev), mddev->in_sync);
 
+rewrite:
 	bitmap_update_sb(mddev->bitmap);
 	rdev_for_each(rdev, mddev) {
 		char b[BDEVNAME_SIZE];
@@ -2443,7 +2465,8 @@ repeat:
 			/* only need to write one superblock... */
 			break;
 	}
-	md_super_wait(mddev);
+	if (md_super_wait(mddev) < 0)
+		goto rewrite;
 	/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
 
 	if (mddev_is_clustered(mddev) && ret == 0)
@@ -5484,8 +5507,9 @@ static void __md_stop_writes(struct mdde
 
 	del_timer_sync(&mddev->safemode_timer);
 
-	bitmap_flush(mddev);
-	md_super_wait(mddev);
+	do
+		bitmap_flush(mddev);
+	while (md_super_wait(mddev) < 0);
 
 	if (mddev->ro == 0 &&
 	    ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -189,6 +189,10 @@ enum flag_bits {
 				 * It is expects that no bad block log
 				 * is present.
 				 */
+	LastDev,		/* Seems to be the last working dev as
+				 * it didn't fail, so don't use FailFast
+				 * any more for metadata
+				 */
 };
 
 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -240,6 +244,7 @@ struct mddev {
 				 * it then */
 #define MD_JOURNAL_CLEAN 5	/* A raid with journal is already clean */
 #define MD_HAS_JOURNAL	6	/* The raid array has journal feature set */
+#define MD_NEED_REWRITE 7	/* metadata write need to be repeated */
 #define MD_RELOAD_SB	8	/* Reload the superblock because another node
 				 * updated it.
 				 */
@@ -653,7 +658,7 @@ extern int mddev_congested(struct mddev
 extern void md_flush_request(struct mddev *mddev, struct bio *bio);
 extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 			   sector_t sector, int size, struct page *page);
-extern void md_super_wait(struct mddev *mddev);
+extern int md_super_wait(struct mddev *mddev);
 extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 			struct page *page, int rw, bool metadata_op);
 extern void md_do_sync(struct md_thread *thread);
