From: NeilBrown <neilb@suse.de>
Date: Wed, 7 Mar 2012 13:53:31 +1100
Subject: [PATCH] md: allow last device to be forcibly removed from RAID1/RAID10.
Patch-mainline: no
References: bnc#746717

When the 'last' device in a RAID1 or RAID10 reports an error,
we do not mark it as failed.  This would serve little purpose
as there is no risk of losing data beyond that which is obviously
lost (as there is with RAID5), and there could be other sectors
on the device which are readable, and only readable from this device.
In general this maximises access to data.

However the current implementation also stops an admin from removing
the last device by direct action.  This is rarely useful, but in many
cases it is not harmful and can make automation easier by removing
special cases.

Also, if an attempt to write metadata fails the device must be marked
as faulty, else an infinite loop will result, attempting to update
the metadata on all non-faulty devices.

So add a 'force' option to 'md_error()' and '->error_handler()' which
bypasses the 'last disk' checks for RAID1 and RAID10.
Set it when the removal is explicitly requested by user-space, or
when it is the result of a failed metadata write.

Signed-off-by: NeilBrown <neilb@suse.de>
Acked-by: NeilBrown <neilb@suse.de>

---
 drivers/md/md.c        |   19 ++++++++++++++-----
 drivers/md/md.h        |    4 ++++
 drivers/md/multipath.c |    8 +++++++-
 drivers/md/raid1.c     |    7 +++++++
 drivers/md/raid10.c    |    7 +++++++
 drivers/md/raid5.c     |    6 ++++++
 6 files changed, 45 insertions(+), 6 deletions(-)

--- linux-3.0-SLE11-SP2.orig/drivers/md/md.c
+++ linux-3.0-SLE11-SP2/drivers/md/md.c
@@ -769,6 +769,7 @@ static void free_disk_sb(mdk_rdev_t * rd
 	}
 }
 
+static void md_error_force(mddev_t *mddev, mdk_rdev_t *rdev);
 
 static void super_written(struct bio *bio, int error)
 {
@@ -779,7 +780,7 @@ static void super_written(struct bio *bi
 		printk("md: super_written gets error=%d, uptodate=%d\n",
 		       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
 		WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
-		md_error(mddev, rdev);
+		md_error_force(mddev, rdev);
 		if (!test_bit(Faulty, &rdev->flags)
 		    && (bio->bi_rw & REQ_FAILFAST_DEV)) {
 			set_bit(MD_NEED_REWRITE, &mddev->flags);
@@ -2439,7 +2440,7 @@ state_store(mdk_rdev_t *rdev, const char
 	 */
 	int err = -EINVAL;
 	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
-		md_error(rdev->mddev, rdev);
+		md_error_force(rdev->mddev, rdev);
 		err = 0;
 	} else if (cmd_match(buf, "remove")) {
 		if (rdev->raid_disk >= 0)
@@ -5835,7 +5836,7 @@ static int set_disk_faulty(mddev_t *mdde
 	if (!rdev)
 		return -ENODEV;
 
-	md_error(mddev, rdev);
+	md_error_force(mddev, rdev);
 	return 0;
 }
 
@@ -6281,7 +6282,7 @@ void md_unregister_thread(mdk_thread_t *
 	kfree(thread);
 }
 
-void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
+static void __md_error(mddev_t *mddev, mdk_rdev_t *rdev, int force)
 {
 	if (!mddev) {
 		MD_BUG();
@@ -6304,7 +6305,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t
 		return;
 	if (!mddev->pers->error_handler)
 		return;
-	mddev->pers->error_handler(mddev,rdev);
+	mddev->pers->error_handler(mddev, rdev, force);
 	if (mddev->degraded)
 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 	sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -6315,6 +6316,14 @@ void md_error(mddev_t *mddev, mdk_rdev_t
 		queue_work(md_misc_wq, &mddev->event_work);
 	md_new_event_inintr(mddev);
 }
+static void md_error_force(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	__md_error(mddev, rdev, 1);
+}
+void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	__md_error(mddev, rdev, 0);
+}
 
 /* seq_file implementation /proc/mdstat */
 
--- linux-3.0-SLE11-SP2.orig/drivers/md/md.h
+++ linux-3.0-SLE11-SP2/drivers/md/md.h
@@ -371,7 +371,11 @@ struct mdk_personality
 	/* error_handler must set ->faulty and clear ->in_sync
 	 * if appropriate, and should abort recovery if needed 
 	 */
+#ifdef __GENKSYMS__
 	void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev);
+#else
+	void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev, int force);
+#endif
 	int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev);
 	int (*hot_remove_disk) (mddev_t *mddev, int number);
 	int (*spare_active) (mddev_t *mddev);
--- linux-3.0-SLE11-SP2.orig/drivers/md/multipath.c
+++ linux-3.0-SLE11-SP2/drivers/md/multipath.c
@@ -183,12 +183,18 @@ static int multipath_congested(void *dat
 /*
  * Careful, this can execute in IRQ contexts as well!
  */
+#ifdef __GENKSYMS__
 static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
 {
+	int force = 0;
+#else
+static void multipath_error(mddev_t *mddev, mdk_rdev_t *rdev, int force)
+{
+#endif
 	multipath_conf_t *conf = mddev->private;
 	char b[BDEVNAME_SIZE];
 
-	if (conf->raid_disks - mddev->degraded <= 1) {
+	if (conf->raid_disks - mddev->degraded <= 1 && !force) {
 		/*
 		 * Uh oh, we can do nothing if this is our last path, but
 		 * first check if this is a queued request for a device
--- linux-3.0-SLE11-SP2.orig/drivers/md/raid1.c
+++ linux-3.0-SLE11-SP2/drivers/md/raid1.c
@@ -989,8 +989,14 @@ static void status(struct seq_file *seq,
 }
 
 
+#ifdef __GENKSYMS__
 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 {
+	int force = 0;
+#else
+static void error(mddev_t *mddev, mdk_rdev_t *rdev, int force)
+{
+#endif
 	char b[BDEVNAME_SIZE];
 	conf_t *conf = mddev->private;
 	unsigned long flags;
@@ -1003,6 +1009,7 @@ static void error(mddev_t *mddev, mdk_rd
 	 */
 	spin_lock_irqsave(&conf->device_lock, flags);
 	if (test_bit(In_sync, &rdev->flags)
+	    && !force
 	    && (conf->raid_disks - mddev->degraded) == 1) {
 		/*
 		 * Don't fail the drive, act as though we were just a
--- linux-3.0-SLE11-SP2.orig/drivers/md/raid10.c
+++ linux-3.0-SLE11-SP2/drivers/md/raid10.c
@@ -1023,8 +1023,14 @@ static int enough(conf_t *conf, int igno
 	return 1;
 }
 
+#ifdef __GENKSYMS__
 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 {
+	int force = 0;
+#else
+static void error(mddev_t *mddev, mdk_rdev_t *rdev, int force)
+{
+#endif
 	char b[BDEVNAME_SIZE];
 	conf_t *conf = mddev->private;
 	unsigned long flags;
@@ -1037,6 +1043,7 @@ static void error(mddev_t *mddev, mdk_rd
 	 */
 	spin_lock_irqsave(&conf->device_lock, flags);
 	if (test_bit(In_sync, &rdev->flags)
+	    && !force
 	    && !enough(conf, rdev->raid_disk)) {
 		/*
 		 * Don't fail the drive, just return an IO error.
--- linux-3.0-SLE11-SP2.orig/drivers/md/raid5.c
+++ linux-3.0-SLE11-SP2/drivers/md/raid5.c
@@ -1694,8 +1694,14 @@ static void raid5_build_block(struct str
 	dev->sector = compute_blocknr(sh, i, previous);
 }
 
+#ifdef __GENKSYMS__
 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 {
+	int force = 0;
+#else
+static void error(mddev_t *mddev, mdk_rdev_t *rdev, int force)
+{
+#endif
 	char b[BDEVNAME_SIZE];
 	raid5_conf_t *conf = mddev->private;
 	pr_debug("raid456: error called\n");
