From: NeilBrown <neilb@suse.de>
Subject: Throttle number of pending write requests in md/raid1
References: bnc#663678
Git-commit: 34db0cd60f8a1f4ab73d118a8be3797c20388223
Patch-mainline: v3.2

Currently write requests are added to a queue to be processed later by
the raid1d thread.  This allows an arbitrary number of requests to
queue up which is bad for latency.

So count the number of requests and allow it to be limited.

Currently the limit is effectively infinite (INT_MAX) and must be set
by changing
  /sys/module/raid1/parameters/max_queued
to be effective.

This is because there is still some uncertainty about the exact
behaviour required.  Once this is resolved the default might be
changed.  For now it is here as a place-holder: subsequent patches
change this code, so we want to have something in place.

Signed-off-by: NeilBrown <neilb@suse.de>


---
 drivers/md/raid1.c |   17 +++++++++++++++++
 drivers/md/raid1.h |    1 +
 2 files changed, 18 insertions(+)

--- linux-3.0-SLE11-SP2-3.0-neilb.orig/drivers/md/raid1.c
+++ linux-3.0-SLE11-SP2-3.0-neilb/drivers/md/raid1.c
@@ -497,11 +497,17 @@ static int read_balance(conf_t *conf, r1
 	return best_disk;
 }
 
+static int max_queued = INT_MAX;
+
 int md_raid1_congested(mddev_t *mddev, int bits)
 {
 	conf_t *conf = mddev->private;
 	int i, ret = 0;
 
+	if ((bits & (1 << BDI_async_congested)) &&
+	    conf->pending_count >= max_queued)
+		return 1;
+
 	rcu_read_lock();
 	for (i = 0; i < mddev->raid_disks; i++) {
 		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
@@ -542,7 +548,9 @@ static void flush_pending_writes(conf_t
 	if (conf->pending_bio_list.head) {
 		struct bio *bio;
 		bio = bio_list_get(&conf->pending_bio_list);
+		conf->pending_count = 0;
 		spin_unlock_irq(&conf->device_lock);
+		wake_up(&conf->wait_barrier);
 		/* flush any pending bitmap writes to
 		 * disk before proceeding w/ I/O */
 		bitmap_unplug(conf->mddev->bitmap);
@@ -800,6 +808,11 @@ static int make_request(mddev_t *mddev,
 	/*
 	 * WRITE:
 	 */
+	if (conf->pending_count >= max_queued) {
+		md_wakeup_thread(mddev->thread);
+		wait_event(conf->wait_barrier,
+			   conf->pending_count < max_queued);
+	}
 	/* first select target devices under spinlock and
 	 * inc refcount on their rdev.  Record them by setting
 	 * bios[x] to bio
@@ -902,6 +915,7 @@ static int make_request(mddev_t *mddev,
 		atomic_inc(&r1_bio->remaining);
 		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
+		conf->pending_count++;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 	r1_bio_write_done(r1_bio);
@@ -1900,6 +1914,7 @@ static conf_t *setup_conf(mddev_t *mddev
 	init_waitqueue_head(&conf->wait_barrier);
 
 	bio_list_init(&conf->pending_bio_list);
+	conf->pending_count = 0;
 
 	conf->last_used = -1;
 	for (i = 0; i < conf->raid_disks; i++) {
@@ -2264,3 +2279,5 @@ MODULE_DESCRIPTION("RAID1 (mirroring) pe
 MODULE_ALIAS("md-personality-3"); /* RAID1 */
 MODULE_ALIAS("md-raid1");
 MODULE_ALIAS("md-level-1");
+
+module_param(max_queued, int, S_IRUGO|S_IWUSR);
--- linux-3.0-SLE11-SP2-3.0-neilb.orig/drivers/md/raid1.h
+++ linux-3.0-SLE11-SP2-3.0-neilb/drivers/md/raid1.h
@@ -35,6 +35,7 @@ struct r1_private_data_s {
 	struct list_head	retry_list;
 	/* queue pending writes and submit them on unplug */
 	struct bio_list		pending_bio_list;
+	int			pending_count;
 
 	/* for use when syncing mirrors: */
 
