From: jbeulich@suse.com
Subject: blkback: allow using indirect request segment descriptors
Patch-mainline: n/a
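References: fate#316876

Accept BLKIF_OP_INDIRECT requests: the segment descriptors are grant-copied
from the pages named by the front end into a per-interface buffer and then
run through the common read/write path. The limit is the
max_indirect_segments module parameter, advertised to front ends through the
"feature-max-indirect-segments" xenstore node.
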
--- sle11sp4.orig/drivers/xen/blkback/blkback.c	2013-06-07 10:52:00.000000000 +0200
+++ sle11sp4/drivers/xen/blkback/blkback.c	2014-02-06 15:35:21.000000000 +0100
@@ -98,6 +98,11 @@ module_param_call(max_ring_page_order,
 		  &blkif_max_ring_page_order, 0644);
 MODULE_PARM_DESC(max_ring_page_order, "log2 of maximum ring size (in pages)");
 
+/* Max. indirect segments advertised to front ends; clamped in blkif_init(). */
+unsigned int blkif_max_segs_per_req = BITS_PER_LONG;
+module_param_named(max_indirect_segments, blkif_max_segs_per_req, uint, 0444);
+MODULE_PARM_DESC(max_indirect_segments, "maximum number of indirect segments");
+
 /*
  * Each outstanding request that we've passed to the lower device layers has a 
  * 'pending_req' allocated to it. Each buffer_head that completes decrements 
@@ -125,7 +130,7 @@ static grant_handle_t *pending_grant_han
 
 static inline int vaddr_pagenr(pending_req_t *req, int seg)
 {
-	return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
+	return (req - pending_reqs) * blkif_max_segs_per_req + seg;
 }
 
 #define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]
@@ -141,9 +146,8 @@ static inline unsigned long vaddr(pendin
 
 
 static int do_block_io_op(blkif_t *blkif);
-static void dispatch_rw_block_io(blkif_t *blkif,
-				 blkif_request_t *req,
-				 pending_req_t *pending_req);
+static void _dispatch_rw_block_io(blkif_t *, pending_req_t *,
+				  unsigned int max_seg);
 static void make_response(blkif_t *blkif, u64 id,
 			  unsigned short op, int st);
 
@@ -179,10 +183,9 @@ static void free_req(pending_req_t *req)
 
 static void fast_flush_area(pending_req_t *req)
 {
-	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct gnttab_unmap_grant_ref unmap[32];
 	unsigned int i, invcount = 0;
 	grant_handle_t handle;
-	int ret;
 
 	for (i = 0; i < req->nr_pages; i++) {
 		handle = pending_handle(req, i);
@@ -192,12 +195,18 @@ static void fast_flush_area(pending_req_
 		gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
 				    GNTMAP_host_map, handle);
 		pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
-		invcount++;
+		if (++invcount == ARRAY_SIZE(unmap)) {
+			if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+						      unmap, invcount))
+				BUG();
+			invcount = 0;
+		}
 	}
 
-	ret = HYPERVISOR_grant_table_op(
-		GNTTABOP_unmap_grant_ref, unmap, invcount);
-	BUG_ON(ret);
+	if (invcount &&
+	    HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+				      unmap, invcount))
+		BUG();
 }
 
 /******************************************************************
@@ -418,6 +427,65 @@ static void dispatch_discard(blkif_t *bl
 	make_response(blkif, req->id, req->operation, status);
 }
 
+/* Copy a regular request into blkif->req, then run the common r/w path. */
+static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *greq,
+				 pending_req_t *pending_req)
+{
+	struct blkbk_request *rq = blkif->req;
+
+	rq->operation = greq->operation;
+	rq->handle = greq->handle;
+	rq->nr_segments = greq->nr_segments;
+	rq->id = greq->id;
+	rq->sector_number = greq->sector_number;
+	if (likely(rq->nr_segments <= BLKIF_MAX_SEGMENTS_PER_REQUEST))
+		memcpy(rq->seg, greq->seg, rq->nr_segments * sizeof(*rq->seg));
+	_dispatch_rw_block_io(blkif, pending_req,
+			      BLKIF_MAX_SEGMENTS_PER_REQUEST);
+}
+
+/* Grant-copy the indirect segment list into blkif->req, then dispatch it. */
+static void dispatch_indirect(blkif_t *blkif,
+			      struct blkif_request_indirect *greq,
+			      pending_req_t *pending_req)
+{
+	struct blkbk_request *rq = blkif->req;
+	struct gnttab_copy copy[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST], *op;
+	unsigned int idx, npages, tail;
+
+	rq->operation = greq->indirect_op;
+	rq->handle = greq->handle;
+	rq->nr_segments = greq->nr_segments;
+	rq->id = greq->id;
+	rq->sector_number = greq->sector_number;
+	if (unlikely(rq->nr_segments > blkif_max_segs_per_req))
+		goto dispatch;	/* left for the callee's segment check */
+	npages = BLKIF_INDIRECT_PAGES(rq->nr_segments);
+	for (idx = 0; idx < npages; ++idx) {
+		copy[idx].source.u.ref = greq->indirect_grefs[idx];
+		copy[idx].source.domid = blkif->domid;
+		copy[idx].source.offset = 0;
+		copy[idx].dest.u.gmfn = blkif->seg_mfn[idx];
+		copy[idx].dest.domid = DOMID_SELF;
+		copy[idx].dest.offset = blkif->seg_offs;
+		copy[idx].len = PAGE_SIZE;
+		copy[idx].flags = GNTCOPY_source_gref;
+	}
+	tail = (rq->nr_segments * sizeof(*rq->seg)) & ~PAGE_MASK;
+	if (tail)	/* final page is only partly used */
+		copy[npages - 1].len = tail;
+	if (HYPERVISOR_grant_table_op(GNTTABOP_copy, copy, npages))
+		BUG();
+	for (op = copy; op < copy + npages; ++op) {
+		if (unlikely(op->status == GNTST_eagain))
+			gnttab_check_GNTST_eagain_do_while(GNTTABOP_copy, op);
+		if (op->status != GNTST_okay)
+			rq->operation = BLKIF_OP_INDIRECT; /* force failure */
+	}
+dispatch:
+	_dispatch_rw_block_io(blkif, pending_req, blkif_max_segs_per_req);
+}
+
 static int _do_block_io_op(blkif_t *blkif)
 {
 	blkif_back_rings_t *blk_rings = &blkif->blk_rings;
@@ -486,6 +554,16 @@ static int _do_block_io_op(blkif_t *blki
 			barrier();
 			dispatch_discard(blkif, (void *)&req);
 			break;
+		case BLKIF_OP_INDIRECT:
+			pending_req = alloc_req();
+			if (!pending_req) {
+				blkif->st_oo_req++;
+				return 1;
+			}
+			blk_rings->common.req_cons = rc;
+			barrier();
+			dispatch_indirect(blkif, (void *)&req, pending_req);
+			break;
 		default:
 			/* A good sign something is wrong: sleep for a while to
 			 * avoid excessive CPU consumption by a bad guest. */
@@ -523,16 +601,14 @@ do_block_io_op(blkif_t *blkif)
 	return more_to_do;
 }
 
-static void dispatch_rw_block_io(blkif_t *blkif,
-				 blkif_request_t *req,
-				 pending_req_t *pending_req)
+static void _dispatch_rw_block_io(blkif_t *blkif,
+				  pending_req_t *pending_req,
+				  unsigned int max_seg)
 {
-	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct blkbk_request *req = blkif->req;
+	struct gnttab_map_grant_ref *map = blkif->map;
 	struct phys_req preq;
-	union {
-		unsigned int nsec;
-		struct bio *bio;
-	} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	union blkif_seg *seg = blkif->seg;
 	unsigned int nseg, i, nbio = 0;
 	struct bio *bio = NULL;
 	uint32_t flags;
@@ -557,14 +633,13 @@ static void dispatch_rw_block_io(blkif_t
 		operation = WRITE_FLUSH;
 		break;
 	default:
-		operation = 0; /* make gcc happy */
-		BUG();
+		goto fail_response;
 	}
 
 	/* Check that number of segments is sane. */
 	nseg = req->nr_segments;
 	if (unlikely(nseg == 0 && !(operation & REQ_FLUSH)) ||
-	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+	    unlikely(nseg > max_seg)) {
 		DPRINTK("Bad number of segments in request (%d)\n", nseg);
 		goto fail_response;
 	}
@@ -764,17 +839,23 @@ static void make_response(blkif_t *blkif
 
 static int __init blkif_init(void)
 {
-	int i, mmap_pages;
+	unsigned long i, mmap_pages;
 
 	if (!is_running_on_xen())
 		return -ENODEV;
 
-	mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	if (blkif_max_segs_per_req < BLKIF_MAX_SEGMENTS_PER_REQUEST)
+		blkif_max_segs_per_req = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	else if (BLKIF_INDIRECT_PAGES(blkif_max_segs_per_req) >
+		 BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST)
+		blkif_max_segs_per_req = BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST *
+					 BLKIF_SEGS_PER_INDIRECT_FRAME;
+	mmap_pages = blkif_reqs * 1UL * blkif_max_segs_per_req;
 
 	pending_reqs          = kzalloc(sizeof(pending_reqs[0]) *
 					blkif_reqs, GFP_KERNEL);
-	pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
-					mmap_pages, GFP_KERNEL);
+	pending_grant_handles = vmalloc(sizeof(pending_grant_handles[0]) *
+					mmap_pages);
 	pending_pages         = alloc_empty_pages_and_pagevec(mmap_pages);
 
 	if (blkback_pagemap_init(mmap_pages))
--- sle11sp4.orig/drivers/xen/blkback/common.h	2013-06-07 10:52:01.000000000 +0200
+++ sle11sp4/drivers/xen/blkback/common.h	2014-01-31 14:20:23.000000000 +0100
@@ -82,6 +82,21 @@ typedef struct blkif_st {
 	/* Private fields. */
 	spinlock_t       blk_ring_lock;
 	atomic_t         refcnt;
+	struct gnttab_map_grant_ref *map;
+	union blkif_seg {
+		unsigned int nsec;
+		struct bio *bio;
+	}                *seg;
+	struct blkbk_request {
+		uint8_t        operation;
+		blkif_vdev_t   handle;
+		unsigned int   nr_segments;
+		uint64_t       id;
+		blkif_sector_t sector_number;
+		struct blkif_request_segment seg[];
+	}                *req;
+	xen_pfn_t         seg_mfn[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
+	unsigned int      seg_offs;
 
 	wait_queue_head_t   wq;
 	/* for barrier (drain) requests */
@@ -116,6 +131,7 @@ struct backend_info
 };
 
 extern unsigned int blkif_max_ring_page_order;
+extern unsigned int blkif_max_segs_per_req;
 
 blkif_t *blkif_alloc(domid_t domid);
 void blkif_disconnect(blkif_t *blkif);
--- sle11sp4.orig/drivers/xen/blkback/xenbus.c	2013-01-08 17:24:20.000000000 +0100
+++ sle11sp4/drivers/xen/blkback/xenbus.c	2014-01-31 14:20:38.000000000 +0100
@@ -329,6 +329,15 @@ static int blkback_probe(struct xenbus_d
 	if (err)
 		xenbus_dev_error(dev, err, "writing max-ring-page-order");
 
+	if (blkif_max_segs_per_req > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+		err = xenbus_printf(XBT_NIL, dev->nodename,
+				    "feature-max-indirect-segments", "%u",
+				    blkif_max_segs_per_req);
+		if (err)
+			xenbus_dev_error(dev, err,
+					 "writing feature-max-indirect-segments");
+	}
+
 	err = xenbus_switch_state(dev, XenbusStateInitWait);
 	if (err)
 		goto fail;
--- sle11sp4.orig/drivers/xen/blkback/interface.c	2013-06-07 10:52:01.000000000 +0200
+++ sle11sp4/drivers/xen/blkback/interface.c	2014-01-31 14:21:16.000000000 +0100
@@ -35,16 +35,52 @@
 #include <linux/kthread.h>
 #include <linux/vmalloc.h>
 
+#define BLKBK_REQ_SIZE(req, segs) /* bytes to allocate for "segs" segments */ \
+	(offsetof(typeof(*(req)), seg[segs]) <= PAGE_SIZE \
+	 ? offsetof(typeof(*(req)), seg[segs]) \
+	 : offsetof(typeof(*(req)), seg) + \
+	   PAGE_ALIGN((segs) * sizeof(*(req)->seg)))
+
 static struct kmem_cache *blkif_cachep;
 
 blkif_t *blkif_alloc(domid_t domid)
 {
 	blkif_t *blkif;
+	unsigned int size;
+	void *ptr;
 
 	blkif = kmem_cache_zalloc(blkif_cachep, GFP_KERNEL);
 	if (!blkif)
 		return ERR_PTR(-ENOMEM);
 
+	smp_rmb();
+	blkif->seg = kcalloc(blkif_max_segs_per_req, sizeof(*blkif->seg),
+			     GFP_KERNEL);
+	blkif->map = kcalloc(blkif_max_segs_per_req, sizeof(*blkif->map),
+			     GFP_KERNEL);
+	size = BLKBK_REQ_SIZE(blkif->req, blkif_max_segs_per_req);
+	ptr = alloc_pages_exact(size, GFP_KERNEL);
+	if (!blkif->seg || !blkif->map || !ptr) {
+		if (ptr)
+			free_pages_exact(ptr, size);
+		kfree(blkif->seg);
+		kfree(blkif->map);
+		kmem_cache_free(blkif_cachep, blkif);
+		return ERR_PTR(-ENOMEM);
+	}
+	if (size > PAGE_SIZE) {
+		struct blkif_request_segment *seg = ptr + PAGE_SIZE;
+		unsigned int i;
+
+		blkif->req = container_of(seg, struct blkbk_request, seg[0]);
+		for (i = 0; i < BLKIF_INDIRECT_PAGES(blkif_max_segs_per_req); ++i)
+			blkif->seg_mfn[i] = virt_to_mfn(ptr + (i + 1) * PAGE_SIZE);
+	} else {
+		blkif->req = ptr;
+		blkif->seg_mfn[0] = virt_to_mfn(ptr);
+		blkif->seg_offs = offsetof(struct blkbk_request, seg);
+	}
+
 	blkif->domid = domid;
 	spin_lock_init(&blkif->blk_ring_lock);
 	atomic_set(&blkif->refcnt, 1);
@@ -153,6 +189,11 @@ void blkif_free(blkif_t *blkif)
 {
 	if (!atomic_dec_and_test(&blkif->refcnt))
 		BUG();
+	free_pages_exact((void*)((unsigned long)blkif->req & PAGE_MASK),
+			 BLKBK_REQ_SIZE(blkif->req,
+					blkif_max_segs_per_req));
+	kfree(blkif->seg);
+	kfree(blkif->map);
 	kmem_cache_free(blkif_cachep, blkif);
 }
 
