From: jbeulich@suse.com
Subject: netback: coalesce (guest) RX SKBs as needed
Patch-mainline: Never, SUSE-Xen specific
References: bsc#1056504

--- a/drivers/xen/netback/common.h
+++ b/drivers/xen/netback/common.h
@@ -97,6 +97,7 @@ typedef struct netif_st {
 
 	/* Statistics */
 	unsigned long nr_copied_skbs;
+	unsigned long nr_coalesced_skbs;
 	unsigned long rx_gso_csum_fixups;
 
 	/* Miscellaneous private stuff. */
@@ -230,8 +231,8 @@ typedef unsigned int pending_ring_idx_t;
 struct netbk_rx_meta {
 	skb_frag_t frag;
 	u16 id;
-	u8 copy:2;
-	u8 tail:1;
+	u8 copy;
+	u8 tail;
 };
 
 struct netbk_tx_pending_inuse {
@@ -280,7 +281,14 @@ struct xen_netbk {
 		unsigned int alloc_index;
 		struct multicall_entry mcl[NET_RX_RING_SIZE+3];
 		struct mmu_update mmu[NET_RX_RING_SIZE];
-		struct gnttab_copy grant_copy_op[2 * NET_RX_RING_SIZE];
+		/*
+		 * An skb may have a maximal number of frags but still be
+		 * small in overall size. Thus the worst case number of copy
+		 * operations is one more (for the head) than MAX_SKB_FRAGS
+		 * per ring slot.
+		 */
+		struct gnttab_copy grant_copy_op[(MAX_SKB_FRAGS + 1) *
+						 NET_RX_RING_SIZE];
 		struct gnttab_transfer grant_trans_op;
 		struct netbk_rx_meta meta[NET_RX_RING_SIZE];
 		unsigned long mfn_list[MAX_MFN_ALLOC];
--- a/drivers/xen/netback/interface.c
+++ b/drivers/xen/netback/interface.c
@@ -148,6 +148,7 @@ static const struct netif_stat {
 	u16 offset;
 } netbk_stats[] = {
 	{ "copied_skbs", offsetof(netif_t, nr_copied_skbs) / sizeof(long) },
+	{ "coalesced_skbs", offsetof(netif_t, nr_coalesced_skbs) / sizeof(long) },
 	{ "rx_gso_csum_fixups", offsetof(netif_t, rx_gso_csum_fixups) / sizeof(long) },
 };
 
--- a/drivers/xen/netback/netback.c
+++ b/drivers/xen/netback/netback.c
@@ -58,6 +58,7 @@ static bool __initdata bind_threads;
 struct netbk_rx_cb {
 	unsigned int nr_frags;
 	unsigned int nr_slots;
+	bool coalesce;
 };
 #define netbk_rx_cb(skb) ((struct netbk_rx_cb *)skb->cb)
 
@@ -168,6 +169,9 @@ MODULE_PARM_DESC(max_tx_slots, "Maximum
 static bool MODPARM_copy_skb = true;
 module_param_named(copy_skb, MODPARM_copy_skb, bool, 0);
 MODULE_PARM_DESC(copy_skb, "Copy data received from netfront without netloop");
+static int always_coalesce = -1;
+module_param(always_coalesce, bool, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(always_coalesce, "Always fully coalesce RX data");
 static bool MODPARM_permute_returns;
 module_param_named(permute_returns, MODPARM_permute_returns, bool, S_IRUSR|S_IWUSR);
 MODULE_PARM_DESC(permute_returns, "Randomly permute the order in which TX responses are sent to the frontend");
@@ -382,7 +386,7 @@ static unsigned int netbk_count_slots(co
 int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	netif_t *netif = netdev_priv(dev);
-	unsigned int group = GET_GROUP_INDEX(netif);
+	unsigned int group = GET_GROUP_INDEX(netif), slots;
 	struct xen_netbk_rx *netbk;
 
 	BUG_ON(skb->dev != dev);
@@ -398,12 +402,21 @@ int netif_be_start_xmit(struct sk_buff *
 
 	/*
 	 * Copy the packet here if it's destined for a flipping interface
-	 * but isn't flippable (e.g. extra references to data).
-	 * XXX For now we also copy skbuffs whose head crosses a page
-	 * boundary, because netbk_gop_skb can't handle them.
+	 * but isn't flippable (e.g. extra references to data; XXX for now,
+	 * copy always when flipping).
+	 * XXX For now we also copy skbuffs whose head exceeds PAGE_SIZE,
+	 * because netbk_gop_skb can't handle them.
 	 */
-	if (!netif->copying_receiver ||
-	    ((skb_headlen(skb) + offset_in_page(skb->data)) >= PAGE_SIZE)) {
+	netbk_rx_cb(skb)->coalesce = false;
+	if (netif->copying_receiver && skb_headlen(skb) <= PAGE_SIZE) {
+		slots = 1 + netbk_count_slots(skb_shinfo(skb), true);
+		if (always_coalesce || slots >= XEN_NETIF_NR_SLOTS_MIN ||
+		    offset_in_page(skb->data) + skb_headlen(skb) > PAGE_SIZE) {
+			netbk_rx_cb(skb)->coalesce = true;
+			netif->nr_coalesced_skbs++;
+			slots = PFN_UP(skb->len);
+		}
+	} else {
 		struct sk_buff *nskb = netbk_copy_skb(skb);
 		if ( unlikely(nskb == NULL) )
 			goto drop;
@@ -412,12 +425,11 @@ int netif_be_start_xmit(struct sk_buff *
 		nskb->ip_summed = skb->ip_summed;
 		dev_kfree_skb(skb);
 		skb = nskb;
+		slots = 1 + netbk_count_slots(skb_shinfo(nskb), false);
 	}
 
 	netbk_rx_cb(skb)->nr_frags = skb_shinfo(skb)->nr_frags;
-	netbk_rx_cb(skb)->nr_slots = 1 + !!skb_is_gso(skb) +
-				     netbk_count_slots(skb_shinfo(skb),
-						       netif->copying_receiver);
+	netbk_rx_cb(skb)->nr_slots = slots + !!skb_is_gso(skb);
 	netif->rx_req_cons_peek += netbk_rx_cb(skb)->nr_slots;
 	netif_get(netif);
 
@@ -484,6 +496,79 @@ struct netrx_pending_operations {
 	struct netbk_rx_meta *meta;
 };
 
+static void set_copy_source(gnttab_copy_t *cop, struct page *page)
+{
+	struct xen_netbk *netbk;
+	unsigned int idx;
+
+	netif_get_page_ext(page, netbk, idx);
+	if (netbk) {
+		struct pending_tx_info *src_pend;
+		unsigned int grp;
+
+		src_pend = &netbk->tx.pending_info[idx];
+		grp = GET_GROUP_INDEX(src_pend->netif);
+		BUG_ON(netbk != &xen_netbk[grp] && grp != UINT_MAX);
+		cop->source.domid = src_pend->netif->domid;
+		cop->source.u.ref = src_pend->req.gref;
+		cop->flags |= GNTCOPY_source_gref;
+	} else {
+		cop->source.domid = DOMID_SELF;
+		cop->source.u.gmfn = pfn_to_mfn(page_to_pfn(page));
+	}
+}
+
+struct netrx_coalesce {
+	netif_rx_request_t *req;
+	struct netbk_rx_meta *meta;
+	unsigned int nr;
+};
+
+/* Set up coalescing grant operations for this fragment. */
+static void netbk_coalesce_frag(netif_t *netif,
+				struct netrx_pending_operations *npo,
+				struct netrx_coalesce *rxc,
+				struct page *page, unsigned int size,
+				unsigned int offset)
+{
+	struct netbk_rx_meta *meta = rxc->meta;
+
+	while (size) {
+		gnttab_copy_t *cop;
+		unsigned int limit;
+
+		if (meta->frag.size == PAGE_SIZE) {
+			rxc->req = RING_GET_REQUEST(&netif->rx,
+						    netif->rx.req_cons + rxc->nr++);
+			meta = npo->meta + npo->meta_prod++;
+			meta->frag.size = 0;
+			meta->copy = 0;
+			meta->tail = 0;
+			meta->id = rxc->req->id;
+		}
+		meta->copy++;
+		cop = npo->copy + npo->copy_prod++;
+		cop->flags = GNTCOPY_dest_gref;
+		set_copy_source(cop, page);
+		cop->source.offset = offset;
+		cop->dest.domid = netif->domid;
+		cop->dest.offset = meta->frag.size;
+		cop->dest.u.ref = rxc->req->gref;
+		limit = PAGE_SIZE - max_t(unsigned int, offset,
+					  meta->frag.size);
+		cop->len = min(size, limit);
+		size -= cop->len;
+		meta->frag.size += cop->len;
+		if ((offset += cop->len) == PAGE_SIZE) {
+			++page;
+			offset = 0;
+		}
+	}
+
+	meta->tail++;
+	rxc->meta = meta;
+}
+
 /* Set up the grant operations for this fragment.  If it's a flipping
    interface, we also set up the unmap request from here. */
 static void netbk_gop_frag(netif_t *netif, struct netbk_rx_meta *meta,
@@ -500,29 +585,12 @@ static void netbk_gop_frag(netif_t *neti
 
 	req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i);
 	if (netif->copying_receiver) {
-		struct xen_netbk *netbk;
-		unsigned int idx;
-
 		/* The fragment needs to be copied rather than
 		   flipped. */
 		meta->copy++;
 		copy_gop = npo->copy + npo->copy_prod++;
 		copy_gop->flags = GNTCOPY_dest_gref;
-		netif_get_page_ext(page, netbk, idx);
-		if (netbk) {
-			struct pending_tx_info *src_pend;
-			unsigned int grp;
-
-			src_pend = &netbk->tx.pending_info[idx];
-			grp = GET_GROUP_INDEX(src_pend->netif);
-			BUG_ON(netbk != &xen_netbk[grp] && grp != UINT_MAX);
-			copy_gop->source.domid = src_pend->netif->domid;
-			copy_gop->source.u.ref = src_pend->req.gref;
-			copy_gop->flags |= GNTCOPY_source_gref;
-		} else {
-			copy_gop->source.domid = DOMID_SELF;
-			copy_gop->source.u.gmfn = pfn_to_mfn(page_to_pfn(page));
-		}
+		set_copy_source(copy_gop, page);
 		copy_gop->source.offset = offset;
 		copy_gop->dest.domid = netif->domid;
 		copy_gop->dest.offset = i ? meta->frag.size : 0;
@@ -569,9 +637,42 @@ static unsigned int netbk_gop_skb(struct
 
 	head_meta = npo->meta + npo->meta_prod++;
 	head_meta->frag.page_offset = skb_shinfo(skb)->gso_type;
-	head_meta->frag.size = skb_shinfo(skb)->gso_size;
 	head_meta->copy = 0;
-	n = !!head_meta->frag.size + 1;
+	n = !!skb_shinfo(skb)->gso_size + 1;
+
+	if (netbk_rx_cb(skb)->coalesce) {
+		struct netrx_coalesce rxc = {
+			.req = RING_GET_REQUEST(&netif->rx,
+						netif->rx.req_cons),
+			.meta = head_meta, .nr = n
+		};
+
+		head_meta->id = rxc.req->id;
+		head_meta->frag.size = 0;
+		head_meta->tail = 0;
+		netbk_coalesce_frag(netif, npo, &rxc,
+				    virt_to_page(skb->data), skb_headlen(skb),
+				    offset_in_page(skb->data));
+		/* Head must not exceed PAGE_SIZE. */
+		BUG_ON(rxc.meta != head_meta);
+
+		for (i = 0; i < nr_frags; i++) {
+			const skb_frag_t *frag = skb_shinfo(skb)->frags + i;
+			unsigned int offset = frag->page_offset;
+
+			netbk_coalesce_frag(netif, npo, &rxc,
+					    skb_frag_page(frag) +
+						PFN_DOWN(offset),
+					    skb_frag_size(frag),
+					    offset & ~PAGE_MASK);
+		}
+
+		head_meta->frag.size = skb_shinfo(skb)->gso_size;
+		netif->rx.req_cons += rxc.nr;
+		return rxc.nr;
+	}
+
+	head_meta->frag.size = skb_shinfo(skb)->gso_size;
 
 	for (i = 0; i < nr_frags; i++, n++) {
 		const skb_frag_t *frag = skb_shinfo(skb)->frags + i;
@@ -712,8 +813,8 @@ static unsigned int netbk_add_frag_respo
 {
 	unsigned int i, n;
 
-	for (n = i = 0; i < nr_frags; meta++, n++) {
-		int flags = (meta->tail && ++i == nr_frags)
+	for (i = meta[-1].tail - 1, n = 0; i < nr_frags; meta++, n++) {
+		int flags = ((i += meta->tail) == nr_frags)
 			    ? 0 : XEN_NETRXF_more_data;
 
 		make_rx_response(netif, meta->id, status,
@@ -846,7 +947,8 @@ static void net_rx_action(unsigned long
 		skb->dev->stats.tx_packets++;
 
 		id = netbk->meta[npo.meta_cons].id;
-		flags = nr_frags ? XEN_NETRXF_more_data : 0;
+		flags = 1 + nr_frags - netbk->meta[npo.meta_cons].tail
+			? XEN_NETRXF_more_data : 0;
 
 		switch (skb->ip_summed) {
 		case CHECKSUM_PARTIAL: /* local packet? */
@@ -863,7 +965,11 @@ static void net_rx_action(unsigned long
 		else
 			offset = offset_in_page(skb->data);
 		resp = make_rx_response(netif, id, status, offset,
-					skb_headlen(skb), flags);
+					netbk_rx_cb(skb)->coalesce
+				        ? min_t(unsigned int, skb->len,
+						PAGE_SIZE)
+				        : skb_headlen(skb),
+					flags);
 
 		if (netbk->meta[npo.meta_cons].frag.size) {
 			struct netif_extra_info *gso =
@@ -2080,6 +2186,10 @@ static int __init netback_init(void)
 		max_tx_slots = XEN_NETIF_NR_SLOTS_MIN;
 	}
 
+	if (always_coalesce < 0)
+		always_coalesce = HYPERVISOR_xen_version(XENVER_version, NULL)
+				  >= 0x00040006;
+
 	group = netbk_nr_groups;
 	if (!netbk_nr_groups)
 		netbk_nr_groups = (num_online_cpus() + 1) / 2;
