From: jbeulich@suse.com
Subject: netback: backport from SLE12 SP1's xen3-patch-3.3
Patch-mainline: Never, SUSE-Xen specific
References: bsc#1056504

This is a preparatory step for "netback: coalesce (guest) RX SKBs as needed".

--- a/drivers/xen/netback/netback.c
+++ b/drivers/xen/netback/netback.c
@@ -51,6 +51,12 @@ struct netbk_rx_meta {
 	u8 copy:1;
 };
 
+struct netbk_tx_cb {
+	u16 copy_slots;
+	u16 pending_idx[1 + XEN_NETIF_NR_SLOTS_MIN];
+};
+#define netbk_tx_cb(skb) ((struct netbk_tx_cb *)(skb)->cb)
+
 struct netbk_tx_pending_inuse {
 	struct list_head list;
 	unsigned long alloc_time;
@@ -155,6 +161,8 @@ static struct sk_buff_head tx_queue;
 static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
 static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS];
 static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS];
+static gnttab_copy_t tx_copy_ops[2 * MAX_PENDING_REQS];
+static netif_tx_request_t tx_slots[XEN_NETIF_NR_SLOTS_MIN];
 
 static struct list_head net_schedule_list;
 static spinlock_t net_schedule_list_lock;
@@ -163,11 +171,19 @@ static spinlock_t net_schedule_list_lock
 static unsigned long mfn_list[MAX_MFN_ALLOC];
 static unsigned int alloc_index = 0;
 
+/*
+ * This is the maximum number of slots a TX request can have. If a guest
+ * sends a TX request which exceeds this limit it is considered malicious.
+ */
+static unsigned int max_tx_slots = XEN_NETIF_NR_SLOTS_MIN;
+module_param(max_tx_slots, uint, 0444);
+MODULE_PARM_DESC(max_tx_slots, "Maximum number of slots accepted in netfront TX requests");
+
 /* Setting this allows the safe use of this driver without netloop. */
-static int MODPARM_copy_skb = 1;
+static bool MODPARM_copy_skb = true;
 module_param_named(copy_skb, MODPARM_copy_skb, bool, 0);
 MODULE_PARM_DESC(copy_skb, "Copy data received from netfront without netloop");
-static int MODPARM_permute_returns = 0;
+static bool MODPARM_permute_returns;
 module_param_named(permute_returns, MODPARM_permute_returns, bool, S_IRUSR|S_IWUSR);
 MODULE_PARM_DESC(permute_returns, "Randomly permute the order in which TX responses are sent to the frontend");
 
@@ -1049,26 +1065,48 @@ static int netbk_count_requests(netif_t
 				netif_tx_request_t *txp, int work_to_do)
 {
 	RING_IDX cons = netif->tx.req_cons;
-	int frags = 0, drop_err = 0;
+	int slots = 0, drop_err = 0;
 
 	if (!(first->flags & XEN_NETTXF_more_data))
 		return 0;
 
 	do {
-		if (frags >= work_to_do) {
-			netdev_err(netif->dev, "Need more frags\n");
+		if (slots >= work_to_do) {
+			netdev_err(netif->dev, "Need more slots\n");
 			netbk_fatal_tx_err(netif);
 			return -ENODATA;
 		}
 
-		if (unlikely(frags >= MAX_SKB_FRAGS)) {
-			netdev_err(netif->dev, "Too many frags\n");
+		if (unlikely(slots >= max_tx_slots)) {
+			netdev_err(netif->dev, "Too many slots\n");
 			netbk_fatal_tx_err(netif);
 			return -E2BIG;
 		}
 
-		memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags),
-		       sizeof(*txp));
+		/*
+		 * The Xen network protocol had an implicit dependency on
+		 * MAX_SKB_FRAGS. XEN_NETIF_NR_SLOTS_MIN is set to the
+		 * historical MAX_SKB_FRAGS value 18 to honor the same
+		 * behavior as before. Any packet using more than 18 slots
+		 * but less than max_tx_slots slots is dropped.
+		 */
+		switch (slots) {
+		case 0 ... XEN_NETIF_NR_SLOTS_MIN - 1:
+			break;
+		case XEN_NETIF_NR_SLOTS_MIN:
+			if (net_ratelimit())
+				netdev_dbg(netif->dev,
+					   "slot count exceeding limit of %d, dropping packet\n",
+					   XEN_NETIF_NR_SLOTS_MIN);
+			if (!drop_err)
+				drop_err = -E2BIG;
+			/* fall through */
+		default:
+			--txp;
+			break;
+		}
+
+		*txp = *RING_GET_REQUEST(&netif->tx, cons + slots);
 		barrier();
 
 		/*
@@ -1088,7 +1126,7 @@ static int netbk_count_requests(netif_t
 		}
 
 		first->size -= txp->size;
-		frags++;
+		slots++;
 
 		if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
 			netdev_err(netif->dev, "txp->offset: %x, size: %u\n",
@@ -1099,30 +1137,77 @@ static int netbk_count_requests(netif_t
 	} while ((txp++)->flags & XEN_NETTXF_more_data);
 
 	if (drop_err) {
-		netbk_tx_err(netif, first, cons + frags);
+		netbk_tx_err(netif, first, cons + slots);
 		return drop_err;
 	}
 
-	return frags;
+	return slots;
+}
+
+struct netbk_tx_gop {
+	gnttab_map_grant_ref_t *map;
+	gnttab_copy_t *copy;
+	void *ptr;
+};
+
+static void netbk_fill_tx_copy(const netif_tx_request_t *txreq,
+			       struct netbk_tx_gop *gop, domid_t domid)
+{
+	gop->copy--;
+	gop->copy->source.u.ref = txreq->gref;
+	gop->copy->source.domid = domid;
+	gop->copy->source.offset = txreq->offset;
+	gop->copy->dest.u.gmfn = virt_to_mfn(gop->ptr);
+	gop->copy->dest.domid = DOMID_SELF;
+	gop->copy->dest.offset = offset_in_page(gop->ptr);
+	gop->copy->flags = GNTCOPY_source_gref;
+
+	if (gop->copy->dest.offset + txreq->size > PAGE_SIZE) {
+		unsigned int first = PAGE_SIZE - gop->copy->dest.offset;
+
+		gop->copy->len = first;
+		gop->ptr += first;
+
+		gop->copy--;
+		gop->copy->source = gop->copy[1].source;
+		gop->copy->source.offset += first;
+		gop->copy->dest.u.gmfn = virt_to_mfn(gop->ptr);
+		gop->copy->dest.domid = DOMID_SELF;
+		gop->copy->dest.offset = 0;
+		gop->copy->flags = GNTCOPY_source_gref;
+		gop->copy->len = txreq->size - first;
+	} else
+		gop->copy->len = txreq->size;
+
+	gop->ptr += gop->copy->len;
 }
 
-static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif,
-						  struct sk_buff *skb,
-						  netif_tx_request_t *txp,
-						  gnttab_map_grant_ref_t *mop)
+static void netbk_get_requests(netif_t *netif, struct sk_buff *skb,
+			netif_tx_request_t *txp, struct netbk_tx_gop *gop)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	skb_frag_t *frags = shinfo->frags;
-	u16 pending_idx = *(u16 *)skb->data;
+	u16 pending_idx = netbk_tx_cb(skb)->pending_idx[0];
 	int i, start;
 
 	/* Skip first skb fragment if it is on same page as header fragment. */
 	start = (frag_get_pending_idx(frags) == pending_idx);
 
+	for (i = 0; i < netbk_tx_cb(skb)->copy_slots; ++i, txp++) {
+		pending_idx = pending_ring[MASK_PEND_IDX(pending_cons++)];
+
+		netbk_fill_tx_copy(txp, gop, netif->domid);
+
+		pending_tx_info[pending_idx].req = *txp;
+		netif_get(netif);
+		pending_tx_info[pending_idx].netif = netif;
+		netbk_tx_cb(skb)->pending_idx[1 + i] = pending_idx;
+	}
+
 	for (i = start; i < shinfo->nr_frags; i++, txp++) {
 		pending_idx = pending_ring[MASK_PEND_IDX(pending_cons++)];
 
-		gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx),
+		gnttab_set_map_op(gop->map++, idx_to_kaddr(pending_idx),
 				  GNTMAP_host_map | GNTMAP_readonly,
 				  txp->gref, netif->domid);
 
@@ -1132,14 +1217,17 @@ static gnttab_map_grant_ref_t *netbk_get
 		frag_set_pending_idx(&frags[i], pending_idx);
 	}
 
-	return mop;
+	if ((void *)gop->map > (void *)gop->copy && net_ratelimit())
+		netdev_warn(netif->dev, "Grant op overrun (%p > %p)\n",
+			    gop->map, gop->copy);
 }
 
-static int netbk_tx_check_mop(struct sk_buff *skb,
-			       gnttab_map_grant_ref_t **mopp)
+static int netbk_tx_check_gop(struct sk_buff *skb,
+			      struct netbk_tx_gop *gop, bool hdr_copied)
 {
-	gnttab_map_grant_ref_t *mop = *mopp;
-	u16 pending_idx = *(u16 *)skb->data;
+	gnttab_copy_t *cop = gop->copy;
+	gnttab_map_grant_ref_t *mop = gop->map;
+	u16 pending_idx = netbk_tx_cb(skb)->pending_idx[0];
 	netif_t *netif = pending_tx_info[pending_idx].netif;
 	netif_tx_request_t *txp;
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
@@ -1147,8 +1235,18 @@ static int netbk_tx_check_mop(struct sk_
 	int i, err, start;
 
 	/* Check status of header. */
-	err = mop->status;
-	if (unlikely(err != GNTST_okay)) {
+	if (hdr_copied) {
+		err = (--cop)->status;
+		txp = &pending_tx_info[pending_idx].req;
+		if (txp->size > cop->len)
+			cmpxchg_local(&err, GNTST_okay, (--cop)->status);
+		make_tx_response(netif, txp,
+				 err == GNTST_okay ? XEN_NETIF_RSP_OKAY
+						   : XEN_NETIF_RSP_ERROR);
+		pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+		netif_put(netif);
+	} else if (unlikely((err = mop->status) != GNTST_okay)) {
+		++mop;
 		txp = &pending_tx_info[pending_idx].req;
 		make_tx_response(netif, txp, XEN_NETIF_RSP_ERROR);
 		pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
@@ -1156,19 +1254,34 @@ static int netbk_tx_check_mop(struct sk_
 	} else {
 		set_phys_to_machine(idx_to_pfn(pending_idx),
 			FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
-		grant_tx_handle[pending_idx] = mop->handle;
+		grant_tx_handle[pending_idx] = mop++->handle;
 	}
 
 	/* Skip first skb fragment if it is on same page as header fragment. */
 	start = (frag_get_pending_idx(shinfo->frags) == pending_idx);
 
-	for (i = start; i < nr_frags; i++) {
+	for (i = 0; i < netbk_tx_cb(skb)->copy_slots; ++i) {
+		int newerr = (--cop)->status;
+
+		pending_idx = netbk_tx_cb(skb)->pending_idx[1 + i];
+		txp = &pending_tx_info[pending_idx].req;
+		if (txp->size > cop->len)
+			cmpxchg_local(&newerr, GNTST_okay, (--cop)->status);
+		make_tx_response(netif, txp,
+				 newerr == GNTST_okay ? XEN_NETIF_RSP_OKAY
+						      : XEN_NETIF_RSP_ERROR);
+		cmpxchg_local(&err, GNTST_okay, newerr);
+		pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+		netif_put(netif);
+	}
+
+	for (i = start; i < nr_frags; i++, mop++) {
 		int j, newerr;
 
 		pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
 
 		/* Check error status: if okay then remember grant handle. */
-		newerr = (++mop)->status;
+		newerr = mop->status;
 		if (likely(newerr == GNTST_okay)) {
 			set_phys_to_machine(idx_to_pfn(pending_idx),
 				FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
@@ -1190,8 +1303,10 @@ static int netbk_tx_check_mop(struct sk_
 			continue;
 
 		/* First error: invalidate header and preceding fragments. */
-		pending_idx = *((u16 *)skb->data);
-		netif_idx_release(pending_idx);
+		if (!hdr_copied) {
+			pending_idx = netbk_tx_cb(skb)->pending_idx[0];
+			netif_idx_release(pending_idx);
+		}
 		for (j = start; j < i; j++) {
 			pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
 			netif_idx_release(pending_idx);
@@ -1201,7 +1316,11 @@ static int netbk_tx_check_mop(struct sk_
 		err = newerr;
 	}
 
-	*mopp = mop + 1;
+	gop->map = mop;
+	gop->copy = cop;
+	if ((void *)mop > (void *)cop && net_ratelimit())
+		netdev_warn(netif->dev, "Grant op check overrun (%p > %p)\n",
+			    mop, cop);
 	return err;
 }
 
@@ -1292,21 +1411,23 @@ static void net_tx_action(unsigned long
 {
 	struct sk_buff *skb;
 	netif_t *netif;
-	netif_tx_request_t txreq;
-	netif_tx_request_t txfrags[MAX_SKB_FRAGS];
+	netif_tx_request_t txreq, *txslot;
 	struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
 	u16 pending_idx;
 	RING_IDX i;
-	gnttab_map_grant_ref_t *mop;
+	struct netbk_tx_gop gop;
+	multicall_entry_t mcl[2];
 	unsigned int data_len;
 	int ret, work_to_do;
 
+	BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct netbk_tx_cb));
+
 	net_tx_action_dealloc();
 
-	mop = tx_map_ops;
-	BUILD_BUG_ON(MAX_SKB_FRAGS >= MAX_PENDING_REQS);
-	while (((NR_PENDING_REQS + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
-		!list_empty(&net_schedule_list)) {
+	gop.map = tx_map_ops;
+	gop.copy = tx_copy_ops + ARRAY_SIZE(tx_copy_ops);
+	while (NR_PENDING_REQS + XEN_NETIF_NR_SLOTS_MIN < MAX_PENDING_REQS
+	       && !list_empty(&net_schedule_list)) {
 		/* Get a netif from the list with work to do. */
 		netif = poll_net_schedule_list();
 		/*
@@ -1389,7 +1510,8 @@ static void net_tx_action(unsigned long
 				continue;
 		}
 
-		ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do);
+		txslot = tx_slots;
+		ret = netbk_count_requests(netif, &txreq, txslot, work_to_do);
 		if (unlikely(ret < 0))
 			continue;
 
@@ -1417,6 +1539,12 @@ static void net_tx_action(unsigned long
 		data_len = (txreq.size > PKT_PROT_LEN &&
 			    ret < MAX_SKB_FRAGS) ?
 			PKT_PROT_LEN : txreq.size;
+		while (ret > MAX_SKB_FRAGS ||
+		       (ret && (data_len + txslot->size <= PKT_PROT_LEN ||
+				netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB))) {
+			data_len += txslot++->size;
+			--ret;
+		}
 
 		skb = alloc_skb(data_len + 16 + NET_IP_ALIGN,
 				GFP_ATOMIC | __GFP_NOWARN);
@@ -1442,39 +1570,38 @@ static void net_tx_action(unsigned long
 			}
 		}
 
-		gnttab_set_map_op(mop, idx_to_kaddr(pending_idx),
-				  GNTMAP_host_map | GNTMAP_readonly,
-				  txreq.gref, netif->domid);
-		mop++;
-
 		memcpy(&pending_tx_info[pending_idx].req,
 		       &txreq, sizeof(txreq));
 		pending_tx_info[pending_idx].netif = netif;
-		*((u16 *)skb->data) = pending_idx;
+		netbk_tx_cb(skb)->pending_idx[0] = pending_idx;
+		netbk_tx_cb(skb)->copy_slots = txslot - tx_slots;
 
 		__skb_put(skb, data_len);
+		gop.ptr = skb->data;
 
 		skb_shinfo(skb)->nr_frags = ret;
-		if (data_len < txreq.size)
+		if (data_len < txreq.size) {
+			gnttab_set_map_op(gop.map++, idx_to_kaddr(pending_idx),
+					  GNTMAP_host_map | GNTMAP_readonly,
+					  txreq.gref, netif->domid);
 			skb_shinfo(skb)->nr_frags++;
-		else
+		} else {
+			netbk_fill_tx_copy(&txreq, &gop, netif->domid);
 			pending_idx = INVALID_PENDING_IDX;
+		}
 		frag_set_pending_idx(skb_shinfo(skb)->frags, pending_idx);
 
 		__skb_queue_tail(&tx_queue, skb);
 
 		pending_cons++;
 
-		mop = netbk_get_requests(netif, skb, txfrags, mop);
+		netbk_get_requests(netif, skb, tx_slots, &gop);
 
 		netif->tx.req_cons = i;
 		netif_schedule_work(netif);
-
-		if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
-			break;
 	}
 
-	if (mop == tx_map_ops)
+	if (skb_queue_empty(&tx_queue))
 		goto out;
 
     /* NOTE: some maps may fail with GNTST_eagain, which could be successfully
@@ -1482,22 +1609,28 @@ static void net_tx_action(unsigned long
      * req and let the frontend resend the relevant packet again. This is fine
      * because it is unlikely that a network buffer will be paged out or shared,
      * and therefore it is unlikely to fail with GNTST_eagain. */
-	ret = HYPERVISOR_grant_table_op(
-		GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops);
-	BUG_ON(ret);
+	MULTI_grant_table_op(&mcl[0], GNTTABOP_copy, gop.copy,
+			     tx_copy_ops + ARRAY_SIZE(tx_copy_ops) - gop.copy);
+	MULTI_grant_table_op(&mcl[1], GNTTABOP_map_grant_ref,
+			     tx_map_ops, gop.map - tx_map_ops);
+	if (HYPERVISOR_multicall_check(mcl, 2, NULL))
+		BUG();
 
-	mop = tx_map_ops;
+	gop.map = tx_map_ops;
+	gop.copy = tx_copy_ops + ARRAY_SIZE(tx_copy_ops);
 	while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
 		struct net_device *dev;
 		netif_tx_request_t *txp;
 
-		pending_idx = *((u16 *)skb->data);
+		pending_idx = netbk_tx_cb(skb)->pending_idx[0];
 		netif       = pending_tx_info[pending_idx].netif;
 		dev         = netif->dev;
 		txp         = &pending_tx_info[pending_idx].req;
+		data_len    = skb->len;
 
-		/* Check the remap error code. */
-		if (unlikely(netbk_tx_check_mop(skb, &mop))) {
+		/* Check the remap/copy error code. */
+		if (unlikely(netbk_tx_check_gop(skb, &gop,
+						data_len >= txp->size))) {
 			netdev_dbg(dev, "netback grant failed.\n");
 			skb_shinfo(skb)->nr_frags = 0;
 			kfree_skb(skb);
@@ -1505,17 +1638,13 @@ static void net_tx_action(unsigned long
 			continue;
 		}
 
-		data_len = skb->len;
-		memcpy(skb->data,
-		       (void *)(idx_to_kaddr(pending_idx)|txp->offset),
-		       data_len);
 		if (data_len < txp->size) {
+			memcpy(skb->data,
+			       (void *)(idx_to_kaddr(pending_idx) + txp->offset),
+			       data_len);
 			/* Append the packet payload as a fragment. */
 			txp->offset += data_len;
 			txp->size -= data_len;
-		} else {
-			/* Schedule a response immediately. */
-			netif_idx_release(pending_idx);
 		}
 
 		if (txp->flags & XEN_NETTXF_csum_blank)
@@ -1547,15 +1676,6 @@ static void net_tx_action(unsigned long
 			continue;
 		}
 
-		if (unlikely(netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) &&
-		    unlikely(skb_linearize(skb))) {
-			netdev_dbg(dev,
-			           "Can't linearize skb in net_tx_action.\n");
-			kfree_skb(skb);
-			dev->stats.rx_dropped++;
-			continue;
-		}
-
 		dev->stats.rx_bytes += skb->len;
 		dev->stats.rx_packets++;
 
@@ -1712,6 +1832,13 @@ static int __init netback_init(void)
 	if (!is_running_on_xen())
 		return -ENODEV;
 
+	BUILD_BUG_ON(XEN_NETIF_NR_SLOTS_MIN >= MAX_PENDING_REQS);
+	if (max_tx_slots < XEN_NETIF_NR_SLOTS_MIN) {
+		pr_info("netback: max_tx_slots too small (%u), using XEN_NETIF_NR_SLOTS_MIN (%d)\n",
+			max_tx_slots, XEN_NETIF_NR_SLOTS_MIN);
+		max_tx_slots = XEN_NETIF_NR_SLOTS_MIN;
+	}
+
 	/* We can increase reservation by this much in net_rx_action(). */
 	balloon_update_driver_allowance(NET_RX_RING_SIZE);
 
