From: Dongxiao Xu <dongxiao.xu@intel.com>
Subject: Use Kernel thread to replace the tasklet.
Patch-mainline: Never, SUSE-Xen specific

 A kernel thread gives more control over QoS, and could improve
 dom0's userspace responsiveness.

Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>

Subject2: xen: ensure locking gnttab_copy_grant_page is safe against interrupts.

Now that netback processing occurs in a thread instead of a tasklet,
gnttab_copy_grant_page needs to be safe against interrupts.

The code is currently commented out in this tree, but on 2.6.18 we observed a
deadlock where the netback thread called gnttab_copy_grant_page, locked
gnttab_dma_lock for writing, and was interrupted; on return from interrupt the
network stack's TX tasklet ended up calling __gnttab_dma_map_page via the
hardware driver->swiotlb and tried to take gnttab_dma_lock for reading.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: "Xu, Dongxiao" <dongxiao.xu@intel.com>

Subject3: Add a missing test to tx_work_todo.

Add a test so that, when netback is using worker threads, net_tx_action()
gets called in a timely manner when the pending_inuse list is populated.

Signed-off-by: Paul Durrant <paul.durrant@citrix.com>

jb: changed write_seq{,un}lock_irq() to write_seq{,un}lock_bh(), and
    made the use of kernel threads optional (but default)
Acked-by: jbeulich@novell.com

--- a/drivers/xen/core/gnttab.c
+++ b/drivers/xen/core/gnttab.c
@@ -659,14 +659,14 @@ int gnttab_copy_grant_page(grant_ref_t r
 	pfn = page_to_pfn(page);
 	new_mfn = virt_to_mfn(new_addr);
 
-	write_seqlock(&gnttab_dma_lock);
+	write_seqlock_bh(&gnttab_dma_lock);
 
 	/* Make seq visible before checking page_mapped. */
 	smp_mb();
 
 	/* Has the page been DMA-mapped? */
 	if (unlikely(page_mapped(page))) {
-		write_sequnlock(&gnttab_dma_lock);
+		write_sequnlock_bh(&gnttab_dma_lock);
 		put_page(new_page);
 		err = -EBUSY;
 		goto out;
@@ -697,7 +697,7 @@ int gnttab_copy_grant_page(grant_ref_t r
 	BUG_ON(err);
 	BUG_ON(unmap.status != GNTST_okay);
 
-	write_sequnlock(&gnttab_dma_lock);
+	write_sequnlock_bh(&gnttab_dma_lock);
 
 	new_page->mapping = page->mapping;
 	new_page->index = page->index;
--- a/drivers/xen/netback/common.h
+++ b/drivers/xen/netback/common.h
@@ -270,6 +270,9 @@ struct xen_netbk {
 		netif_tx_request_t slots[XEN_NETIF_NR_SLOTS_MIN];
 	} tx;
 
+	wait_queue_head_t action_wq;
+	struct task_struct *task;
+
 	struct xen_netbk_rx {
 		struct sk_buff_head queue;
 		struct tasklet_struct tasklet;
--- a/drivers/xen/netback/netback.c
+++ b/drivers/xen/netback/netback.c
@@ -36,6 +36,7 @@
 
 #include "common.h"
 #include <linux/if_vlan.h>
+#include <linux/kthread.h>
 #include <linux/pfn.h>
 #include <linux/vmalloc.h>
 #include <net/tcp.h>
@@ -49,6 +50,8 @@
 
 struct xen_netbk *__read_mostly xen_netbk;
 unsigned int __read_mostly netbk_nr_groups;
+static bool __read_mostly use_kthreads = true;
+static bool __initdata bind_threads;
 
 #define GET_GROUP_INDEX(netif) ((netif)->group)
 
@@ -169,7 +172,11 @@ static bool MODPARM_permute_returns;
 module_param_named(permute_returns, MODPARM_permute_returns, bool, S_IRUSR|S_IWUSR);
 MODULE_PARM_DESC(permute_returns, "Randomly permute the order in which TX responses are sent to the frontend");
 module_param_named(groups, netbk_nr_groups, uint, 0);
-MODULE_PARM_DESC(groups, "Specify the number of tasklet pairs to use");
+MODULE_PARM_DESC(groups, "Specify the number of tasklet pairs/threads to use");
+module_param_named(tasklets, use_kthreads, invbool, 0);
+MODULE_PARM_DESC(tasklets, "Use tasklets instead of kernel threads");
+module_param_named(bind, bind_threads, bool, 0);
+MODULE_PARM_DESC(bind, "Bind kernel threads to (v)CPUs");
 
 int netbk_copy_skb_mode;
 
@@ -226,6 +233,22 @@ static void flush_notify_list(netif_t *l
 		BUG();
 }
 
+static void netbk_rx_schedule(struct xen_netbk_rx *netbk)
+{
+	if (use_kthreads)
+		wake_up(&container_of(netbk, struct xen_netbk, rx)->action_wq);
+	else
+		tasklet_schedule(&netbk->tasklet);
+}
+
+static void netbk_tx_schedule(struct xen_netbk *netbk)
+{
+	if (use_kthreads)
+		wake_up(&netbk->action_wq);
+	else
+		tasklet_schedule(&netbk->tx.tasklet);
+}
+
 static inline void maybe_schedule_tx_action(netif_t *netif)
 {
 	struct xen_netbk *netbk = &xen_netbk[GET_GROUP_INDEX(netif)];
@@ -233,7 +256,7 @@ static inline void maybe_schedule_tx_act
 	smp_mb();
 	if ((nr_pending_reqs(netbk) < (MAX_PENDING_REQS/2)) &&
 	    !list_empty(&netbk->tx.schedule_list))
-		tasklet_schedule(&netbk->tx.tasklet);
+		netbk_tx_schedule(netbk);
 }
 
 static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
@@ -419,7 +442,7 @@ int netif_be_start_xmit(struct sk_buff *
 
 	netbk = &xen_netbk[group].rx;
 	skb_queue_tail(&netbk->queue, skb);
-	tasklet_schedule(&netbk->tasklet);
+	netbk_rx_schedule(netbk);
 
 	return NETDEV_TX_OK;
 
@@ -867,8 +890,11 @@ static void net_rx_action(unsigned long
 
 		if (netif_queue_stopped(netif->dev) &&
 		    netif_schedulable(netif) &&
-		    !netbk_queue_full(netif))
+		    !netbk_queue_full(netif)) {
+			local_bh_disable();
 			netif_wake_queue(netif->dev);
+			local_bh_enable();
+		}
 
 		if (ret && netif != notify_tail && !netif->rx_notify_link) {
 			if (notify_tail)
@@ -891,7 +917,7 @@ static void net_rx_action(unsigned long
 	/* More work to do? */
 	if (!skb_queue_empty(&netbk->queue) &&
 	    !timer_pending(&netbk->timer))
-		tasklet_schedule(&netbk->tasklet);
+		netbk_rx_schedule(netbk);
 #if 0
 	else
 		xen_network_done_notify();
@@ -900,12 +926,12 @@ static void net_rx_action(unsigned long
 
 static void net_alarm(unsigned long group)
 {
-	tasklet_schedule(&xen_netbk[group].rx.tasklet);
+	netbk_rx_schedule(&xen_netbk[group].rx);
 }
 
 static void netbk_tx_pending_timeout(unsigned long group)
 {
-	tasklet_schedule(&xen_netbk[group].tx.tasklet);
+	netbk_tx_schedule(&xen_netbk[group]);
 }
 
 static int __on_net_schedule_list(netif_t *netif)
@@ -1814,7 +1840,10 @@ static void net_tx_action(unsigned long
 		dev->stats.rx_bytes += skb->len;
 		dev->stats.rx_packets++;
 
-		netif_rx(skb);
+		if (use_kthreads)
+			netif_rx_ni(skb);
+		else
+			netif_rx(skb);
 	}
 
 	if (gop.notify.head)
@@ -1844,7 +1873,7 @@ static void netif_idx_release(struct xen
 	netbk->tx.dealloc_prod++;
 	spin_unlock_irqrestore(&netbk->tx.release_lock, flags);
 
-	tasklet_schedule(&netbk->tx.tasklet);
+	netbk_tx_schedule(netbk);
 }
 
 static void netif_page_release(struct page *page, unsigned int order)
@@ -1991,6 +2020,50 @@ static struct irqaction netif_be_dbg_act
 };
 #endif
 
+static inline int rx_work_todo(struct xen_netbk *netbk)
+{
+	return !skb_queue_empty(&netbk->rx.queue);
+}
+
+static inline int tx_work_todo(struct xen_netbk *netbk)
+{
+	if (netbk->tx.dealloc_cons != netbk->tx.dealloc_prod)
+		return 1;
+
+	if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
+	    !list_empty(&netbk->tx.pending_inuse_head))
+		return 1;
+
+	if (nr_pending_reqs(netbk) + XEN_NETIF_NR_SLOTS_MIN < MAX_PENDING_REQS
+	    && !list_empty(&netbk->tx.schedule_list))
+		return 1;
+
+	return 0;
+}
+
+static int netbk_action_thread(void *index)
+{
+	unsigned long group = (unsigned long)index;
+	struct xen_netbk *netbk = &xen_netbk[group];
+
+	while (!kthread_should_stop()) {
+		wait_event_interruptible(netbk->action_wq,
+					 rx_work_todo(netbk) ||
+					 tx_work_todo(netbk) ||
+					 kthread_should_stop());
+		cond_resched();
+
+		if (rx_work_todo(netbk))
+			net_rx_action(group);
+
+		if (tx_work_todo(netbk))
+			net_tx_action(group);
+	}
+
+	return 0;
+}
+
+
 static int __init netback_init(void)
 {
 	unsigned int i, group;
@@ -2028,9 +2101,6 @@ static int __init netback_init(void)
 	for (group = 0; group < netbk_nr_groups; group++) {
 		struct xen_netbk *netbk = &xen_netbk[group];
 
-		tasklet_init(&netbk->tx.tasklet, net_tx_action, group);
-		tasklet_init(&netbk->rx.tasklet, net_rx_action, group);
-
 		skb_queue_head_init(&netbk->rx.queue);
 		skb_queue_head_init(&netbk->tx.queue);
 
@@ -2066,6 +2136,26 @@ static int __init netback_init(void)
 			netbk->tx.pending_ring[i] = i;
 			INIT_LIST_HEAD(&netbk->tx.pending_inuse[i].list);
 		}
+
+		if (use_kthreads) {
+			init_waitqueue_head(&netbk->action_wq);
+			netbk->task = kthread_create(netbk_action_thread,
+						     (void *)(long)group,
+						     "netback/%u", group);
+
+			if (IS_ERR(netbk->task)) {
+				pr_err("netback: kthread_create() failed\n");
+				rc = PTR_ERR(netbk->task);
+				goto failed_init;
+			}
+			if (bind_threads)
+				kthread_bind(netbk->task,
+					     group % num_online_cpus());
+			wake_up_process(netbk->task);
+		} else {
+			tasklet_init(&netbk->tx.tasklet, net_tx_action, group);
+			tasklet_init(&netbk->rx.tasklet, net_rx_action, group);
+		}
 	}
 
 	netbk_copy_skb_mode = NETBK_DONT_COPY_SKB;
@@ -2092,12 +2182,15 @@ static int __init netback_init(void)
 	return 0;
 
 failed_init:
-	while (group-- > 0) {
+	do {
 		struct xen_netbk *netbk = &xen_netbk[group];
 
-		free_empty_pages_and_pagevec(netbk->tx.mmap_pages,
-					     MAX_PENDING_REQS);
-	}
+		if (use_kthreads && netbk->task && !IS_ERR(netbk->task))
+			kthread_stop(netbk->task);
+		if (netbk->tx.mmap_pages)
+			free_empty_pages_and_pagevec(netbk->tx.mmap_pages,
+						     MAX_PENDING_REQS);
+	} while (group--);
 	vfree(xen_netbk);
 	balloon_update_driver_allowance(-(long)netbk_nr_groups
 					* NET_RX_RING_SIZE);
