From: jbeulich@suse.com
Subject: blktap: refine mm tracking
Patch-mainline: Never, SUSE-Xen specific
References: bsc#952976

As already noted in c/s 1013:eb21d96a6aae ("xen/blktap: fix cleanup
after unclean application exit") and 1015:2a4b455b1fba ("blktap: fix
cleanup after unclean application exit #2"), the extra reference
obtained on tapdisk's mm (from c/s 867:978499ee4f39 ["linux/blktap: fix
vma_close() for partial munmap"]) is problematic. However, while the
two c/s fixed what they claim to, the reconnect case got broken
(tap_blkif_schedule() clearing init->mm left no way for it to get set
again).

Do away with the extra reference: The mm of interest can't go away as
long as we have a VMA in it. To take care of VMA splitting, track the
amount of outstanding mapped space, and zap info->mm when that value
drops to zero.

At the same time also fix the oversight in the first of the mentioned
c/s of not keeping ring_ok up to date: The zapping of info->mm without
clearing info->ring_ok led to NULL pointer accesses in down_read()
called from dispatch_rw_block_io(). We don't really need the extra
flag, we can use info->mm for that purpose.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/drivers/xen/blktap/blktap.c
+++ b/drivers/xen/blktap/blktap.c
@@ -104,10 +104,10 @@ typedef struct domid_translate_ext {
 typedef struct tap_blkif {
 	struct mm_struct *mm;         /*User address space                   */
 	unsigned long rings_vstart;   /*Kernel memory mapping                */
+	unsigned long rings_total;    /*Kernel memory mapping size           */
 	unsigned long user_vstart;    /*User memory mapping                  */
 	unsigned long dev_inuse;      /*One process opens device at a time.  */
 	unsigned long dev_pending;    /*In process of being opened           */
-	unsigned long ring_ok;        /*make this ring->state                */
 	blkif_front_ring_t ufe_ring;  /*Rings up to user space.              */
 	wait_queue_head_t wait;       /*for poll                             */
 	unsigned long mode;           /*current switching mode               */
@@ -407,12 +407,22 @@ static void blktap_vma_open(struct vm_ar
  */
 static void blktap_vma_close(struct vm_area_struct *vma)
 {
+	tap_blkif_t *info;
 	struct vm_area_struct *next = vma->vm_next;
 
+	if (vma->vm_file == NULL)
+		return;
+
+	info = vma->vm_file->private_data;
+	info->rings_total -= vma->vm_end - vma->vm_start;
+	if (info->rings_total == 0) {
+		info->mm = NULL;
+		return;
+	}
+
 	if (next == NULL ||
 	    vma->vm_ops != next->vm_ops ||
 	    vma->vm_end != next->vm_start ||
-	    vma->vm_file == NULL ||
 	    vma->vm_file != next->vm_file)
 		return;
 
@@ -546,7 +556,6 @@ void signal_tapdisk(int idx)
 {
 	tap_blkif_t *info;
 	struct task_struct *ptask;
-	struct mm_struct *mm;
 
 	/*
 	 * if the userland tools set things up wrong, this could be negative;
@@ -566,10 +575,7 @@ void signal_tapdisk(int idx)
 			info->status = CLEANSHUTDOWN;
 	}
 	info->blkif = NULL;
-
-	mm = xchg(&info->mm, NULL);
-	if (mm)
-		mmput(mm);
+	info->mm = NULL;
 }
 
 static int blktap_open(struct inode *inode, struct file *filp)
@@ -641,19 +647,16 @@ static int blktap_open(struct inode *ino
 static int blktap_release(struct inode *inode, struct file *filp)
 {
 	tap_blkif_t *info = filp->private_data;
-	struct mm_struct *mm;
 	
 	/* check for control device */
 	if (!info)
 		return 0;
 
-	info->ring_ok = 0;
+	info->mm = NULL;
 	smp_wmb();
 	info->rings_vstart = 0;
+	info->rings_total = 0;
 
-	mm = xchg(&info->mm, NULL);
-	if (mm)
-		mmput(mm);
 	kfree(info->foreign_maps->map);
 	kfree(info->foreign_maps);
 	info->foreign_maps = NULL;
@@ -712,7 +715,7 @@ static int blktap_mmap(struct file *filp
 		return -ENOMEM;
 	}
 
-	if (info->rings_vstart) {
+	if (info->rings_total) {
 		WPRINTK("mmap already called on filp %p (minor %d)\n",
 			filp, info->minor);
 		return -EPERM;
@@ -730,6 +733,7 @@ static int blktap_mmap(struct file *filp
 
 	size >>= PAGE_SHIFT;
 	info->rings_vstart = vma->vm_start;
+	info->rings_total  = vma->vm_end - vma->vm_start;
 	info->user_vstart  = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
     
 	/* Map the ring pages to the start of the region and reserve it. */
@@ -766,15 +770,15 @@ static int blktap_mmap(struct file *filp
 	vma->vm_mm->context.has_foreign_mappings = 1;
 #endif
 
-	info->mm = get_task_mm(current);
 	smp_wmb();
-	info->ring_ok = 1;
+	info->mm = vma->vm_mm;
 	return 0;
  fail:
 	/* Clear any active mappings. */
 	zap_page_range(vma, vma->vm_start, 
 		       vma->vm_end - vma->vm_start, NULL);
 	info->rings_vstart = 0;
+	info->rings_total = 0;
 
 	return -ENOMEM;
 }
@@ -1071,6 +1075,7 @@ static void fast_flush_area(pending_req_
 	unsigned long uvaddr;
 	struct mm_struct *mm = info->mm;
 
+	smp_rmb();
 	if (mm != NULL)
 		down_read(&mm->mmap_sem);
 
@@ -1202,13 +1207,6 @@ int tap_blkif_schedule(void *arg)
 	info = tapfds[blkif->dev_num];
 	blkif_put(blkif);
 
-	if (info) {
-		struct mm_struct *mm = xchg(&info->mm, NULL);
-
-		if (mm)
-			mmput(mm);
-	}
-
 	return 0;
 }
 
@@ -1497,11 +1495,12 @@ static void dispatch_rw_block_io(blkif_t
 	}
 	
 	/* Make sure userspace is ready. */
-	if (!info->ring_ok) {
+	mm = info->mm;
+	smp_rmb();
+	if (!mm) {
 		WPRINTK("ring not ready for requests!\n");
 		goto fail_response;
 	}
-	smp_rmb();
 
 	if (RING_FULL(&info->ufe_ring)) {
 		WPRINTK("fe_ring is full, "
@@ -1524,7 +1523,6 @@ static void dispatch_rw_block_io(blkif_t
 	}
 
 	op = 0;
-	mm = info->mm;
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
 		down_read(&mm->mmap_sem);
 	for (i = 0; i < nseg; i++) {
