From 4d26c22d2d33879d9bd7e93eb89e0986f1c9cb91 Mon Sep 17 00:00:00 2001
From: Waiman Long <Waiman.Long@hp.com>
Date: Wed, 16 Oct 2013 10:49:43 -0400
Subject: [PATCH 7/9] mutex: Make more scalable by doing fewer atomic operations
References: bnc#849256
Git-commit: 0dc8c730c98a06a4d927f8d08bd0dd6de973b8dd
Git-commit: cc189d2513d1f45cde87a9043fe3be28559c7490
Patch-mainline: v3.10

In the __mutex_lock_common() function, an initial entry into
the lock slow path will cause two atomic_xchg instructions to be
issued. Together with the atomic decrement in the fast path, a
total of three atomic read-modify-write instructions will be
issued in rapid succession. This can cause a lot of cache
bouncing when many tasks are trying to acquire the mutex at the
same time.

This patch reduces the number of atomic_xchg instructions issued
by checking the counter value before performing the exchange. The
atomic_read() function is just a simple memory read, while the
atomic_xchg() function can be two or more orders of magnitude more
expensive than atomic_read(). By using atomic_read() to check the
value before calling atomic_xchg(), we can avoid a lot of
unnecessary cache coherency traffic. The only downside of this
change is that a task on the slow path has a slightly lower chance
of getting the mutex when competing with a task in the fast path.

The same applies to the atomic_cmpxchg() call in the
mutex-spin-on-owner loop, so an atomic_read() is also performed
there before calling atomic_cmpxchg().
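
As an illustration of the pattern only (not part of the patch
itself), a minimal userspace sketch is shown below. It uses C11
<stdatomic.h> rather than the kernel's atomic_t API, and the
function name is invented for this example:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* 1 = unlocked, 0 = locked, < 0 = locked with waiters */
	static atomic_int count = 1;

	/*
	 * A cheap plain read filters out the hopeless case before the
	 * expensive read-modify-write is attempted, avoiding needless
	 * cache-line traffic when the lock is clearly held.
	 */
	static bool trylock_guarded(void)
	{
		int old = 1;

		if (atomic_load_explicit(&count, memory_order_relaxed) != 1)
			return false;
		return atomic_compare_exchange_strong(&count, &old, 0);
	}

	int main(void)
	{
		printf("first try:  %d\n", trylock_guarded()); /* 1: acquired */
		printf("second try: %d\n", trylock_guarded()); /* 0: already held */
		return 0;
	}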

The mutex locking and unlocking code for the x86 architecture
allows any negative number in the mutex count to indicate that
some tasks are waiting for the mutex. That is probably true for
other architectures as well.

This patch is based on the following 3.10 kernel patches:

1) 0dc8c730c98a06a4d927f8d08bd0dd6de973b8dd
   mutex: Make more scalable by doing less atomic operations

2) cc189d2513d1f45cde87a9043fe3be28559c7490
   mutex: Back out architecture specific check for negative mutex count

Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Acked-by: Jeff Mahoney <jeffm@suse.com>
---
 kernel/mutex.c |   14 +++++++++++---
 1 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/kernel/mutex.c b/kernel/mutex.c
index d607ed5..0efcffb 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -36,6 +36,12 @@
 # include <asm/mutex.h>
 #endif
 
+/*
+ * A negative mutex count indicates that waiters are sleeping waiting for the
+ * mutex.
+ */
+#define	MUTEX_SHOW_NO_WAITER(mutex)	(atomic_read(&(mutex)->count) >= 0)
+
 void
 __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
 {
@@ -170,7 +176,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		if (owner && !mutex_spin_on_owner(lock, owner))
 			break;
 
-		if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
+		if ((atomic_read(&lock->count) == 1) &&
+		    (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
 			lock_acquired(&lock->dep_map, ip);
 			mutex_set_owner(lock);
 			preempt_enable();
@@ -204,7 +211,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	list_add_tail(&waiter.list, &lock->wait_list);
 	waiter.task = task;
 
-	if (atomic_xchg(&lock->count, -1) == 1)
+	if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))
 		goto done;
 
 	lock_contended(&lock->dep_map, ip);
@@ -219,7 +226,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		 * that when we release the lock, we properly wake up the
 		 * other waiters:
 		 */
-		if (atomic_xchg(&lock->count, -1) == 1)
+		if (MUTEX_SHOW_NO_WAITER(lock) &&
+		   (atomic_xchg(&lock->count, -1) == 1))
 			break;
 
 		/*
-- 
1.7.1

