[PATCH v6 6/6] locking/pvqspinlock: Queue node adaptive spinning

All the mail mirrored from lore.kernel.org
 help / color / mirror / Atom feed

From: Waiman Long <Waiman.Long@hpe.com>
To: Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@redhat.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	"H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org, linux-kernel@vger.kernel.org,
	Scott J Norton <scott.norton@hp.com>,
	Douglas Hatch <doug.hatch@hp.com>,
	Davidlohr Bueso <dave@stgolabs.net>,
	Waiman Long <Waiman.Long@hpe.com>
Subject: [PATCH v6 6/6] locking/pvqspinlock: Queue node adaptive spinning
Date: Fri, 11 Sep 2015 14:37:38 -0400	[thread overview]
Message-ID: <1441996658-62854-7-git-send-email-Waiman.Long@hpe.com> (raw)
In-Reply-To: <1441996658-62854-1-git-send-email-Waiman.Long@hpe.com>

In an overcommitted guest where some vCPUs have to be halted to make
forward progress in other areas, it is highly likely that a vCPU later
in the spinlock queue will be spinning while the ones earlier in the
queue would have been halted. The spinning in the later vCPUs is then
just a waste of precious CPU cycles because they are not going to
get the lock soon as the earlier ones have to be woken up and take
their turn to get the lock.

This patch implements an adaptive spinning mechanism where the vCPU
will call pv_wait() if the following conditions are true:

 1) the vCPU has not been halted before;
 2) the previous vCPU is not running.

Linux kernel builds were run in KVM guest on an 8-socket, 4
cores/socket Westmere-EX system and a 4-socket, 8 cores/socket
Haswell-EX system. Both systems are configured to have 32 physical
CPUs. The kernel build times before and after the patch were:

		    Westmere			Haswell
  Patch		32 vCPUs    48 vCPUs	32 vCPUs    48 vCPUs
  -----		--------    --------    --------    --------
  Before patch   3m02.3s     5m00.2s     1m43.7s     3m03.5s
  After patch    3m03.0s     4m37.5s	 1m43.0s     2m47.2s

For 32 vCPUs, this patch doesn't cause any noticeable change in
performance. For 48 vCPUs (over-committed), there is about 8%
performance improvement.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
---
 kernel/locking/qspinlock.c          |    5 ++-
 kernel/locking/qspinlock_paravirt.h |   52 +++++++++++++++++++++++++++++++++-
 2 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 1be1aab..319e823 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -247,7 +247,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
  */
 
 static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+					   struct mcs_spinlock *prev) { }
 static __always_inline bool __pv_wait_head_and_lock(struct qspinlock *lock,
 						    struct mcs_spinlock *node,
 						    u32 tail)
@@ -398,7 +399,7 @@ queue:
 		prev = decode_tail(old);
 		WRITE_ONCE(prev->next, node);
 
-		pv_wait_node(node);
+		pv_wait_node(node, prev);
 		arch_mcs_spin_lock_contended(&node->locked);
 	}
 
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 9fd49a2..57ed38b 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -23,6 +23,22 @@
 #define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
 
 /*
+ * Queue Node Adaptive Spinning
+ *
+ * A queue node vCPU will stop spinning if the following conditions are true:
+ * 1) vCPU in the previous node is not running
+ * 2) current vCPU has not been halted before
+ *
+ * The one lock stealing attempt allowed at slowpath entry mitigates the
+ * slight slowdown for non-overcommitted guest with this aggressive wait-early
+ * mechanism.
+ *
+ * The status of the previous node will be checked at fixed interval
+ * controlled by PV_PREV_CHECK_MASK.
+ */
+#define PV_PREV_CHECK_MASK	0xff
+
+/*
  * Queue node uses: vcpu_running & vcpu_halted.
  * Queue head uses: vcpu_running & vcpu_hashed.
  */
@@ -71,6 +87,7 @@ enum pv_qlock_stat {
 	pvstat_wait_head,
 	pvstat_wait_node,
 	pvstat_wait_again,
+	pvstat_wait_early,
 	pvstat_kick_wait,
 	pvstat_kick_unlock,
 	pvstat_spurious,
@@ -90,6 +107,7 @@ static const char * const stat_fsnames[pvstat_num] = {
 	[pvstat_wait_head]   = "wait_head_count",
 	[pvstat_wait_node]   = "wait_node_count",
 	[pvstat_wait_again]  = "wait_again_count",
+	[pvstat_wait_early]  = "wait_early_count",
 	[pvstat_kick_wait]   = "kick_wait_count",
 	[pvstat_kick_unlock] = "kick_unlock_count",
 	[pvstat_spurious]    = "spurious_wakeup",
@@ -328,6 +346,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
 }
 
 /*
+ * Return true if when it is time to check the previous node which is not
+ * in a running state.
+ */
+static inline bool
+pv_wait_early(struct pv_node *node, struct pv_node *prev, int loop)
+{
+
+	if ((loop & PV_PREV_CHECK_MASK) != 0)
+		return false;
+
+	return READ_ONCE(prev->state) != vcpu_running;
+}
+
+/*
  * Initialize the PV part of the mcs_spinlock node.
  */
 static void pv_init_node(struct mcs_spinlock *node)
@@ -345,16 +377,25 @@ static void pv_init_node(struct mcs_spinlock *node)
  * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
  * behalf.
  */
-static void pv_wait_node(struct mcs_spinlock *node)
+static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 {
 	struct pv_node *pn = (struct pv_node *)node;
+	struct pv_node *pp = (struct pv_node *)prev;
 	int waitcnt = 0;
 	int loop;
+	bool wait_early;
 
 	for (;; waitcnt++) {
-		for (loop = SPIN_THRESHOLD; loop; loop--) {
+		for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
 			if (READ_ONCE(node->locked))
 				return;
+			/*
+			 * Wait early only if it has not been halted before.
+			 */
+			if (!waitcnt && pv_wait_early(pn, pp, loop)) {
+				wait_early = true;
+				break;
+			}
 			cpu_relax();
 		}
 
@@ -457,6 +498,12 @@ static int pv_wait_head_and_lock(struct qspinlock *lock,
 		lp = (struct qspinlock **)1;
 
 	for (;; waitcnt++) {
+		/*
+		 * Set correct vCPU state to be used by queue node wait-early
+		 * mechanism.
+		 */
+		WRITE_ONCE(pn->state, vcpu_running);
+
 		for (loop = SPIN_THRESHOLD; loop; loop--) {
 			/*
 			 * Try to acquire the lock when it is free.
@@ -492,6 +539,7 @@ static int pv_wait_head_and_lock(struct qspinlock *lock,
 				goto gotlock;
 			}
 		}
+		WRITE_ONCE(pn->state, vcpu_halted);
 		pvstat_inc(pvstat_wait_head);
 		if (waitcnt)
 			pvstat_inc(pvstat_wait_again);
-- 
1.7.1

next prev parent reply	other threads:[~2015-09-11 18:38 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-09-11 18:37 [PATCH v6 0/6] locking/qspinlock: Enhance pvqspinlock performance Waiman Long
2015-09-11 18:37 ` [PATCH v6 1/6] locking/qspinlock: relaxes cmpxchg & xchg ops in native code Waiman Long
2015-09-11 22:27   ` Davidlohr Bueso
2015-09-14 12:06     ` Peter Zijlstra
2015-09-14 18:40       ` Waiman Long
2015-09-14 15:16     ` Waiman Long
2015-09-11 18:37 ` [PATCH v6 2/6] locking/pvqspinlock: Unconditional PV kick with _Q_SLOW_VAL Waiman Long
2015-09-18  8:50   ` [tip:locking/core] locking/pvqspinlock: Kick the PV CPU unconditionally when _Q_SLOW_VAL tip-bot for Waiman Long
2015-09-11 18:37 ` [PATCH v6 3/6] locking/pvqspinlock, x86: Optimize PV unlock code path Waiman Long
2015-09-11 18:37 ` [PATCH v6 4/6] locking/pvqspinlock: Collect slowpath lock statistics Waiman Long
2015-09-11 23:13   ` Davidlohr Bueso
2015-09-14 15:25     ` Waiman Long
2015-09-14 21:41       ` Davidlohr Bueso
2015-09-15  3:47         ` Waiman Long
2015-09-11 18:37 ` [PATCH v6 5/6] locking/pvqspinlock: Allow 1 lock stealing attempt Waiman Long
2015-09-14 13:57   ` Peter Zijlstra
2015-09-14 19:02     ` Waiman Long
2015-09-14 14:00   ` Peter Zijlstra
2015-09-14 19:15     ` Waiman Long
2015-09-14 19:38       ` Waiman Long
2015-09-15  8:24       ` Peter Zijlstra
2015-09-15 15:29         ` Waiman Long
2015-09-16 15:01           ` Peter Zijlstra
2015-09-17 15:08             ` Waiman Long
2015-09-14 14:04   ` Peter Zijlstra
2015-09-14 19:19     ` Waiman Long
2015-09-11 18:37 ` Waiman Long [this message]
2015-09-14 14:10   ` [PATCH v6 6/6] locking/pvqspinlock: Queue node adaptive spinning Peter Zijlstra
2015-09-14 19:37     ` Waiman Long
2015-09-15  8:38       ` Peter Zijlstra
2015-09-15 15:32         ` Waiman Long
2015-09-16 15:03           ` Peter Zijlstra

find likely ancestor, descendant, or conflicting patches for this message:
dfblob:1be1aab dfblob:9fd49a2 dfblob:319e823 dfblob:57ed38b
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1441996658-62854-7-git-send-email-Waiman.Long@hpe.com \
    --to=waiman.long@hpe.com \
    --cc=dave@stgolabs.net \
    --cc=doug.hatch@hp.com \
    --cc=hpa@zytor.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=scott.norton@hp.com \
    --cc=tglx@linutronix.de \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.