From: Waiman Long <Waiman.Long@hp.com>
To: Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@redhat.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	"H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org, linux-kernel@vger.kernel.org,
	Scott J Norton <scott.norton@hp.com>,
	Douglas Hatch <doug.hatch@hp.com>,
	Davidlohr Bueso <dave@stgolabs.net>,
	Waiman Long <Waiman.Long@hp.com>
Subject: [PATCH v2 4/6] locking/pvqspinlock: Allow vCPUs kick-ahead
Date: Tue, 14 Jul 2015 22:13:35 -0400
Message-ID: <1436926417-20256-5-git-send-email-Waiman.Long@hp.com>
In-Reply-To: <1436926417-20256-1-git-send-email-Waiman.Long@hp.com>

Frequent CPU halting (vmexit) and CPU kicking (vmenter) lengthen the
critical section and block forward progress.  This patch implements
a kick-ahead mechanism where the unlocker kicks not only the queue
head vCPU but also up to four additional vCPUs next to the queue head
if they were halted.  The kicks are done after exiting the critical
section to improve parallelism.
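
For illustration only, here is a minimal sketch of the resulting unlock
flow (simplified from the patch below; release_lock() is a stand-in for
the store that clears the lock byte, which is not shown in this hunk):

	int i, nr_kick = 0;
	struct pv_node *nxt = node;	/* node = queue head from pv_unhash() */

	/* Collect up to pv_kick_ahead halted successors before unlocking. */
	for ( ; nr_kick < pv_kick_ahead; nr_kick++) {
		nxt = next[nr_kick] = pv_get_kick_node(nxt);
		if (!nxt)
			break;		/* no successor, or not halted */
	}

	release_lock(lock);		/* exit the critical section first */

	pv_kick(node->cpu);		/* wake the queue head ... */
	for (i = 0; i < nr_kick; i++)	/* ... then the kick-ahead group */
		pv_kick(next[i]->cpu);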

The amount of kick-ahead allowed depends on the number of vCPUs
in the VM guest.  This patch, by itself, won't do much, as most of
the kicks are currently done at lock time.  Coupled with the next
patch, which defers lock-time kicking to unlock time, it should improve
overall system performance in a busy overcommitted guest.
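
As a point of reference, the kick-ahead level is derived from the vCPU
count at init time.  The following standalone restatement of the
selection loop in __pv_init_lock_hash() (illustrative only, not part of
the patch) shows the mapping: a 48-vCPU guest gets the maximum level of
4, a 12-vCPU guest gets level 2, and guests with fewer than 4 vCPUs get
kick-ahead disabled:

	/* Thresholds from the patch: level i needs >= threshold[i-1] vCPUs. */
	static const u8 kick_ahead_threshold[PV_KICK_AHEAD_MAX] = { 4, 8, 16, 32 };

	static int kick_ahead_level(int ncpus)
	{
		int i;

		for (i = PV_KICK_AHEAD_MAX; i > 0; i--)
			if (ncpus >= kick_ahead_threshold[i - 1])
				return i;
		return 0;	/* too few vCPUs: kick-ahead disabled */
	}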

Linux kernel builds were run in a KVM guest on an 8-socket, 4
cores/socket Westmere-EX system and a 4-socket, 8 cores/socket
Haswell-EX system.  Both systems were configured to have 32 physical
CPUs.  The kernel build times before and after the patch were:

		    Westmere			Haswell
  Patch		32 vCPUs    48 vCPUs	32 vCPUs    48 vCPUs
  -----		--------    --------    --------    --------
  Before patch	 3m25.0s    10m34.1s	 2m02.0s    15m35.9s
  After patch    3m27.4s    10m32.0s	 2m00.8s    14m52.5s

There was not much difference before and after the patch, as expected
since most kicks still happen at lock time without the next patch.

Signed-off-by: Waiman Long <Waiman.Long@hp.com>
---
 kernel/locking/qspinlock_paravirt.h |   77 +++++++++++++++++++++++++++++++++-
 1 files changed, 74 insertions(+), 3 deletions(-)

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index c8485c4..f3ceeff 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -51,6 +51,7 @@ enum pv_qlock_stat {
 	pvstat_kick_time,
 	pvstat_lock_kick,
 	pvstat_unlock_kick,
+	pvstat_kick_ahead,
 	pvstat_pend_lock,
 	pvstat_pend_fail,
 	pvstat_spurious,
@@ -72,6 +73,7 @@ static const char * const stat_fsnames[pvstat_num] = {
 	[pvstat_kick_time]   = "kick_time_count",
 	[pvstat_lock_kick]   = "lock_kick_count",
 	[pvstat_unlock_kick] = "unlock_kick_count",
+	[pvstat_kick_ahead]  = "kick_ahead_count",
 	[pvstat_pend_lock]   = "pending_lock_count",
 	[pvstat_pend_fail]   = "pending_fail_count",
 	[pvstat_spurious]    = "spurious_wakeup",
@@ -85,7 +87,8 @@ static atomic_t pvstats[pvstat_num];
  * pv_kick_latencies = sum of all pv_kick latencies in ns
  * pv_wake_latencies = sum of all wakeup latencies in ns
  *
- * Avg kick latency = pv_kick_latencies/(lock_kick_count + unlock_kick_count)
+ * Avg kick latency = pv_kick_latencies/
+ *		     (lock_kick_count + unlock_kick_count + kick_ahead_count)
  * Avg wake latency = pv_wake_latencies/kick_time_count
  */
 static atomic64_t pv_kick_latencies, pv_wake_latencies;
@@ -217,6 +220,12 @@ static struct pv_hash_entry *pv_lock_hash;
 static unsigned int pv_lock_hash_bits __read_mostly;
 
 /*
+ * Allow kick-ahead of vCPUs at unlock time
+ */
+#define PV_KICK_AHEAD_MAX	4
+static int pv_kick_ahead __read_mostly;
+
+/*
  * Allocate memory for the PV qspinlock hash buckets
  *
  * This function should be called from the paravirt spinlock initialization
@@ -224,7 +233,16 @@ static unsigned int pv_lock_hash_bits __read_mostly;
  */
 void __init __pv_init_lock_hash(void)
 {
-	int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);
+	int ncpus = num_possible_cpus();
+	int pv_hash_size = ALIGN(4 * ncpus, PV_HE_PER_LINE);
+	int i;
+
+	/*
+	 * The minimum number of vCPUs required in each kick-ahead level
+	 */
+	static const u8 kick_ahead_threshold[PV_KICK_AHEAD_MAX] = {
+		4, 8, 16, 32
+	};
 
 	if (pv_hash_size < PV_HE_MIN)
 		pv_hash_size = PV_HE_MIN;
@@ -238,6 +256,18 @@ void __init __pv_init_lock_hash(void)
 					       pv_hash_size, 0, HASH_EARLY,
 					       &pv_lock_hash_bits, NULL,
 					       pv_hash_size, pv_hash_size);
+	/*
+	 * Enable the unlock kick ahead mode according to the number of
+	 * vCPUs available.
+	 */
+	for (i = PV_KICK_AHEAD_MAX; i > 0; i--)
+		if (ncpus >= kick_ahead_threshold[i - 1]) {
+			pv_kick_ahead = i;
+			break;
+		}
+	if (pv_kick_ahead)
+		pr_info("PV unlock kick ahead level %d enabled\n",
+			pv_kick_ahead);
 }
 
 #define for_each_hash_entry(he, offset, hash)						\
@@ -424,6 +454,25 @@ static void pv_wait_node(struct mcs_spinlock *node)
 }
 
 /*
+ * Helper to get the address of the next kickable node
+ * The node has to be in the halted state and is being transitioned to
+ * running state by this function. Otherwise, NULL will be returned.
+ */
+static inline struct pv_node *pv_get_kick_node(struct pv_node *node)
+{
+	struct pv_node *next = (struct pv_node *)READ_ONCE(node->mcs.next);
+
+	if (!next)
+		return NULL;
+
+	if ((READ_ONCE(next->state) != vcpu_halted) ||
+	    (xchg(&next->state, vcpu_running) != vcpu_halted))
+		next = NULL;	/* No kicking is needed */
+
+	return next;
+}
+
+/*
  * Called after setting next->locked = 1, used to wake those stuck in
  * pv_wait_node().
  */
@@ -510,7 +559,8 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 {
 	struct __qspinlock *l = (void *)lock;
-	struct pv_node *node;
+	struct pv_node *node, *nxt, *next[PV_KICK_AHEAD_MAX];
+	int i, nr_kick;
 
 	/*
 	 * We must not unlock if SLOW, because in that case we must first
@@ -527,6 +577,19 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 	node = pv_unhash(lock);
 
 	/*
+	 * Implement unlock kick-ahead
+	 *
+	 * Access the next group of nodes, if available, and prepare to kick
+	 * them after releasing the lock if they are in the halted state. This
+	 * should improve performance on an overcommitted system.
+	 */
+	for (nr_kick = 0, nxt = node; nr_kick < pv_kick_ahead; nr_kick++) {
+		nxt = next[nr_kick] = pv_get_kick_node(nxt);
+		if (!nxt)
+			break;
+	}
+
+	/*
 	 * Now that we have a reference to the (likely) blocked pv_node,
 	 * release the lock.
 	 */
@@ -538,6 +601,14 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 	 */
 	pvstat_inc(pvstat_unlock_kick);
 	pv_kick(node->cpu);
+
+	/*
+	 * Kick the next group of vCPUs, if available.
+	 */
+	for (i = 0; i < nr_kick; i++) {
+		pvstat_inc(pvstat_kick_ahead);
+		pv_kick(next[i]->cpu);
+	}
 }
 /*
  * Include the architecture specific callee-save thunk of the
-- 
1.7.1

