From: Waiman Long <Waiman.Long@hp.com>
To: Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@redhat.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	"H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org, linux-kernel@vger.kernel.org,
	Scott J Norton <scott.norton@hp.com>,
	Douglas Hatch <doug.hatch@hp.com>,
	Waiman Long <Waiman.Long@hp.com>
Subject: [PATCH 1/7] locking/pvqspinlock: Only kick CPU at unlock time
Date: Sat, 11 Jul 2015 16:36:52 -0400
Message-ID: <1436647018-49734-2-git-send-email-Waiman.Long@hp.com>
In-Reply-To: <1436647018-49734-1-git-send-email-Waiman.Long@hp.com>

For an over-committed guest with more vCPUs than physical CPUs
available, it is possible that a vCPU may be kicked twice before
getting the lock - once when it becomes queue head and once again
before it gets the lock. All this CPU kicking and halting (VMEXIT)
can be expensive and slow down system performance.

This patch adds a new vCPU state (vcpu_hashed) which enables the code
to delay CPU kicking until unlock time. Once this state is set,
the new lock holder will set _Q_SLOW_VAL and fill in the hash table
on behalf of the halted queue head vCPU. The original vcpu_halted
state will be used by pv_wait_node() only, to differentiate other
queue nodes from the queue head.
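
In rough outline, the intended hand-off is the sketch below. This is
illustrative only; it condenses the pv_scan_next() and
__pv_queued_spin_unlock() changes in this patch, using the helpers and
state names already defined in qspinlock_paravirt.h:

	/*
	 * New lock holder, after handing the MCS lock to the next node
	 * (pv_scan_next): if that node halted in pv_wait_node(), hash
	 * the lock on its behalf instead of kicking it now.
	 */
	if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) == vcpu_halted) {
		WRITE_ONCE(l->locked, _Q_SLOW_VAL);
		(void)pv_hash(lock, pn);
	}

	/*
	 * Unlocker (__pv_queued_spin_unlock): the one and only kick,
	 * issued only for a vCPU that was actually hashed.
	 */
	smp_store_mb(l->locked, 0);
	if (READ_ONCE(node->state) == vcpu_hashed)
		pv_kick(node->cpu);

The intermediate kick that the old pv_kick_node() issued when a halted
vCPU became queue head is thus eliminated; the only remaining kick
happens at unlock time.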

Signed-off-by: Waiman Long <Waiman.Long@hp.com>
---
 kernel/locking/qspinlock.c          |   10 ++--
 kernel/locking/qspinlock_paravirt.h |   83 ++++++++++++++++++++++++++---------
 2 files changed, 67 insertions(+), 26 deletions(-)

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 38c4920..d2e0fc1 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -239,8 +239,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
 
 static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
 static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { }
-
+static __always_inline void __pv_scan_next(struct qspinlock *lock,
+					   struct mcs_spinlock *node) { }
 static __always_inline void __pv_wait_head(struct qspinlock *lock,
 					   struct mcs_spinlock *node) { }
 
@@ -248,7 +248,7 @@ static __always_inline void __pv_wait_head(struct qspinlock *lock,
 
 #define pv_init_node		__pv_init_node
 #define pv_wait_node		__pv_wait_node
-#define pv_kick_node		__pv_kick_node
+#define pv_scan_next		__pv_scan_next
 #define pv_wait_head		__pv_wait_head
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
@@ -440,7 +440,7 @@ queue:
 		cpu_relax();
 
 	arch_mcs_spin_unlock_contended(&next->locked);
-	pv_kick_node(next);
+	pv_scan_next(lock, next);
 
 release:
 	/*
@@ -461,7 +461,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
 
 #undef pv_init_node
 #undef pv_wait_node
-#undef pv_kick_node
+#undef pv_scan_next
 #undef pv_wait_head
 
 #undef  queued_spin_lock_slowpath
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 04ab181..d302c39 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -21,9 +21,14 @@
 
 #define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
 
+/*
+ * Queue node uses: vcpu_running & vcpu_halted.
+ * Queue head uses: vcpu_running & vcpu_hashed.
+ */
 enum vcpu_state {
 	vcpu_running = 0,
-	vcpu_halted,
+	vcpu_halted,		/* Used only in pv_wait_node */
+	vcpu_hashed,		/* = pv_hash'ed + vcpu_halted */
 };
 
 struct pv_node {
@@ -152,7 +157,8 @@ static void pv_init_node(struct mcs_spinlock *node)
 
 /*
  * Wait for node->locked to become true, halt the vcpu after a short spin.
- * pv_kick_node() is used to wake the vcpu again.
+ * pv_scan_next() is used to set _Q_SLOW_VAL and fill in hash table on its
+ * behalf.
  */
 static void pv_wait_node(struct mcs_spinlock *node)
 {
@@ -171,9 +177,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
 		 *
 		 * [S] pn->state = vcpu_halted	  [S] next->locked = 1
 		 *     MB			      MB
-		 * [L] pn->locked		[RmW] pn->state = vcpu_running
+		 * [L] pn->locked		[RmW] pn->state = vcpu_hashed
 		 *
-		 * Matches the xchg() from pv_kick_node().
+		 * Matches the cmpxchg() from pv_scan_next().
 		 */
 		smp_store_mb(pn->state, vcpu_halted);
 
@@ -181,9 +187,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
 			pv_wait(&pn->state, vcpu_halted);
 
 		/*
-		 * Reset the vCPU state to avoid unncessary CPU kicking
+		 * Reset the state except when vcpu_hashed is set.
 		 */
-		WRITE_ONCE(pn->state, vcpu_running);
+		cmpxchg(&pn->state, vcpu_halted, vcpu_running);
 
 		/*
 		 * If the locked flag is still not set after wakeup, it is a
@@ -193,6 +199,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
 		 * MCS lock will be released soon.
 		 */
 	}
+
 	/*
 	 * By now our node->locked should be 1 and our caller will not actually
 	 * spin-wait for it. We do however rely on our caller to do a
@@ -201,24 +208,32 @@ static void pv_wait_node(struct mcs_spinlock *node)
 }
 
 /*
- * Called after setting next->locked = 1, used to wake those stuck in
- * pv_wait_node().
+ * Called after setting next->locked = 1 with the lock already acquired.
+ * Check if the vCPU has been halted. If so, set the _Q_SLOW_VAL flag
+ * and put an entry into the lock hash table to be woken up at unlock time.
  */
-static void pv_kick_node(struct mcs_spinlock *node)
+static void pv_scan_next(struct qspinlock *lock, struct mcs_spinlock *node)
 {
 	struct pv_node *pn = (struct pv_node *)node;
+	struct __qspinlock *l = (void *)lock;
 
 	/*
-	 * Note that because node->locked is already set, this actual
-	 * mcs_spinlock entry could be re-used already.
-	 *
-	 * This should be fine however, kicking people for no reason is
-	 * harmless.
+	 * Transition vCPU state: halted => hashed
+	 * Quit if the transition failed.
 	 *
-	 * See the comment in pv_wait_node().
+	 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
 	 */
-	if (xchg(&pn->state, vcpu_running) == vcpu_halted)
-		pv_kick(pn->cpu);
+	if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+		return;
+
+	/*
+	 * Put the lock into the hash table & set the _Q_SLOW_VAL in the lock.
+	 * As this is the same vCPU that will check the _Q_SLOW_VAL value and
+	 * the hash table later on at unlock time, no atomic instruction is
+	 * needed.
+	 */
+	WRITE_ONCE(l->locked, _Q_SLOW_VAL);
+	(void)pv_hash(lock, pn);
 }
 
 /*
@@ -229,19 +244,42 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 {
 	struct pv_node *pn = (struct pv_node *)node;
 	struct __qspinlock *l = (void *)lock;
-	struct qspinlock **lp = NULL;
+	struct qspinlock **lp;
 	int loop;
 
+	/*
+	 * Initialize lp to a non-NULL value if the node has already been in the
+	 * vcpu_hashed state, so that pv_hash() won't be called again.
+	 */
+	lp = (READ_ONCE(pn->state) == vcpu_hashed) ? (struct qspinlock **)1
+						   : NULL;
 	for (;;) {
+		WRITE_ONCE(pn->state, vcpu_running);
 		for (loop = SPIN_THRESHOLD; loop; loop--) {
 			if (!READ_ONCE(l->locked))
 				return;
 			cpu_relax();
 		}
 
-		WRITE_ONCE(pn->state, vcpu_halted);
+		/*
+		 * Recheck lock value after setting vcpu_hashed state
+		 *
+		 * [S] state = vcpu_hashed	[S] l->locked = 0
+		 *     MB			    MB
+		 * [L] l->locked		[L] state == vcpu_hashed
+		 *
+		 * Matches smp_store_mb() in __pv_queued_spin_unlock()
+		 */
+		smp_store_mb(pn->state, vcpu_hashed);
+
+		if (!READ_ONCE(l->locked)) {
+			WRITE_ONCE(pn->state, vcpu_running);
+			return;
+		}
+
 		if (!lp) { /* ONCE */
 			lp = pv_hash(lock, pn);
+
 			/*
 			 * lp must be set before setting _Q_SLOW_VAL
 			 *
@@ -305,13 +343,16 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 	 * Now that we have a reference to the (likely) blocked pv_node,
 	 * release the lock.
 	 */
-	smp_store_release(&l->locked, 0);
+	smp_store_mb(l->locked, 0);
 
 	/*
 	 * At this point the memory pointed at by lock can be freed/reused,
 	 * however we can still use the pv_node to kick the CPU.
+	 * The other vCPU may not really be halted, but kicking an active
+	 * vCPU is harmless other than the additional latency in completing
+	 * the unlock.
 	 */
-	if (READ_ONCE(node->state) == vcpu_halted)
+	if (READ_ONCE(node->state) == vcpu_hashed)
 		pv_kick(node->cpu);
 }
 /*
-- 
1.7.1


Thread overview: 27+ messages
2015-07-11 20:36 [PATCH 0/7] locking/qspinlock: Enhance pvqspinlock & introduce queued unfair lock Waiman Long
2015-07-11 20:36 ` Waiman Long [this message]
2015-07-13 12:02   ` [PATCH 1/7] locking/pvqspinlock: Only kick CPU at unlock time Peter Zijlstra
2015-07-13 12:31     ` Peter Zijlstra
2015-07-15  1:24     ` Waiman Long
2015-07-13 13:48   ` Peter Zijlstra
2015-07-14  9:31     ` Peter Zijlstra
2015-07-15  1:31     ` Waiman Long
2015-08-03 17:00   ` [tip:locking/core] " tip-bot for Waiman Long
2015-07-11 20:36 ` [PATCH 2/7] locking/pvqspinlock: Allow vCPUs kick-ahead Waiman Long
2015-07-13 13:52   ` Peter Zijlstra
2015-07-15  1:38     ` Waiman Long
2015-07-11 20:36 ` [PATCH 3/7] locking/pvqspinlock: Implement wait-early for overcommitted guest Waiman Long
2015-07-12  8:23   ` Peter Zijlstra
2015-07-13 19:50   ` Davidlohr Bueso
2015-07-15  1:39     ` Waiman Long
2015-07-11 20:36 ` [PATCH 4/7] locking/pvqspinlock: Collect slowpath lock statistics Waiman Long
2015-07-12  8:22   ` Peter Zijlstra
2015-07-14 18:48     ` Waiman Long
2015-07-11 20:36 ` [PATCH 5/7] locking/pvqspinlock: Add pending bit support Waiman Long
2015-07-12  8:21   ` Peter Zijlstra
2015-07-14 18:47     ` Waiman Long
2015-07-11 20:36 ` [PATCH 6/7] locking/qspinlock: A fairer queued unfair lock Waiman Long
2015-07-12  8:21   ` Peter Zijlstra
2015-07-14 18:47     ` Waiman Long
2015-07-14 20:45       ` Peter Zijlstra
2015-07-11 20:36 ` [PATCH 7/7] locking/qspinlock: Collect queued unfair lock slowpath statistics Waiman Long
