From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751779AbbGKUhV (ORCPT ); Sat, 11 Jul 2015 16:37:21 -0400 Received: from g9t5008.houston.hp.com ([15.240.92.66]:38004 "EHLO g9t5008.houston.hp.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751536AbbGKUhS (ORCPT ); Sat, 11 Jul 2015 16:37:18 -0400 From: Waiman Long To: Peter Zijlstra , Ingo Molnar , Thomas Gleixner , "H. Peter Anvin" Cc: x86@kernel.org, linux-kernel@vger.kernel.org, Scott J Norton , Douglas Hatch , Waiman Long Subject: [PATCH 1/7] locking/pvqspinlock: Only kick CPU at unlock time Date: Sat, 11 Jul 2015 16:36:52 -0400 Message-Id: <1436647018-49734-2-git-send-email-Waiman.Long@hp.com> X-Mailer: git-send-email 1.7.1 In-Reply-To: <1436647018-49734-1-git-send-email-Waiman.Long@hp.com> References: <1436647018-49734-1-git-send-email-Waiman.Long@hp.com> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org For an over-committed guest with more vCPUs than physical CPUs available, it is possible that a vCPU may be kicked twice before getting the lock - once before it becomes queue head and once before it gets the lock. All this CPU kicking and halting (VMEXIT) can be expensive and slow down system performance. This patch adds a new vCPU state (vcpu_hashed) which enables the code to delay CPU kicking until unlock time. Once this state is set, the new lock holder will set _Q_SLOW_VAL and fill in the hash table on behalf of the halted queue head vCPU. The original vcpu_halted state will be used by pv_wait_node() only to differentiate other queue nodes from the queue head. 
Signed-off-by: Waiman Long --- kernel/locking/qspinlock.c | 10 ++-- kernel/locking/qspinlock_paravirt.h | 83 ++++++++++++++++++++++++++--------- 2 files changed, 67 insertions(+), 26 deletions(-) diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 38c4920..d2e0fc1 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -239,8 +239,8 @@ static __always_inline void set_locked(struct qspinlock *lock) static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } -static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { } - +static __always_inline void __pv_scan_next(struct qspinlock *lock, + struct mcs_spinlock *node) { } static __always_inline void __pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) { } @@ -248,7 +248,7 @@ static __always_inline void __pv_wait_head(struct qspinlock *lock, #define pv_init_node __pv_init_node #define pv_wait_node __pv_wait_node -#define pv_kick_node __pv_kick_node +#define pv_scan_next __pv_scan_next #define pv_wait_head __pv_wait_head #ifdef CONFIG_PARAVIRT_SPINLOCKS @@ -440,7 +440,7 @@ queue: cpu_relax(); arch_mcs_spin_unlock_contended(&next->locked); - pv_kick_node(next); + pv_scan_next(lock, next); release: /* @@ -461,7 +461,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath); #undef pv_init_node #undef pv_wait_node -#undef pv_kick_node +#undef pv_scan_next #undef pv_wait_head #undef queued_spin_lock_slowpath diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 04ab181..d302c39 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -21,9 +21,14 @@ #define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) +/* + * Queue node uses: vcpu_running & vcpu_halted. + * Queue head uses: vcpu_running & vcpu_hashed. 
+ */ enum vcpu_state { vcpu_running = 0, - vcpu_halted, + vcpu_halted, /* Used only in pv_wait_node */ + vcpu_hashed, /* = pv_hash'ed + vcpu_halted */ }; struct pv_node { @@ -152,7 +157,8 @@ static void pv_init_node(struct mcs_spinlock *node) /* * Wait for node->locked to become true, halt the vcpu after a short spin. - * pv_kick_node() is used to wake the vcpu again. + * pv_scan_next() is used to set _Q_SLOW_VAL and fill in hash table on its + * behalf. */ static void pv_wait_node(struct mcs_spinlock *node) { @@ -171,9 +177,9 @@ static void pv_wait_node(struct mcs_spinlock *node) * * [S] pn->state = vcpu_halted [S] next->locked = 1 * MB MB - * [L] pn->locked [RmW] pn->state = vcpu_running + * [L] pn->locked [RmW] pn->state = vcpu_hashed * - * Matches the xchg() from pv_kick_node(). + * Matches the cmpxchg() from pv_scan_next(). */ smp_store_mb(pn->state, vcpu_halted); @@ -181,9 +187,9 @@ static void pv_wait_node(struct mcs_spinlock *node) pv_wait(&pn->state, vcpu_halted); /* - * Reset the vCPU state to avoid unncessary CPU kicking + * Reset the state except when vcpu_hashed is set. */ - WRITE_ONCE(pn->state, vcpu_running); + cmpxchg(&pn->state, vcpu_halted, vcpu_running); /* * If the locked flag is still not set after wakeup, it is a @@ -193,6 +199,7 @@ static void pv_wait_node(struct mcs_spinlock *node) * MCS lock will be released soon. */ } + /* * By now our node->locked should be 1 and our caller will not actually * spin-wait for it. We do however rely on our caller to do a @@ -201,24 +208,32 @@ static void pv_wait_node(struct mcs_spinlock *node) } /* - * Called after setting next->locked = 1, used to wake those stuck in - * pv_wait_node(). + * Called after setting next->locked = 1 & lock acquired. + * Check if the vCPU has been halted. If so, set the _Q_SLOW_VAL flag + * and put an entry into the lock hash table to be woken up at unlock time. 
*/ -static void pv_kick_node(struct mcs_spinlock *node) +static void pv_scan_next(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; + struct __qspinlock *l = (void *)lock; /* - * Note that because node->locked is already set, this actual - * mcs_spinlock entry could be re-used already. - * - * This should be fine however, kicking people for no reason is - * harmless. + * Transition vCPU state: halted => hashed + * Quit if the transition failed. * - * See the comment in pv_wait_node(). + * Matches with smp_store_mb() and cmpxchg() in pv_wait_node() */ - if (xchg(&pn->state, vcpu_running) == vcpu_halted) - pv_kick(pn->cpu); + if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted) + return; + + /* + * Put the lock into the hash table & set the _Q_SLOW_VAL in the lock. + * As this is the same vCPU that will check the _Q_SLOW_VAL value and + * the hash table later on at unlock time, no atomic instruction is + * needed. + */ + WRITE_ONCE(l->locked, _Q_SLOW_VAL); + (void)pv_hash(lock, pn); } /* @@ -229,19 +244,42 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; struct __qspinlock *l = (void *)lock; - struct qspinlock **lp = NULL; + struct qspinlock **lp; int loop; + /* + * Initialize lp to a non-NULL value if it has already been in the + * pv_hashed state so that pv_hash() won't be called again. + */ + lp = (READ_ONCE(pn->state) == vcpu_hashed) ? 
(struct qspinlock **)1 + : NULL; for (;;) { + WRITE_ONCE(pn->state, vcpu_running); for (loop = SPIN_THRESHOLD; loop; loop--) { if (!READ_ONCE(l->locked)) return; cpu_relax(); } - WRITE_ONCE(pn->state, vcpu_halted); + /* + * Recheck lock value after setting vcpu_hashed state + * + * [S] state = vcpu_hashed [S] l->locked = 0 + * MB MB + * [L] l->locked [L] state == vcpu_hashed + * + * Matches smp_store_mb() in __pv_queued_spin_unlock() + */ + smp_store_mb(pn->state, vcpu_hashed); + + if (!READ_ONCE(l->locked)) { + WRITE_ONCE(pn->state, vcpu_running); + return; + } + if (!lp) { /* ONCE */ lp = pv_hash(lock, pn); + /* * lp must be set before setting _Q_SLOW_VAL * @@ -305,13 +343,16 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) * Now that we have a reference to the (likely) blocked pv_node, * release the lock. */ - smp_store_release(&l->locked, 0); + smp_store_mb(l->locked, 0); /* * At this point the memory pointed at by lock can be freed/reused, * however we can still use the pv_node to kick the CPU. + * The other vCPU may not really be halted, but kicking an active + * vCPU is harmless other than the additional latency in completing + * the unlock. */ - if (READ_ONCE(node->state) == vcpu_halted) + if (READ_ONCE(node->state) == vcpu_hashed) pv_kick(node->cpu); } /* -- 1.7.1