All the mail mirrored from lore.kernel.org
 help / color / mirror / Atom feed
From: Josh Poimboeuf <jpoimboe@kernel.org>
To: x86@kernel.org
Cc: linux-kernel@vger.kernel.org,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Daniel Sneddon <daniel.sneddon@linux.intel.com>,
	Pawan Gupta <pawan.kumar.gupta@linux.intel.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Alexandre Chartre <alexandre.chartre@oracle.com>,
	Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	Sean Christopherson <seanjc@google.com>,
	Andrew Cooper <andrew.cooper3@citrix.com>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	Nikolay Borisov <nik.borisov@suse.com>,
	KP Singh <kpsingh@kernel.org>, Waiman Long <longman@redhat.com>,
	Borislav Petkov <bp@alien8.de>, Ingo Molnar <mingo@kernel.org>
Subject: [PATCH v4 1/5] x86/bugs: Only harden syscalls when needed
Date: Fri, 19 Apr 2024 14:09:47 -0700	[thread overview]
Message-ID: <982d05a2f669140f26500bee643011896d661094.1713559768.git.jpoimboe@kernel.org> (raw)
In-Reply-To: <cover.1713559768.git.jpoimboe@kernel.org>

Syscall hardening (converting the syscall indirect branch to a series of
direct branches) has shown some performance regressions:

- Red Hat internal testing showed up to 12% slowdowns in database
  benchmark testing on Sapphire Rapids when the DB was stressed with 80+
  users to cause contention.

- The kernel test robot's will-it-scale benchmarks showed significant
  regressions on Skylake with IBRS:
  https://lkml.kernel.org/lkml/202404191333.178a0eed-yujie.liu@intel.com

To fix those slowdowns, only use the syscall direct branches when
indirect branches are considered to be "not OK": meaning Spectre v2+BHI
isn't mitigated by HW and the user hasn't disabled mitigations.

Fixes: 1e3ad78334a6 ("x86/syscall: Don't force use of indirect calls for system calls")
Reviewed-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Acked-by: Borislav Petkov (AMD) <bp@alien8.de>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 arch/x86/entry/common.c            | 15 ++++++++++---
 arch/x86/entry/syscall_32.c        | 11 +---------
 arch/x86/entry/syscall_64.c        |  6 -----
 arch/x86/entry/syscall_x32.c       |  7 +++++-
 arch/x86/include/asm/cpufeatures.h |  1 +
 arch/x86/include/asm/syscall.h     |  8 ++++++-
 arch/x86/kernel/cpu/bugs.c         | 35 +++++++++++++++++++++++++++++-
 7 files changed, 61 insertions(+), 22 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 51cc9c7cb9bd..db1ef98da3a4 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -49,7 +49,10 @@ static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
 
 	if (likely(unr < NR_syscalls)) {
 		unr = array_index_nospec(unr, NR_syscalls);
-		regs->ax = x64_sys_call(regs, unr);
+		if (likely(cpu_feature_enabled(X86_FEATURE_INDIRECT_BRANCH_OK)))
+			regs->ax = sys_call_table[unr](regs);
+		else
+			regs->ax = x64_sys_call(regs, unr);
 		return true;
 	}
 	return false;
@@ -66,7 +69,10 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
 
 	if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
 		xnr = array_index_nospec(xnr, X32_NR_syscalls);
-		regs->ax = x32_sys_call(regs, xnr);
+		if (likely(cpu_feature_enabled(X86_FEATURE_INDIRECT_BRANCH_OK)))
+			regs->ax = x32_sys_call_table[xnr](regs);
+		else
+			regs->ax = x32_sys_call(regs, xnr);
 		return true;
 	}
 	return false;
@@ -162,7 +168,10 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
 
 	if (likely(unr < IA32_NR_syscalls)) {
 		unr = array_index_nospec(unr, IA32_NR_syscalls);
-		regs->ax = ia32_sys_call(regs, unr);
+		if (likely(cpu_feature_enabled(X86_FEATURE_INDIRECT_BRANCH_OK)))
+			regs->ax = ia32_sys_call_table[unr](regs);
+		else
+			regs->ax = ia32_sys_call(regs, unr);
 	} else if (nr != -1) {
 		regs->ax = __ia32_sys_ni_syscall(regs);
 	}
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index c2235bae17ef..aab31760b4e3 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -14,25 +14,16 @@
 #endif
 
 #define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *);
-
 #include <asm/syscalls_32.h>
 #undef __SYSCALL
 
-/*
- * The sys_call_table[] is no longer used for system calls, but
- * kernel/trace/trace_syscalls.c still wants to know the system
- * call address.
- */
-#ifdef CONFIG_X86_32
 #define __SYSCALL(nr, sym) __ia32_##sym,
-const sys_call_ptr_t sys_call_table[] = {
+const sys_call_ptr_t ia32_sys_call_table[] = {
 #include <asm/syscalls_32.h>
 };
 #undef __SYSCALL
-#endif
 
 #define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs);
-
 long ia32_sys_call(const struct pt_regs *regs, unsigned int nr)
 {
 	switch (nr) {
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 33b3f09e6f15..96ea1f8a1d3f 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -11,11 +11,6 @@
 #include <asm/syscalls_64.h>
 #undef __SYSCALL
 
-/*
- * The sys_call_table[] is no longer used for system calls, but
- * kernel/trace/trace_syscalls.c still wants to know the system
- * call address.
- */
 #define __SYSCALL(nr, sym) __x64_##sym,
 const sys_call_ptr_t sys_call_table[] = {
 #include <asm/syscalls_64.h>
@@ -23,7 +18,6 @@ const sys_call_ptr_t sys_call_table[] = {
 #undef __SYSCALL
 
 #define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
-
 long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
 {
 	switch (nr) {
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
index 03de4a932131..5aef4230faca 100644
--- a/arch/x86/entry/syscall_x32.c
+++ b/arch/x86/entry/syscall_x32.c
@@ -11,8 +11,13 @@
 #include <asm/syscalls_x32.h>
 #undef __SYSCALL
 
-#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
+#define __SYSCALL(nr, sym) __x64_##sym,
+const sys_call_ptr_t x32_sys_call_table[] = {
+#include <asm/syscalls_x32.h>
+};
+#undef __SYSCALL
 
+#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
 long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
 {
 	switch (nr) {
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 3c7434329661..d64b0a5291f1 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -470,6 +470,7 @@
 #define X86_FEATURE_BHI_CTRL		(21*32+ 2) /* "" BHI_DIS_S HW control available */
 #define X86_FEATURE_CLEAR_BHB_HW	(21*32+ 3) /* "" BHI_DIS_S HW control enabled */
 #define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */
+#define X86_FEATURE_INDIRECT_BRANCH_OK	(21*32+ 5) /* "" It's OK to use indirect branches */
 
 /*
  * BUG word(s)
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 2fc7bc3863ff..dfb59521244c 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -16,14 +16,20 @@
 #include <asm/thread_info.h>	/* for TS_COMPAT */
 #include <asm/unistd.h>
 
-/* This is used purely for kernel/trace/trace_syscalls.c */
 typedef long (*sys_call_ptr_t)(const struct pt_regs *);
 extern const sys_call_ptr_t sys_call_table[];
 
+#if defined(CONFIG_X86_32)
+#define ia32_sys_call_table sys_call_table
+#else
 /*
  * These may not exist, but still put the prototypes in so we
  * can use IS_ENABLED().
  */
+extern const sys_call_ptr_t ia32_sys_call_table[];
+extern const sys_call_ptr_t x32_sys_call_table[];
+#endif
+
 extern long ia32_sys_call(const struct pt_regs *, unsigned int nr);
 extern long x32_sys_call(const struct pt_regs *, unsigned int nr);
 extern long x64_sys_call(const struct pt_regs *, unsigned int nr);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index ab18185894df..5fca46c78daf 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1665,6 +1665,12 @@ static void __init bhi_select_mitigation(void)
 	if (!IS_ENABLED(CONFIG_X86_64))
 		return;
 
+	/*
+	 * There's no HW mitigation in place.  Mark indirect branches as
+	 * "not OK".
+	 */
+	setup_clear_cpu_cap(X86_FEATURE_INDIRECT_BRANCH_OK);
+
 	/* Mitigate KVM by default */
 	setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
 	pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n");
@@ -1679,6 +1685,28 @@ static void __init spectre_v2_select_mitigation(void)
 	enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
 	enum spectre_v2_mitigation mode = SPECTRE_V2_NONE;
 
+	/*
+	 * X86_FEATURE_INDIRECT_BRANCH_OK indicates that indirect calls are
+	 * "OK" to use due to (at least) one of the following being true:
+	 *
+	 *   - the CPU isn't vulnerable to Spectre v2, BHI, etc;
+	 *
+	 *   - a HW mitigation is in place (e.g., IBRS, eIBRS+BHI_DIS_S); or
+	 *
+	 *   - the user disabled mitigations.
+	 *
+	 * Clearing the bit enables certain indirect branch "easy targets" [*]
+	 * to be converted to a series of direct branches.
+	 *
+	 * Assume innocence until proven guilty: set it now and clear it later
+	 * if/when needed.
+	 *
+	 * [*] The closer the indirect branch is to kernel entry, and the more
+	 *     user-controlled registers there are, the easier target it may be
+	 *     for future Spectre v2 variants.
+	 */
+	setup_force_cpu_cap(X86_FEATURE_INDIRECT_BRANCH_OK);
+
 	/*
 	 * If the CPU is not affected and the command line mode is NONE or AUTO
 	 * then nothing to do.
@@ -1765,11 +1793,16 @@ static void __init spectre_v2_select_mitigation(void)
 		break;
 
 	case SPECTRE_V2_LFENCE:
+		setup_clear_cpu_cap(X86_FEATURE_INDIRECT_BRANCH_OK);
+		fallthrough;
 	case SPECTRE_V2_EIBRS_LFENCE:
 		setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE);
-		fallthrough;
+		setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
+		break;
 
 	case SPECTRE_V2_RETPOLINE:
+		setup_clear_cpu_cap(X86_FEATURE_INDIRECT_BRANCH_OK);
+		fallthrough;
 	case SPECTRE_V2_EIBRS_RETPOLINE:
 		setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
 		break;
-- 
2.44.0


  reply	other threads:[~2024-04-19 21:09 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-04-19 21:09 [PATCH 0/5] x86/bugs: more BHI fixes Josh Poimboeuf
2024-04-19 21:09 ` Josh Poimboeuf [this message]
2024-04-22  8:09   ` [PATCH v4 1/5] x86/bugs: Only harden syscalls when needed Yujie Liu
2024-05-07  5:17     ` Josh Poimboeuf
2024-04-19 21:09 ` [PATCH v4 2/5] cpu/speculation: Fix CPU mitigation defaults for !x86 Josh Poimboeuf
2024-04-20  0:09   ` Sean Christopherson
2024-04-23 14:10     ` Sean Christopherson
2024-04-24  5:35       ` Josh Poimboeuf
2024-04-19 21:09 ` [PATCH v4 3/5] x86/syscall: Mark exit[_group] syscall handlers __noreturn Josh Poimboeuf
2024-04-20 13:58   ` Paul E. McKenney
2024-04-21  5:25     ` Josh Poimboeuf
2024-04-21 20:40       ` Paul McKenney
2024-04-21 21:47         ` Paul McKenney
2024-05-02 23:48           ` Paul McKenney
2024-05-03 15:38             ` Paul E. McKenney
2024-05-03 19:56             ` Josh Poimboeuf
2024-05-03 20:44               ` Josh Poimboeuf
2024-05-03 23:33                 ` Paul E. McKenney
2024-05-03 23:48                   ` Josh Poimboeuf
2024-05-04 16:48                     ` Paul E. McKenney
2024-04-19 21:09 ` [PATCH v4 4/5] x86/bugs: Remove duplicate Spectre cmdline option descriptions Josh Poimboeuf
2024-04-19 21:09 ` [PATCH v4 5/5] x86/bugs: Add 'spectre_bhi=vmexit' cmdline option Josh Poimboeuf
2024-04-19 21:46   ` Josh Poimboeuf

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=982d05a2f669140f26500bee643011896d661094.1713559768.git.jpoimboe@kernel.org \
    --to=jpoimboe@kernel.org \
    --cc=alexandre.chartre@oracle.com \
    --cc=andrew.cooper3@citrix.com \
    --cc=bp@alien8.de \
    --cc=daniel.sneddon@linux.intel.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=gregkh@linuxfoundation.org \
    --cc=konrad.wilk@oracle.com \
    --cc=kpsingh@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=longman@redhat.com \
    --cc=mingo@kernel.org \
    --cc=nik.borisov@suse.com \
    --cc=pawan.kumar.gupta@linux.intel.com \
    --cc=peterz@infradead.org \
    --cc=seanjc@google.com \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.