LKML Archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] x86/percpu: Define this_percpu_xchg_op()
@ 2024-03-20  8:30 Uros Bizjak
  2024-03-20  8:30 ` [PATCH 2/2] x86/percpu: Move raw_percpu_xchg_op() to a better place Uros Bizjak
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Uros Bizjak @ 2024-03-20  8:30 UTC (permalink / raw
  To: x86, linux-kernel
  Cc: Uros Bizjak, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin

Rewrite percpu_xchg_op() using generic percpu primitives instead
of using asm. The new implementation is similar to local_xchg() and
allows the compiler to perform various optimizations (e.g. the
compiler is able to create fast path through the loop, according
to likely/unlikely annotations in percpu_try_cmpxchg_op).

No functional changes intended.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
---
 arch/x86/include/asm/percpu.h | 32 +++++++++++---------------------
 1 file changed, 11 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 44958ebaf626..de991e6d050a 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -230,25 +230,15 @@ do {									\
 })
 
 /*
- * xchg is implemented using cmpxchg without a lock prefix. xchg is
- * expensive due to the implied lock prefix.  The processor cannot prefetch
- * cachelines if xchg is used.
+ * this_cpu_xchg() is implemented using cmpxchg without a lock prefix.
+ * xchg is expensive due to the implied lock prefix. The processor
+ * cannot prefetch cachelines if xchg is used.
  */
-#define percpu_xchg_op(size, qual, _var, _nval)				\
+#define this_percpu_xchg_op(_var, _nval)				\
 ({									\
-	__pcpu_type_##size pxo_old__;					\
-	__pcpu_type_##size pxo_new__ = __pcpu_cast_##size(_nval);	\
-	asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]),		\
-				    "%[oval]")				\
-		  "\n1:\t"						\
-		  __pcpu_op2_##size("cmpxchg", "%[nval]",		\
-				    __percpu_arg([var]))		\
-		  "\n\tjnz 1b"						\
-		  : [oval] "=&a" (pxo_old__),				\
-		    [var] "+m" (__my_cpu_var(_var))			\
-		  : [nval] __pcpu_reg_##size(, pxo_new__)		\
-		  : "memory");						\
-	(typeof(_var))(unsigned long) pxo_old__;			\
+	typeof(_var) pxo_old__ = this_cpu_read(_var);			\
+	do { } while (!this_cpu_try_cmpxchg(_var, &pxo_old__, _nval));	\
+	pxo_old__;							\
 })
 
 /*
@@ -534,9 +524,9 @@ do {									\
 #define this_cpu_or_1(pcp, val)		percpu_to_op(1, volatile, "or", (pcp), val)
 #define this_cpu_or_2(pcp, val)		percpu_to_op(2, volatile, "or", (pcp), val)
 #define this_cpu_or_4(pcp, val)		percpu_to_op(4, volatile, "or", (pcp), val)
-#define this_cpu_xchg_1(pcp, nval)	percpu_xchg_op(1, volatile, pcp, nval)
-#define this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(2, volatile, pcp, nval)
-#define this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(4, volatile, pcp, nval)
+#define this_cpu_xchg_1(pcp, nval)	this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval)	this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval)	this_percpu_xchg_op(pcp, nval)
 
 #define raw_cpu_add_return_1(pcp, val)		percpu_add_return_op(1, , pcp, val)
 #define raw_cpu_add_return_2(pcp, val)		percpu_add_return_op(2, , pcp, val)
@@ -575,7 +565,7 @@ do {									\
 #define this_cpu_and_8(pcp, val)		percpu_to_op(8, volatile, "and", (pcp), val)
 #define this_cpu_or_8(pcp, val)			percpu_to_op(8, volatile, "or", (pcp), val)
 #define this_cpu_add_return_8(pcp, val)		percpu_add_return_op(8, volatile, pcp, val)
-#define this_cpu_xchg_8(pcp, nval)		percpu_xchg_op(8, volatile, pcp, nval)
+#define this_cpu_xchg_8(pcp, nval)		this_percpu_xchg_op(pcp, nval)
 #define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
 #define this_cpu_try_cmpxchg_8(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
 #endif
-- 
2.44.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 2/2] x86/percpu: Move raw_percpu_xchg_op() to a better place
  2024-03-20  8:30 [PATCH 1/2] x86/percpu: Define this_percpu_xchg_op() Uros Bizjak
@ 2024-03-20  8:30 ` Uros Bizjak
  2024-03-20 11:22   ` [tip: x86/percpu] " tip-bot2 for Uros Bizjak
  2024-03-20 11:39   ` tip-bot2 for Uros Bizjak
  2024-03-20 11:22 ` [tip: x86/percpu] x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code tip-bot2 for Uros Bizjak
  2024-03-20 11:39 ` tip-bot2 for Uros Bizjak
  2 siblings, 2 replies; 9+ messages in thread
From: Uros Bizjak @ 2024-03-20  8:30 UTC (permalink / raw
  To: x86, linux-kernel
  Cc: Uros Bizjak, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin

Move raw_percpu_xchg_op() together with this_percpu_xchg_op()
and trivially rename some internal variables to harmonize them
between macro implementations.

No functional changes intended.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
---
 arch/x86/include/asm/percpu.h | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index de991e6d050a..7563e69838c4 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -229,6 +229,17 @@ do {									\
 	(typeof(_var))(unsigned long) (paro_tmp__ + _val);		\
 })
 
+/*
+ * raw_cpu_xchg() can use a load-store since
+ * it is not required to be IRQ-safe.
+ */
+#define raw_percpu_xchg_op(_var, _nval)					\
+({									\
+	typeof(_var) pxo_old__ = raw_cpu_read(_var);			\
+	raw_cpu_write(_var, _nval);					\
+	pxo_old__;							\
+})
+
 /*
  * this_cpu_xchg() is implemented using cmpxchg without a lock prefix.
  * xchg is expensive due to the implied lock prefix. The processor
@@ -499,18 +510,6 @@ do {									\
 #define raw_cpu_or_1(pcp, val)		percpu_to_op(1, , "or", (pcp), val)
 #define raw_cpu_or_2(pcp, val)		percpu_to_op(2, , "or", (pcp), val)
 #define raw_cpu_or_4(pcp, val)		percpu_to_op(4, , "or", (pcp), val)
-
-/*
- * raw_cpu_xchg() can use a load-store since it is not required to be
- * IRQ-safe.
- */
-#define raw_percpu_xchg_op(var, nval)					\
-({									\
-	typeof(var) pxo_ret__ = raw_cpu_read(var);			\
-	raw_cpu_write(var, (nval));					\
-	pxo_ret__;							\
-})
-
 #define raw_cpu_xchg_1(pcp, val)	raw_percpu_xchg_op(pcp, val)
 #define raw_cpu_xchg_2(pcp, val)	raw_percpu_xchg_op(pcp, val)
 #define raw_cpu_xchg_4(pcp, val)	raw_percpu_xchg_op(pcp, val)
-- 
2.44.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [tip: x86/percpu] x86/percpu: Move raw_percpu_xchg_op() to a better place
  2024-03-20  8:30 ` [PATCH 2/2] x86/percpu: Move raw_percpu_xchg_op() to a better place Uros Bizjak
@ 2024-03-20 11:22   ` tip-bot2 for Uros Bizjak
  2024-03-20 11:39   ` tip-bot2 for Uros Bizjak
  1 sibling, 0 replies; 9+ messages in thread
From: tip-bot2 for Uros Bizjak @ 2024-03-20 11:22 UTC (permalink / raw
  To: linux-tip-commits; +Cc: Uros Bizjak, Ingo Molnar, x86, linux-kernel

The following commit has been merged into the x86/percpu branch of tip:

Commit-ID:     733b3d4dfa6c6b55885e77f1982ef5edc2023d21
Gitweb:        https://git.kernel.org/tip/733b3d4dfa6c6b55885e77f1982ef5edc2023d21
Author:        Uros Bizjak <ubizjak@gmail.com>
AuthorDate:    Wed, 20 Mar 2024 09:30:41 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Wed, 20 Mar 2024 12:08:11 +01:00

x86/percpu: Move raw_percpu_xchg_op() to a better place

Move raw_percpu_xchg_op() together with this_percpu_xchg_op()
and trivially rename some internal variables to harmonize them
between macro implementations.

No functional changes intended.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20240320083127.493250-2-ubizjak@gmail.com
---
 arch/x86/include/asm/percpu.h | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index de991e6..7563e69 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -230,6 +230,17 @@ do {									\
 })
 
 /*
+ * raw_cpu_xchg() can use a load-store since
+ * it is not required to be IRQ-safe.
+ */
+#define raw_percpu_xchg_op(_var, _nval)					\
+({									\
+	typeof(_var) pxo_old__ = raw_cpu_read(_var);			\
+	raw_cpu_write(_var, _nval);					\
+	pxo_old__;							\
+})
+
+/*
  * this_cpu_xchg() is implemented using cmpxchg without a lock prefix.
  * xchg is expensive due to the implied lock prefix. The processor
  * cannot prefetch cachelines if xchg is used.
@@ -499,18 +510,6 @@ do {									\
 #define raw_cpu_or_1(pcp, val)		percpu_to_op(1, , "or", (pcp), val)
 #define raw_cpu_or_2(pcp, val)		percpu_to_op(2, , "or", (pcp), val)
 #define raw_cpu_or_4(pcp, val)		percpu_to_op(4, , "or", (pcp), val)
-
-/*
- * raw_cpu_xchg() can use a load-store since it is not required to be
- * IRQ-safe.
- */
-#define raw_percpu_xchg_op(var, nval)					\
-({									\
-	typeof(var) pxo_ret__ = raw_cpu_read(var);			\
-	raw_cpu_write(var, (nval));					\
-	pxo_ret__;							\
-})
-
 #define raw_cpu_xchg_1(pcp, val)	raw_percpu_xchg_op(pcp, val)
 #define raw_cpu_xchg_2(pcp, val)	raw_percpu_xchg_op(pcp, val)
 #define raw_cpu_xchg_4(pcp, val)	raw_percpu_xchg_op(pcp, val)

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [tip: x86/percpu] x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code
  2024-03-20  8:30 [PATCH 1/2] x86/percpu: Define this_percpu_xchg_op() Uros Bizjak
  2024-03-20  8:30 ` [PATCH 2/2] x86/percpu: Move raw_percpu_xchg_op() to a better place Uros Bizjak
@ 2024-03-20 11:22 ` tip-bot2 for Uros Bizjak
  2024-03-20 11:39 ` tip-bot2 for Uros Bizjak
  2 siblings, 0 replies; 9+ messages in thread
From: tip-bot2 for Uros Bizjak @ 2024-03-20 11:22 UTC (permalink / raw
  To: linux-tip-commits; +Cc: Uros Bizjak, Ingo Molnar, x86, linux-kernel

The following commit has been merged into the x86/percpu branch of tip:

Commit-ID:     50c6d2457d01944d58af355d324cb7106de19a66
Gitweb:        https://git.kernel.org/tip/50c6d2457d01944d58af355d324cb7106de19a66
Author:        Uros Bizjak <ubizjak@gmail.com>
AuthorDate:    Wed, 20 Mar 2024 09:30:40 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Wed, 20 Mar 2024 12:08:11 +01:00

x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code

Rewrite percpu_xchg_op() using generic percpu primitives instead
of using asm. The new implementation is similar to local_xchg() and
allows the compiler to perform various optimizations: e.g. the
compiler is able to create fast path through the loop, according
to likely/unlikely annotations in percpu_try_cmpxchg_op().

No functional changes intended.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20240320083127.493250-1-ubizjak@gmail.com
---
 arch/x86/include/asm/percpu.h | 32 +++++++++++---------------------
 1 file changed, 11 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 44958eb..de991e6 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -230,25 +230,15 @@ do {									\
 })
 
 /*
- * xchg is implemented using cmpxchg without a lock prefix. xchg is
- * expensive due to the implied lock prefix.  The processor cannot prefetch
- * cachelines if xchg is used.
+ * this_cpu_xchg() is implemented using cmpxchg without a lock prefix.
+ * xchg is expensive due to the implied lock prefix. The processor
+ * cannot prefetch cachelines if xchg is used.
  */
-#define percpu_xchg_op(size, qual, _var, _nval)				\
+#define this_percpu_xchg_op(_var, _nval)				\
 ({									\
-	__pcpu_type_##size pxo_old__;					\
-	__pcpu_type_##size pxo_new__ = __pcpu_cast_##size(_nval);	\
-	asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]),		\
-				    "%[oval]")				\
-		  "\n1:\t"						\
-		  __pcpu_op2_##size("cmpxchg", "%[nval]",		\
-				    __percpu_arg([var]))		\
-		  "\n\tjnz 1b"						\
-		  : [oval] "=&a" (pxo_old__),				\
-		    [var] "+m" (__my_cpu_var(_var))			\
-		  : [nval] __pcpu_reg_##size(, pxo_new__)		\
-		  : "memory");						\
-	(typeof(_var))(unsigned long) pxo_old__;			\
+	typeof(_var) pxo_old__ = this_cpu_read(_var);			\
+	do { } while (!this_cpu_try_cmpxchg(_var, &pxo_old__, _nval));	\
+	pxo_old__;							\
 })
 
 /*
@@ -534,9 +524,9 @@ do {									\
 #define this_cpu_or_1(pcp, val)		percpu_to_op(1, volatile, "or", (pcp), val)
 #define this_cpu_or_2(pcp, val)		percpu_to_op(2, volatile, "or", (pcp), val)
 #define this_cpu_or_4(pcp, val)		percpu_to_op(4, volatile, "or", (pcp), val)
-#define this_cpu_xchg_1(pcp, nval)	percpu_xchg_op(1, volatile, pcp, nval)
-#define this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(2, volatile, pcp, nval)
-#define this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(4, volatile, pcp, nval)
+#define this_cpu_xchg_1(pcp, nval)	this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval)	this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval)	this_percpu_xchg_op(pcp, nval)
 
 #define raw_cpu_add_return_1(pcp, val)		percpu_add_return_op(1, , pcp, val)
 #define raw_cpu_add_return_2(pcp, val)		percpu_add_return_op(2, , pcp, val)
@@ -575,7 +565,7 @@ do {									\
 #define this_cpu_and_8(pcp, val)		percpu_to_op(8, volatile, "and", (pcp), val)
 #define this_cpu_or_8(pcp, val)			percpu_to_op(8, volatile, "or", (pcp), val)
 #define this_cpu_add_return_8(pcp, val)		percpu_add_return_op(8, volatile, pcp, val)
-#define this_cpu_xchg_8(pcp, nval)		percpu_xchg_op(8, volatile, pcp, nval)
+#define this_cpu_xchg_8(pcp, nval)		this_percpu_xchg_op(pcp, nval)
 #define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
 #define this_cpu_try_cmpxchg_8(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
 #endif

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [tip: x86/percpu] x86/percpu: Move raw_percpu_xchg_op() to a better place
  2024-03-20  8:30 ` [PATCH 2/2] x86/percpu: Move raw_percpu_xchg_op() to a better place Uros Bizjak
  2024-03-20 11:22   ` [tip: x86/percpu] " tip-bot2 for Uros Bizjak
@ 2024-03-20 11:39   ` tip-bot2 for Uros Bizjak
  1 sibling, 0 replies; 9+ messages in thread
From: tip-bot2 for Uros Bizjak @ 2024-03-20 11:39 UTC (permalink / raw
  To: linux-tip-commits
  Cc: Uros Bizjak, Ingo Molnar, H. Peter Anvin, Linus Torvalds, x86,
	linux-kernel

The following commit has been merged into the x86/percpu branch of tip:

Commit-ID:     ce99b9c8daff3352a2ae0c72acf44e0663095fea
Gitweb:        https://git.kernel.org/tip/ce99b9c8daff3352a2ae0c72acf44e0663095fea
Author:        Uros Bizjak <ubizjak@gmail.com>
AuthorDate:    Wed, 20 Mar 2024 09:30:41 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Wed, 20 Mar 2024 12:29:17 +01:00

x86/percpu: Move raw_percpu_xchg_op() to a better place

Move raw_percpu_xchg_op() together with this_percpu_xchg_op()
and trivially rename some internal variables to harmonize them
between macro implementations.

No functional changes intended.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20240320083127.493250-2-ubizjak@gmail.com
---
 arch/x86/include/asm/percpu.h | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index de991e6..7563e69 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -230,6 +230,17 @@ do {									\
 })
 
 /*
+ * raw_cpu_xchg() can use a load-store since
+ * it is not required to be IRQ-safe.
+ */
+#define raw_percpu_xchg_op(_var, _nval)					\
+({									\
+	typeof(_var) pxo_old__ = raw_cpu_read(_var);			\
+	raw_cpu_write(_var, _nval);					\
+	pxo_old__;							\
+})
+
+/*
  * this_cpu_xchg() is implemented using cmpxchg without a lock prefix.
  * xchg is expensive due to the implied lock prefix. The processor
  * cannot prefetch cachelines if xchg is used.
@@ -499,18 +510,6 @@ do {									\
 #define raw_cpu_or_1(pcp, val)		percpu_to_op(1, , "or", (pcp), val)
 #define raw_cpu_or_2(pcp, val)		percpu_to_op(2, , "or", (pcp), val)
 #define raw_cpu_or_4(pcp, val)		percpu_to_op(4, , "or", (pcp), val)
-
-/*
- * raw_cpu_xchg() can use a load-store since it is not required to be
- * IRQ-safe.
- */
-#define raw_percpu_xchg_op(var, nval)					\
-({									\
-	typeof(var) pxo_ret__ = raw_cpu_read(var);			\
-	raw_cpu_write(var, (nval));					\
-	pxo_ret__;							\
-})
-
 #define raw_cpu_xchg_1(pcp, val)	raw_percpu_xchg_op(pcp, val)
 #define raw_cpu_xchg_2(pcp, val)	raw_percpu_xchg_op(pcp, val)
 #define raw_cpu_xchg_4(pcp, val)	raw_percpu_xchg_op(pcp, val)

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [tip: x86/percpu] x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code
  2024-03-20  8:30 [PATCH 1/2] x86/percpu: Define this_percpu_xchg_op() Uros Bizjak
  2024-03-20  8:30 ` [PATCH 2/2] x86/percpu: Move raw_percpu_xchg_op() to a better place Uros Bizjak
  2024-03-20 11:22 ` [tip: x86/percpu] x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code tip-bot2 for Uros Bizjak
@ 2024-03-20 11:39 ` tip-bot2 for Uros Bizjak
  2024-03-20 11:45   ` Ingo Molnar
  2 siblings, 1 reply; 9+ messages in thread
From: tip-bot2 for Uros Bizjak @ 2024-03-20 11:39 UTC (permalink / raw
  To: linux-tip-commits
  Cc: Uros Bizjak, Ingo Molnar, H. Peter Anvin, Linus Torvalds, x86,
	linux-kernel

The following commit has been merged into the x86/percpu branch of tip:

Commit-ID:     0539084639f3835c8d0b798e6659ec14a266b4f1
Gitweb:        https://git.kernel.org/tip/0539084639f3835c8d0b798e6659ec14a266b4f1
Author:        Uros Bizjak <ubizjak@gmail.com>
AuthorDate:    Wed, 20 Mar 2024 09:30:40 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Wed, 20 Mar 2024 12:29:02 +01:00

x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code

Rewrite percpu_xchg_op() using generic percpu primitives instead
of using asm. The new implementation is similar to local_xchg() and
allows the compiler to perform various optimizations: e.g. the
compiler is able to create fast path through the loop, according
to likely/unlikely annotations in percpu_try_cmpxchg_op().

No functional changes intended.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20240320083127.493250-1-ubizjak@gmail.com
---
 arch/x86/include/asm/percpu.h | 32 +++++++++++---------------------
 1 file changed, 11 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 44958eb..de991e6 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -230,25 +230,15 @@ do {									\
 })
 
 /*
- * xchg is implemented using cmpxchg without a lock prefix. xchg is
- * expensive due to the implied lock prefix.  The processor cannot prefetch
- * cachelines if xchg is used.
+ * this_cpu_xchg() is implemented using cmpxchg without a lock prefix.
+ * xchg is expensive due to the implied lock prefix. The processor
+ * cannot prefetch cachelines if xchg is used.
  */
-#define percpu_xchg_op(size, qual, _var, _nval)				\
+#define this_percpu_xchg_op(_var, _nval)				\
 ({									\
-	__pcpu_type_##size pxo_old__;					\
-	__pcpu_type_##size pxo_new__ = __pcpu_cast_##size(_nval);	\
-	asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]),		\
-				    "%[oval]")				\
-		  "\n1:\t"						\
-		  __pcpu_op2_##size("cmpxchg", "%[nval]",		\
-				    __percpu_arg([var]))		\
-		  "\n\tjnz 1b"						\
-		  : [oval] "=&a" (pxo_old__),				\
-		    [var] "+m" (__my_cpu_var(_var))			\
-		  : [nval] __pcpu_reg_##size(, pxo_new__)		\
-		  : "memory");						\
-	(typeof(_var))(unsigned long) pxo_old__;			\
+	typeof(_var) pxo_old__ = this_cpu_read(_var);			\
+	do { } while (!this_cpu_try_cmpxchg(_var, &pxo_old__, _nval));	\
+	pxo_old__;							\
 })
 
 /*
@@ -534,9 +524,9 @@ do {									\
 #define this_cpu_or_1(pcp, val)		percpu_to_op(1, volatile, "or", (pcp), val)
 #define this_cpu_or_2(pcp, val)		percpu_to_op(2, volatile, "or", (pcp), val)
 #define this_cpu_or_4(pcp, val)		percpu_to_op(4, volatile, "or", (pcp), val)
-#define this_cpu_xchg_1(pcp, nval)	percpu_xchg_op(1, volatile, pcp, nval)
-#define this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(2, volatile, pcp, nval)
-#define this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(4, volatile, pcp, nval)
+#define this_cpu_xchg_1(pcp, nval)	this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval)	this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval)	this_percpu_xchg_op(pcp, nval)
 
 #define raw_cpu_add_return_1(pcp, val)		percpu_add_return_op(1, , pcp, val)
 #define raw_cpu_add_return_2(pcp, val)		percpu_add_return_op(2, , pcp, val)
@@ -575,7 +565,7 @@ do {									\
 #define this_cpu_and_8(pcp, val)		percpu_to_op(8, volatile, "and", (pcp), val)
 #define this_cpu_or_8(pcp, val)			percpu_to_op(8, volatile, "or", (pcp), val)
 #define this_cpu_add_return_8(pcp, val)		percpu_add_return_op(8, volatile, pcp, val)
-#define this_cpu_xchg_8(pcp, nval)		percpu_xchg_op(8, volatile, pcp, nval)
+#define this_cpu_xchg_8(pcp, nval)		this_percpu_xchg_op(pcp, nval)
 #define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
 #define this_cpu_try_cmpxchg_8(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
 #endif

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [tip: x86/percpu] x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code
  2024-03-20 11:39 ` tip-bot2 for Uros Bizjak
@ 2024-03-20 11:45   ` Ingo Molnar
  2024-03-20 13:12     ` Uros Bizjak
  0 siblings, 1 reply; 9+ messages in thread
From: Ingo Molnar @ 2024-03-20 11:45 UTC (permalink / raw
  To: linux-kernel, Uros Bizjak
  Cc: linux-tip-commits, H. Peter Anvin, Linus Torvalds, x86,
	Nathan Chancellor, Kees Cook, Josh Poimboeuf


* tip-bot2 for Uros Bizjak <tip-bot2@linutronix.de> wrote:

> The following commit has been merged into the x86/percpu branch of tip:
> 
> Commit-ID:     0539084639f3835c8d0b798e6659ec14a266b4f1
> Gitweb:        https://git.kernel.org/tip/0539084639f3835c8d0b798e6659ec14a266b4f1
> Author:        Uros Bizjak <ubizjak@gmail.com>
> AuthorDate:    Wed, 20 Mar 2024 09:30:40 +01:00
> Committer:     Ingo Molnar <mingo@kernel.org>
> CommitterDate: Wed, 20 Mar 2024 12:29:02 +01:00
> 
> x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code
> 
> Rewrite percpu_xchg_op() using generic percpu primitives instead
> of using asm. The new implementation is similar to local_xchg() and
> allows the compiler to perform various optimizations: e.g. the
> compiler is able to create fast path through the loop, according
> to likely/unlikely annotations in percpu_try_cmpxchg_op().

So, while at it, there's two other x86 percpu code generation details I was 
wondering about:

1)

Right now it's GCC-only:

  config CC_HAS_NAMED_AS
          def_bool CC_IS_GCC && GCC_VERSION >= 120100

Because we wanted to create a stable core of known-working functionality.

I suppose we have already established that with the current merge window, 
so it might be time to expand it.

Clang claims to be compatible:

  https://releases.llvm.org/9.0.0/tools/clang/docs/LanguageExtensions.html

  "You can also use the GCC compatibility macros __seg_fs and __seg_gs for the
   same purpose. The preprocessor symbols __SEG_FS and __SEG_GS indicate their
   support."

I haven't tried it yet though.

2)

Also, is the GCC_VERSION cutoff accurate - are previous GCC versions 
known-buggy, or was it primarily a risk-reduction cutoff?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [tip: x86/percpu] x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code
  2024-03-20 11:45   ` Ingo Molnar
@ 2024-03-20 13:12     ` Uros Bizjak
  2024-03-20 17:37       ` Nathan Chancellor
  0 siblings, 1 reply; 9+ messages in thread
From: Uros Bizjak @ 2024-03-20 13:12 UTC (permalink / raw
  To: Ingo Molnar
  Cc: linux-kernel, linux-tip-commits, H. Peter Anvin, Linus Torvalds,
	x86, Nathan Chancellor, Kees Cook, Josh Poimboeuf

On Wed, Mar 20, 2024 at 12:45 PM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * tip-bot2 for Uros Bizjak <tip-bot2@linutronix.de> wrote:
>
> > The following commit has been merged into the x86/percpu branch of tip:
> >
> > Commit-ID:     0539084639f3835c8d0b798e6659ec14a266b4f1
> > Gitweb:        https://git.kernel.org/tip/0539084639f3835c8d0b798e6659ec14a266b4f1
> > Author:        Uros Bizjak <ubizjak@gmail.com>
> > AuthorDate:    Wed, 20 Mar 2024 09:30:40 +01:00
> > Committer:     Ingo Molnar <mingo@kernel.org>
> > CommitterDate: Wed, 20 Mar 2024 12:29:02 +01:00
> >
> > x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code
> >
> > Rewrite percpu_xchg_op() using generic percpu primitives instead
> > of using asm. The new implementation is similar to local_xchg() and
> > allows the compiler to perform various optimizations: e.g. the
> > compiler is able to create fast path through the loop, according
> > to likely/unlikely annotations in percpu_try_cmpxchg_op().
>
> So, while at it, there's two other x86 percpu code generation details I was
> wondering about:
>
> 1)
>
> Right now it's GCC-only:
>
>   config CC_HAS_NAMED_AS
>           def_bool CC_IS_GCC && GCC_VERSION >= 120100
>
> Because we wanted to create a stable core of known-working functionality.
>
> I suppose we have already established that with the current merge window,
> so it might be time to expand it.

Please note the KASAN incompatibility issue with GCC < 13.3. This
issue was fixed in the meantime, so I have posted a patch to re-enable
the named AS feature for gcc-13.3+ [1].

[1] https://lore.kernel.org/lkml/20240320124603.566923-1-ubizjak@gmail.com/

> Clang claims to be compatible:
>
>   https://releases.llvm.org/9.0.0/tools/clang/docs/LanguageExtensions.html
>
>   "You can also use the GCC compatibility macros __seg_fs and __seg_gs for the
>    same purpose. The preprocessor symbols __SEG_FS and __SEG_GS indicate their
>    support."
>
> I haven't tried it yet though.

In the RFC submission, the support was determined by the functional
check [2]. Perhaps we should re-introduce this instead of checking for
known compiler versions:

+config CC_HAS_NAMED_AS
+ def_bool $(success,echo 'int __seg_fs fs; int __seg_gs gs;' | $(CC)
-x c - -c -o /dev/null)

[2] https://lore.kernel.org/lkml/20231001131620.112484-3-ubizjak@gmail.com/

> 2)
>
> Also, is the GCC_VERSION cutoff accurate - are previous GCC versions
> known-buggy, or was it primarily a risk-reduction cutoff?

This approach was chosen from our discussion [3]. The version cutoff
is arbitrary, it was later set to gcc-12.1, because it is the version
of the compiler you used at the time ;) I have also tried gcc-11 and
gcc-10 in the past, and the compiler produced bootable image. Saying
that, the usage of named address spaces in the kernel is somehow basic
(from the compiler PoV), so I think we could try the above approach
with the functional check and see if and what breaks. We can always
disable the USE_X86_SEG_SUPPORT config variable for known bad compiler
versions.

[3] https://lore.kernel.org/lkml/ZRwZOtANkcwtL+5B@gmail.com/

BTW: Related to percpu series is the patch that fixes the issue with
%rip-relative addressing for PER_CPU_VAR in BPF. IMHO, this issue
should be fixed before rc1, otherwise call thunks will be unusable
with BPF.

[4] https://lore.kernel.org/lkml/20240316232104.368561-1-joanbrugueram@gmail.com/

Thanks,
Uros.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [tip: x86/percpu] x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code
  2024-03-20 13:12     ` Uros Bizjak
@ 2024-03-20 17:37       ` Nathan Chancellor
  0 siblings, 0 replies; 9+ messages in thread
From: Nathan Chancellor @ 2024-03-20 17:37 UTC (permalink / raw
  To: Uros Bizjak
  Cc: Ingo Molnar, linux-kernel, linux-tip-commits, H. Peter Anvin,
	Linus Torvalds, x86, Kees Cook, Josh Poimboeuf

On Wed, Mar 20, 2024 at 02:12:14PM +0100, Uros Bizjak wrote:
> On Wed, Mar 20, 2024 at 12:45 PM Ingo Molnar <mingo@kernel.org> wrote:
> > Clang claims to be compatible:
> >
> >   https://releases.llvm.org/9.0.0/tools/clang/docs/LanguageExtensions.html
> >
> >   "You can also use the GCC compatibility macros __seg_fs and __seg_gs for the
> >    same purpose. The preprocessor symbols __SEG_FS and __SEG_GS indicate their
> >    support."
> >
> > I haven't tried it yet though.
> 
> In the RFC submission, the support was determined by the functional
> check [2]. Perhaps we should re-introduce this instead of checking for
> known compiler versions:
> 
> +config CC_HAS_NAMED_AS
> + def_bool $(success,echo 'int __seg_fs fs; int __seg_gs gs;' | $(CC)
> -x c - -c -o /dev/null)
> 
> [2] https://lore.kernel.org/lkml/20231001131620.112484-3-ubizjak@gmail.com/

I applied this change on top of current mainline (a4145ce1e7bc) and
built ARCH=x86_64 defconfig with LLVM 17.0.6 from [1] but it doesn't get
too far :)

  In file included from arch/x86/kernel/asm-offsets.c:9:
  In file included from include/linux/crypto.h:15:
  In file included from include/linux/completion.h:12:
  In file included from include/linux/swait.h:7:
  In file included from include/linux/spinlock.h:56:
  In file included from include/linux/preempt.h:79:
  In file included from arch/x86/include/asm/preempt.h:7:
  arch/x86/include/asm/current.h:47:10: error: multiple identical address spaces specified for type [-Werror,-Wduplicate-decl-specifier]
     47 |                 return this_cpu_read_const(const_pcpu_hot.current_task);
        |                        ^
  arch/x86/include/asm/percpu.h:471:34: note: expanded from macro 'this_cpu_read_const'
    471 | #define this_cpu_read_const(pcp)        __raw_cpu_read(, pcp)
        |                                         ^
  arch/x86/include/asm/percpu.h:441:30: note: expanded from macro '__raw_cpu_read'
    441 |         *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp));               \
        |                                     ^
  arch/x86/include/asm/percpu.h:105:28: note: expanded from macro '__my_cpu_ptr'
    105 | #define __my_cpu_ptr(ptr)       (__my_cpu_type(*ptr) *)(uintptr_t)(ptr)
        |                                  ^
  arch/x86/include/asm/percpu.h:104:40: note: expanded from macro '__my_cpu_type'
    104 | #define __my_cpu_type(var)      typeof(var) __percpu_seg_override
        |                                             ^
  arch/x86/include/asm/percpu.h:45:31: note: expanded from macro '__percpu_seg_override'
     45 | #define __percpu_seg_override   __seg_gs
        |                                 ^
  <built-in>:338:33: note: expanded from macro '__seg_gs'
    338 | #define __seg_gs __attribute__((address_space(256)))
        |                                 ^
  In file included from arch/x86/kernel/asm-offsets.c:9:
  In file included from include/linux/crypto.h:15:
  In file included from include/linux/completion.h:12:
  In file included from include/linux/swait.h:7:
  In file included from include/linux/spinlock.h:56:
  In file included from include/linux/preempt.h:79:
  In file included from arch/x86/include/asm/preempt.h:7:
  arch/x86/include/asm/current.h:47:10: error: multiple identical address spaces specified for type [-Werror,-Wduplicate-decl-specifier]
  arch/x86/include/asm/percpu.h:471:34: note: expanded from macro 'this_cpu_read_const'
    471 | #define this_cpu_read_const(pcp)        __raw_cpu_read(, pcp)
        |                                         ^
  arch/x86/include/asm/percpu.h:441:9: note: expanded from macro '__raw_cpu_read'
    441 |         *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp));               \
        |                ^
  arch/x86/include/asm/percpu.h:104:40: note: expanded from macro '__my_cpu_type'
    104 | #define __my_cpu_type(var)      typeof(var) __percpu_seg_override
        |                                             ^
  arch/x86/include/asm/percpu.h:45:31: note: expanded from macro '__percpu_seg_override'
     45 | #define __percpu_seg_override   __seg_gs
        |                                 ^
  <built-in>:338:33: note: expanded from macro '__seg_gs'
    338 | #define __seg_gs __attribute__((address_space(256)))
        |                                 ^
  In file included from arch/x86/kernel/asm-offsets.c:9:
  In file included from include/linux/crypto.h:15:
  In file included from include/linux/completion.h:12:
  In file included from include/linux/swait.h:7:
  In file included from include/linux/spinlock.h:60:
  In file included from include/linux/thread_info.h:60:
  In file included from arch/x86/include/asm/thread_info.h:59:
  In file included from arch/x86/include/asm/cpufeature.h:5:
  arch/x86/include/asm/processor.h:530:10: error: multiple identical address spaces specified for type [-Werror,-Wduplicate-decl-specifier]
    530 |                 return this_cpu_read_const(const_pcpu_hot.top_of_stack);
        |                        ^
  arch/x86/include/asm/percpu.h:471:34: note: expanded from macro 'this_cpu_read_const'
    471 | #define this_cpu_read_const(pcp)        __raw_cpu_read(, pcp)
        |                                         ^
  arch/x86/include/asm/percpu.h:441:30: note: expanded from macro '__raw_cpu_read'
    441 |         *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp));               \
        |                                     ^
  arch/x86/include/asm/percpu.h:105:28: note: expanded from macro '__my_cpu_ptr'
    105 | #define __my_cpu_ptr(ptr)       (__my_cpu_type(*ptr) *)(uintptr_t)(ptr)
        |                                  ^
  arch/x86/include/asm/percpu.h:104:40: note: expanded from macro '__my_cpu_type'
    104 | #define __my_cpu_type(var)      typeof(var) __percpu_seg_override
        |                                             ^
  arch/x86/include/asm/percpu.h:45:31: note: expanded from macro '__percpu_seg_override'
     45 | #define __percpu_seg_override   __seg_gs
        |                                 ^
  <built-in>:338:33: note: expanded from macro '__seg_gs'
    338 | #define __seg_gs __attribute__((address_space(256)))
        |                                 ^
  In file included from arch/x86/kernel/asm-offsets.c:9:
  In file included from include/linux/crypto.h:15:
  In file included from include/linux/completion.h:12:
  In file included from include/linux/swait.h:7:
  In file included from include/linux/spinlock.h:60:
  In file included from include/linux/thread_info.h:60:
  In file included from arch/x86/include/asm/thread_info.h:59:
  In file included from arch/x86/include/asm/cpufeature.h:5:
  arch/x86/include/asm/processor.h:530:10: error: multiple identical address spaces specified for type [-Werror,-Wduplicate-decl-specifier]
  arch/x86/include/asm/percpu.h:471:34: note: expanded from macro 'this_cpu_read_const'
    471 | #define this_cpu_read_const(pcp)        __raw_cpu_read(, pcp)
        |                                         ^
  arch/x86/include/asm/percpu.h:441:9: note: expanded from macro '__raw_cpu_read'
    441 |         *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp));               \
        |                ^
  arch/x86/include/asm/percpu.h:104:40: note: expanded from macro '__my_cpu_type'
    104 | #define __my_cpu_type(var)      typeof(var) __percpu_seg_override
        |                                             ^
  arch/x86/include/asm/percpu.h:45:31: note: expanded from macro '__percpu_seg_override'
     45 | #define __percpu_seg_override   __seg_gs
        |                                 ^
  <built-in>:338:33: note: expanded from macro '__seg_gs'
    338 | #define __seg_gs __attribute__((address_space(256)))
        |                                 ^
  4 errors generated.

[1]: https://mirrors.edge.kernel.org/pub/tools/llvm/

Cheers,
Nathan

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2024-03-20 17:38 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-03-20  8:30 [PATCH 1/2] x86/percpu: Define this_percpu_xchg_op() Uros Bizjak
2024-03-20  8:30 ` [PATCH 2/2] x86/percpu: Move raw_percpu_xchg_op() to a better place Uros Bizjak
2024-03-20 11:22   ` [tip: x86/percpu] " tip-bot2 for Uros Bizjak
2024-03-20 11:39   ` tip-bot2 for Uros Bizjak
2024-03-20 11:22 ` [tip: x86/percpu] x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code tip-bot2 for Uros Bizjak
2024-03-20 11:39 ` tip-bot2 for Uros Bizjak
2024-03-20 11:45   ` Ingo Molnar
2024-03-20 13:12     ` Uros Bizjak
2024-03-20 17:37       ` Nathan Chancellor

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).