[PATCH v2 1/1] riscv: assembler versions of memcpy, memmove, memset

From: Heinrich Schuchardt @ 2021-03-27 11:37 UTC
To: u-boot

Provide assembly-optimized versions of memcpy(), memmove(), and memset()
copied from the Linux kernel. The implementations are provided as weak
symbols and can be enabled separately for U-Boot proper, SPL, and TPL
via new Kconfig options.

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
Reviewed-by: Leo Yu-Chi Liang <ycliang@andestech.com>
---
v2:
	remove duplicate line in arch/riscv/lib/Makefile
---
 arch/riscv/Kconfig              |  78 ++++++++++++++++++++++
 arch/riscv/include/asm/string.h |  36 +++++-----
 arch/riscv/lib/Makefile         |   4 ++
 arch/riscv/lib/memcpy.S         | 108 ++++++++++++++++++++++++++++++
 arch/riscv/lib/memmove.S        |  64 ++++++++++++++++++
 arch/riscv/lib/memset.S         | 113 ++++++++++++++++++++++++++++++++
 6 files changed, 382 insertions(+), 21 deletions(-)
 create mode 100644 arch/riscv/lib/memcpy.S
 create mode 100644 arch/riscv/lib/memmove.S
 create mode 100644 arch/riscv/lib/memset.S

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 55eaee2da6..7d0bd73ce2 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -271,4 +271,82 @@ config STACK_SIZE_SHIFT
 config OF_BOARD_FIXUP
 	default y if OF_SEPARATE && RISCV_SMODE

+config USE_ARCH_MEMCPY
+	bool "Use an assembly optimized implementation of memcpy"
+	default y
+	help
+	  Enable the generation of an optimized version of memcpy.
+	  Such an implementation may be faster under some conditions
+	  but may increase the binary size.
+
+config SPL_USE_ARCH_MEMCPY
+	bool "Use an assembly optimized implementation of memcpy for SPL"
+	default y if USE_ARCH_MEMCPY
+	depends on SPL
+	help
+	  Enable the generation of an optimized version of memcpy.
+	  Such an implementation may be faster under some conditions
+	  but may increase the binary size.
+
+config TPL_USE_ARCH_MEMCPY
+	bool "Use an assembly optimized implementation of memcpy for TPL"
+	default y if USE_ARCH_MEMCPY
+	depends on TPL
+	help
+	  Enable the generation of an optimized version of memcpy.
+	  Such an implementation may be faster under some conditions
+	  but may increase the binary size.
+
+config USE_ARCH_MEMMOVE
+	bool "Use an assembly optimized implementation of memmove"
+	default y
+	help
+	  Enable the generation of an optimized version of memmove.
+	  Such an implementation may be faster under some conditions
+	  but may increase the binary size.
+
+config SPL_USE_ARCH_MEMMOVE
+	bool "Use an assembly optimized implementation of memmove for SPL"
+	default y if USE_ARCH_MEMMOVE
+	depends on SPL
+	help
+	  Enable the generation of an optimized version of memmove.
+	  Such an implementation may be faster under some conditions
+	  but may increase the binary size.
+
+config TPL_USE_ARCH_MEMMOVE
+	bool "Use an assembly optimized implementation of memmove for TPL"
+	default y if USE_ARCH_MEMMOVE
+	depends on TPL
+	help
+	  Enable the generation of an optimized version of memmove.
+	  Such an implementation may be faster under some conditions
+	  but may increase the binary size.
+
+config USE_ARCH_MEMSET
+	bool "Use an assembly optimized implementation of memset"
+	default y
+	help
+	  Enable the generation of an optimized version of memset.
+	  Such an implementation may be faster under some conditions
+	  but may increase the binary size.
+
+config SPL_USE_ARCH_MEMSET
+	bool "Use an assembly optimized implementation of memset for SPL"
+	default y if USE_ARCH_MEMSET
+	depends on SPL
+	help
+	  Enable the generation of an optimized version of memset.
+	  Such an implementation may be faster under some conditions
+	  but may increase the binary size.
+
+config TPL_USE_ARCH_MEMSET
+	bool "Use an assembly optimized implementation of memset for TPL"
+	default y if USE_ARCH_MEMSET
+	depends on TPL
+	help
+	  Enable the generation of an optimized version of memset.
+	  Such an implementation may be faster under some conditions
+	  but may increase the binary size.
+
 endmenu
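
All of these options default to y (the SPL and TPL variants follow the
corresponding main symbol), so the assembly routines are used unless a
board opts out. Boards that are tight on image size can disable them per
build phase. A hypothetical defconfig fragment, not part of this patch,
that keeps the generic C routines in SPL while using the assembly ones
in U-Boot proper could look like this:

  CONFIG_USE_ARCH_MEMCPY=y
  CONFIG_USE_ARCH_MEMMOVE=y
  CONFIG_USE_ARCH_MEMSET=y
  # CONFIG_SPL_USE_ARCH_MEMCPY is not set
  # CONFIG_SPL_USE_ARCH_MEMMOVE is not set
  # CONFIG_SPL_USE_ARCH_MEMSET is not set
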
diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
index 0fc3424a2f..7dee3e4c9f 100644
--- a/arch/riscv/include/asm/string.h
+++ b/arch/riscv/include/asm/string.h
@@ -19,31 +19,25 @@

 #undef __HAVE_ARCH_STRRCHR
 #undef __HAVE_ARCH_STRCHR
-#undef __HAVE_ARCH_MEMCPY
-#undef __HAVE_ARCH_MEMMOVE
 #undef __HAVE_ARCH_MEMCHR
 #undef __HAVE_ARCH_MEMZERO
-#undef __HAVE_ARCH_MEMSET

-#ifdef CONFIG_MARCO_MEMSET
-#define memset(_p, _v, _n)	\
-	(typeof(_p) (p) = (_p); \
-	 typeof(_v) (v) = (_v); \
-	 typeof(_n) (n) = (_n); \
-	 {								\
-		if ((n) != 0) {						\
-			if (__builtin_constant_p((v)) && (v) == 0)	\
-				__memzero((p), (n));			\
-			else						\
-				memset((p), (v), (n));			\
-		}							\
-		(p);							\
-	})
+#undef __HAVE_ARCH_MEMCPY
+#if CONFIG_IS_ENABLED(USE_ARCH_MEMCPY)
+#define __HAVE_ARCH_MEMCPY
+#endif
+extern void *memcpy(void *, const void *, __kernel_size_t);
+
+#undef __HAVE_ARCH_MEMMOVE
+#if CONFIG_IS_ENABLED(USE_ARCH_MEMMOVE)
+#define __HAVE_ARCH_MEMMOVE
+#endif
+extern void *memmove(void *, const void *, __kernel_size_t);

-#define memzero(_p, _n) \
-	(typeof(_p) (p) = (_p); \
-	 typeof(_n) (n) = (_n); \
-	 { if ((n) != 0) __memzero((p), (n)); (p); })
+#undef __HAVE_ARCH_MEMSET
+#if CONFIG_IS_ENABLED(USE_ARCH_MEMSET)
+#define __HAVE_ARCH_MEMSET
 #endif
+extern void *memset(void *, int, __kernel_size_t);

 #endif /* __ASM_RISCV_STRING_H */
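
Defining __HAVE_ARCH_MEMCPY, __HAVE_ARCH_MEMMOVE and __HAVE_ARCH_MEMSET
compiles out the generic C fallbacks: U-Boot's generic string code
guards each routine with the corresponding macro, roughly like this
(a simplified paraphrase of lib/string.c, not part of this patch):

  #ifndef __HAVE_ARCH_MEMCPY
  void *memcpy(void *dest, const void *src, size_t count)
  {
      char *tmp = dest;
      const char *s = src;

      while (count--)
          *tmp++ = *s++;
      return dest;
  }
  #endif
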
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 12c14f2019..ff0677aa96 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -36,3 +36,7 @@ CFLAGS_REMOVE_$(EFI_RELOC) := $(CFLAGS_NON_EFI)
 extra-$(CONFIG_CMD_BOOTEFI_HELLO_COMPILE) += $(EFI_CRT0) $(EFI_RELOC)
 extra-$(CONFIG_CMD_BOOTEFI_SELFTEST) += $(EFI_CRT0) $(EFI_RELOC)
 extra-$(CONFIG_EFI) += $(EFI_CRT0) $(EFI_RELOC)
+
+obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMSET) += memset.o
+obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMMOVE) += memmove.o
+obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMCPY) += memcpy.o
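
$(SPL_TPL_) expands to the empty string when building U-Boot proper, to
"SPL_" for an SPL build and to "TPL_" for a TPL build, so each build
phase honours its own Kconfig symbol. During an SPL build, for example,
the memset line effectively becomes

  obj-$(CONFIG_SPL_USE_ARCH_MEMSET) += memset.o

and memset.o is only assembled into SPL when SPL_USE_ARCH_MEMSET is
enabled.
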
diff --git a/arch/riscv/lib/memcpy.S b/arch/riscv/lib/memcpy.S
new file mode 100644
index 0000000000..51ab716253
--- /dev/null
+++ b/arch/riscv/lib/memcpy.S
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2013 Regents of the University of California
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+
+/* void *memcpy(void *, const void *, size_t) */
+ENTRY(__memcpy)
+WEAK(memcpy)
+	move t6, a0  /* Preserve return value */
+
+	/* Defer to byte-oriented copy for small sizes */
+	sltiu a3, a2, 128
+	bnez a3, 4f
+	/* Use word-oriented copy only if low-order bits match */
+	andi a3, t6, SZREG-1
+	andi a4, a1, SZREG-1
+	bne a3, a4, 4f
+
+	beqz a3, 2f  /* Skip if already aligned */
+	/*
+	 * Round to nearest double word-aligned address
+	 * greater than or equal to start address
+	 */
+	andi a3, a1, ~(SZREG-1)
+	addi a3, a3, SZREG
+	/* Handle initial misalignment */
+	sub a4, a3, a1
+1:
+	lb a5, 0(a1)
+	addi a1, a1, 1
+	sb a5, 0(t6)
+	addi t6, t6, 1
+	bltu a1, a3, 1b
+	sub a2, a2, a4  /* Update count */
+
+2:
+	andi a4, a2, ~((16*SZREG)-1)
+	beqz a4, 4f
+	add a3, a1, a4
+3:
+	REG_L a4,       0(a1)
+	REG_L a5,   SZREG(a1)
+	REG_L a6, 2*SZREG(a1)
+	REG_L a7, 3*SZREG(a1)
+	REG_L t0, 4*SZREG(a1)
+	REG_L t1, 5*SZREG(a1)
+	REG_L t2, 6*SZREG(a1)
+	REG_L t3, 7*SZREG(a1)
+	REG_L t4, 8*SZREG(a1)
+	REG_L t5, 9*SZREG(a1)
+	REG_S a4,       0(t6)
+	REG_S a5,   SZREG(t6)
+	REG_S a6, 2*SZREG(t6)
+	REG_S a7, 3*SZREG(t6)
+	REG_S t0, 4*SZREG(t6)
+	REG_S t1, 5*SZREG(t6)
+	REG_S t2, 6*SZREG(t6)
+	REG_S t3, 7*SZREG(t6)
+	REG_S t4, 8*SZREG(t6)
+	REG_S t5, 9*SZREG(t6)
+	REG_L a4, 10*SZREG(a1)
+	REG_L a5, 11*SZREG(a1)
+	REG_L a6, 12*SZREG(a1)
+	REG_L a7, 13*SZREG(a1)
+	REG_L t0, 14*SZREG(a1)
+	REG_L t1, 15*SZREG(a1)
+	addi a1, a1, 16*SZREG
+	REG_S a4, 10*SZREG(t6)
+	REG_S a5, 11*SZREG(t6)
+	REG_S a6, 12*SZREG(t6)
+	REG_S a7, 13*SZREG(t6)
+	REG_S t0, 14*SZREG(t6)
+	REG_S t1, 15*SZREG(t6)
+	addi t6, t6, 16*SZREG
+	bltu a1, a3, 3b
+	andi a2, a2, (16*SZREG)-1  /* Update count */
+
+4:
+	/* Handle trailing misalignment */
+	beqz a2, 6f
+	add a3, a1, a2
+
+	/* Use word-oriented copy if co-aligned to word boundary */
+	or a5, a1, t6
+	or a5, a5, a3
+	andi a5, a5, 3
+	bnez a5, 5f
+7:
+	lw a4, 0(a1)
+	addi a1, a1, 4
+	sw a4, 0(t6)
+	addi t6, t6, 4
+	bltu a1, a3, 7b
+
+	ret
+
+5:
+	lb a4, 0(a1)
+	addi a1, a1, 1
+	sb a4, 0(t6)
+	addi t6, t6, 1
+	bltu a1, a3, 5b
+6:
+	ret
+END(__memcpy)
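
For readers less familiar with RISC-V assembly, the strategy above can
be summarized in C. This is a rough illustrative model, not code from
the patch (the function name is invented): copies shorter than 128 bytes
skip the unrolled path, mutually aligned buffers are first brought up to
a register boundary and then copied in unrolled chunks of 16 registers,
and the tail is finished 4-byte- or byte-wise.

  #include <stddef.h>
  #include <stdint.h>

  /* "long" stands in for one XLEN register (SZREG bytes). */
  void *memcpy_sketch(void *dst, const void *src, size_t n)
  {
      unsigned char *d = dst;
      const unsigned char *s = src;

      /* Word-oriented path only for large copies whose source and
       * destination share the same alignment.
       */
      if (n >= 128 &&
          ((uintptr_t)d & (sizeof(long) - 1)) ==
          ((uintptr_t)s & (sizeof(long) - 1))) {
          /* Handle initial misalignment byte by byte */
          while ((uintptr_t)s & (sizeof(long) - 1)) {
              *d++ = *s++;
              n--;
          }
          /* Main loop: 16 register-sized transfers per pass */
          while (n >= 16 * sizeof(long)) {
              int i;

              for (i = 0; i < 16; i++) {
                  *(long *)(void *)d = *(const long *)(const void *)s;
                  d += sizeof(long);
                  s += sizeof(long);
              }
              n -= 16 * sizeof(long);
          }
      }
      /* Tail: 4-byte words if start, end and count are all 4-byte
       * aligned, otherwise single bytes.
       */
      if (!(((uintptr_t)d | (uintptr_t)s | n) & 3)) {
          while (n) {
              *(uint32_t *)(void *)d = *(const uint32_t *)(const void *)s;
              d += 4;
              s += 4;
              n -= 4;
          }
      }
      while (n--)
          *d++ = *s++;

      return dst;
  }
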
diff --git a/arch/riscv/lib/memmove.S b/arch/riscv/lib/memmove.S
new file mode 100644
index 0000000000..07d1d2152b
--- /dev/null
+++ b/arch/riscv/lib/memmove.S
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+
+ENTRY(__memmove)
+WEAK(memmove)
+        move    t0, a0
+        move    t1, a1
+
+        beq     a0, a1, exit_memcpy
+        beqz    a2, exit_memcpy
+        srli    t2, a2, 0x2
+
+        slt     t3, a0, a1
+        beqz    t3, do_reverse
+
+        andi    a2, a2, 0x3
+        li      t4, 1
+        beqz    t2, byte_copy
+
+word_copy:
+        lw      t3, 0(a1)
+        addi    t2, t2, -1
+        addi    a1, a1, 4
+        sw      t3, 0(a0)
+        addi    a0, a0, 4
+        bnez    t2, word_copy
+        beqz    a2, exit_memcpy
+        j       byte_copy
+
+do_reverse:
+        add     a0, a0, a2
+        add     a1, a1, a2
+        andi    a2, a2, 0x3
+        li      t4, -1
+        beqz    t2, reverse_byte_copy
+
+reverse_word_copy:
+        addi    a1, a1, -4
+        addi    t2, t2, -1
+        lw      t3, 0(a1)
+        addi    a0, a0, -4
+        sw      t3, 0(a0)
+        bnez    t2, reverse_word_copy
+        beqz    a2, exit_memcpy
+
+reverse_byte_copy:
+        addi    a0, a0, -1
+        addi    a1, a1, -1
+
+byte_copy:
+        lb      t3, 0(a1)
+        addi    a2, a2, -1
+        sb      t3, 0(a0)
+        add     a1, a1, t4
+        add     a0, a0, t4
+        bnez    a2, byte_copy
+
+exit_memcpy:
+        move a0, t0
+        move a1, t1
+        ret
+END(__memmove)
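
The control flow above amounts to the classic overlap-safe copy: move
forward when the destination lies below the source, backward otherwise,
using 4-byte words for the bulk and single bytes for the remainder. A
rough C model follows (the function name is invented for this sketch;
note that the assembly issues lw/sw without aligning first, so it
assumes misaligned accesses work, in hardware or via a trap handler):

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  void *memmove_sketch(void *dst, const void *src, size_t n)
  {
      unsigned char *d = dst;
      const unsigned char *s = src;
      size_t words = n >> 2;
      size_t bytes = n & 3;
      uint32_t w;

      if (d == s || n == 0)
          return dst;

      if (d < s) {                /* forward copy is safe */
          for (; words; words--) {
              memcpy(&w, s, 4);   /* one 32-bit word through a
                                   * temporary, like the lw/sw pair */
              memcpy(d, &w, 4);
              s += 4;
              d += 4;
          }
          while (bytes--)
              *d++ = *s++;
      } else {                    /* overlap: copy backward */
          d += n;
          s += n;
          for (; words; words--) {
              s -= 4;
              d -= 4;
              memcpy(&w, s, 4);
              memcpy(d, &w, 4);
          }
          while (bytes--)
              *--d = *--s;
      }
      return dst;
  }
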
diff --git a/arch/riscv/lib/memset.S b/arch/riscv/lib/memset.S
new file mode 100644
index 0000000000..34c5360c67
--- /dev/null
+++ b/arch/riscv/lib/memset.S
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2013 Regents of the University of California
+ */
+
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+
+/* void *memset(void *, int, size_t) */
+ENTRY(__memset)
+WEAK(memset)
+	move t0, a0  /* Preserve return value */
+
+	/* Defer to byte-oriented fill for small sizes */
+	sltiu a3, a2, 16
+	bnez a3, 4f
+
+	/*
+	 * Round to nearest XLEN-aligned address
+	 * greater than or equal to start address
+	 */
+	addi a3, t0, SZREG-1
+	andi a3, a3, ~(SZREG-1)
+	beq a3, t0, 2f  /* Skip if already aligned */
+	/* Handle initial misalignment */
+	sub a4, a3, t0
+1:
+	sb a1, 0(t0)
+	addi t0, t0, 1
+	bltu t0, a3, 1b
+	sub a2, a2, a4  /* Update count */
+
+2: /* Duff's device with 32 XLEN stores per iteration */
+	/* Broadcast value into all bytes */
+	andi a1, a1, 0xff
+	slli a3, a1, 8
+	or a1, a3, a1
+	slli a3, a1, 16
+	or a1, a3, a1
+#ifdef CONFIG_64BIT
+	slli a3, a1, 32
+	or a1, a3, a1
+#endif
+
+	/* Calculate end address */
+	andi a4, a2, ~(SZREG-1)
+	add a3, t0, a4
+
+	andi a4, a4, 31*SZREG  /* Calculate remainder */
+	beqz a4, 3f            /* Shortcut if no remainder */
+	neg a4, a4
+	addi a4, a4, 32*SZREG  /* Calculate initial offset */
+
+	/* Adjust start address with offset */
+	sub t0, t0, a4
+
+	/* Jump into loop body */
+	/* Assumes 32-bit instruction lengths */
+	la a5, 3f
+#ifdef CONFIG_64BIT
+	srli a4, a4, 1
+#endif
+	add a5, a5, a4
+	jr a5
+3:
+	REG_S a1,        0(t0)
+	REG_S a1,    SZREG(t0)
+	REG_S a1,  2*SZREG(t0)
+	REG_S a1,  3*SZREG(t0)
+	REG_S a1,  4*SZREG(t0)
+	REG_S a1,  5*SZREG(t0)
+	REG_S a1,  6*SZREG(t0)
+	REG_S a1,  7*SZREG(t0)
+	REG_S a1,  8*SZREG(t0)
+	REG_S a1,  9*SZREG(t0)
+	REG_S a1, 10*SZREG(t0)
+	REG_S a1, 11*SZREG(t0)
+	REG_S a1, 12*SZREG(t0)
+	REG_S a1, 13*SZREG(t0)
+	REG_S a1, 14*SZREG(t0)
+	REG_S a1, 15*SZREG(t0)
+	REG_S a1, 16*SZREG(t0)
+	REG_S a1, 17*SZREG(t0)
+	REG_S a1, 18*SZREG(t0)
+	REG_S a1, 19*SZREG(t0)
+	REG_S a1, 20*SZREG(t0)
+	REG_S a1, 21*SZREG(t0)
+	REG_S a1, 22*SZREG(t0)
+	REG_S a1, 23*SZREG(t0)
+	REG_S a1, 24*SZREG(t0)
+	REG_S a1, 25*SZREG(t0)
+	REG_S a1, 26*SZREG(t0)
+	REG_S a1, 27*SZREG(t0)
+	REG_S a1, 28*SZREG(t0)
+	REG_S a1, 29*SZREG(t0)
+	REG_S a1, 30*SZREG(t0)
+	REG_S a1, 31*SZREG(t0)
+	addi t0, t0, 32*SZREG
+	bltu t0, a3, 3b
+	andi a2, a2, SZREG-1  /* Update count */
+
+4:
+	/* Handle trailing misalignment */
+	beqz a2, 6f
+	add a3, t0, a2
+5:
+	sb a1, 0(t0)
+	addi t0, t0, 1
+	bltu t0, a3, 5b
+6:
+	ret
+END(__memset)
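
The structure above mirrors the memcpy: byte fill for lengths below 16,
alignment of the start address, then a large unrolled block. The one
twist is the computed jump: instead of a separate loop for the
remainder, the code jumps into the middle of the 32-store block so that
the first (n/SZREG) mod 32 stores fall out of the same code (the Duff's
device named in the comment). A rough C model follows (the function
name is invented for this sketch; the computed jump is replaced by an
ordinary loop, which C cannot express directly):

  #include <stddef.h>
  #include <stdint.h>

  void *memset_sketch(void *dst, int c, size_t n)
  {
      unsigned char *d = dst;
      unsigned long v = (unsigned char)c;
      size_t i;

      if (n < 16) {               /* small: byte fill only */
          while (n--)
              *d++ = (unsigned char)c;
          return dst;
      }

      /* Handle initial misalignment byte by byte */
      while ((uintptr_t)d & (sizeof(long) - 1)) {
          *d++ = (unsigned char)c;
          n--;
      }

      /* Broadcast the fill byte into every byte of a register */
      v |= v << 8;
      v |= v << 16;
      if (sizeof(long) == 8)
          v |= v << 31 << 1;      /* v << 32, written to avoid a
                                   * shift-width warning on 32-bit */

      /* Register-wide stores; the assembly performs these 32 per
       * iteration and enters the block at the computed offset.
       */
      for (i = n / sizeof(long); i; i--) {
          *(unsigned long *)(void *)d = v;
          d += sizeof(long);
      }

      /* Trailing bytes */
      for (i = n & (sizeof(long) - 1); i; i--)
          *d++ = (unsigned char)c;

      return dst;
  }
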
--
2.30.2
