From: Vineet Gupta <Vineet.Gupta1@synopsys.com> To: <linux-arch@vger.kernel.org>, <linux-kernel@vger.kernel.org> Cc: <arnd@arndb.de>, <arc-linux-dev@synopsys.com>, Claudiu Zissulescu <Claudiu.Zissulescu@synopsys.com>, Vineet Gupta <Vineet.Gupta1@synopsys.com> Subject: [PATCH 09/28] ARCv2: optimised string/mem lib routines Date: Tue, 9 Jun 2015 17:18:09 +0530 [thread overview] Message-ID: <1433850508-26317-10-git-send-email-vgupta@synopsys.com> (raw) In-Reply-To: <1433850508-26317-1-git-send-email-vgupta@synopsys.com> From: Claudiu Zissulescu <claziss@synopsys.com> Signed-off-by: Vineet Gupta <vgupta@synopsys.com> --- arch/arc/lib/Makefile | 6 +- arch/arc/lib/memcpy-archs.S | 236 ++++++++++++++++++++++++++++++++++++++++++++ arch/arc/lib/memset-archs.S | 93 +++++++++++++++++ arch/arc/lib/strcmp-archs.S | 78 +++++++++++++++ 4 files changed, 411 insertions(+), 2 deletions(-) create mode 100644 arch/arc/lib/memcpy-archs.S create mode 100644 arch/arc/lib/memset-archs.S create mode 100644 arch/arc/lib/strcmp-archs.S diff --git a/arch/arc/lib/Makefile b/arch/arc/lib/Makefile index db46e200baba..b1656d156097 100644 --- a/arch/arc/lib/Makefile +++ b/arch/arc/lib/Makefile @@ -5,5 +5,7 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -lib-y := strchr-700.o strcmp.o strcpy-700.o strlen.o -lib-y += memcmp.o memcpy-700.o memset.o +lib-y := strchr-700.o strcpy-700.o strlen.o memcmp.o + +lib-$(CONFIG_ISA_ARCOMPACT) += memcpy-700.o memset.o strcmp.o +lib-$(CONFIG_ISA_ARCV2) += memcpy-archs.o memset-archs.o strcmp-archs.o diff --git a/arch/arc/lib/memcpy-archs.S b/arch/arc/lib/memcpy-archs.S new file mode 100644 index 000000000000..1b2b3acfed52 --- /dev/null +++ b/arch/arc/lib/memcpy-archs.S @@ -0,0 +1,236 @@ +/* + * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> + +#ifdef __LITTLE_ENDIAN__ +# define SHIFT_1(RX,RY,IMM) asl RX, RY, IMM ; << +# define SHIFT_2(RX,RY,IMM) lsr RX, RY, IMM ; >> +# define MERGE_1(RX,RY,IMM) asl RX, RY, IMM +# define MERGE_2(RX,RY,IMM) +# define EXTRACT_1(RX,RY,IMM) and RX, RY, 0xFFFF +# define EXTRACT_2(RX,RY,IMM) lsr RX, RY, IMM +#else +# define SHIFT_1(RX,RY,IMM) lsr RX, RY, IMM ; >> +# define SHIFT_2(RX,RY,IMM) asl RX, RY, IMM ; << +# define MERGE_1(RX,RY,IMM) asl RX, RY, IMM ; << +# define MERGE_2(RX,RY,IMM) asl RX, RY, IMM ; << +# define EXTRACT_1(RX,RY,IMM) lsr RX, RY, IMM +# define EXTRACT_2(RX,RY,IMM) lsr RX, RY, 0x08 +#endif + +#ifdef CONFIG_ARC_HAS_LL64 +# define PREFETCH_READ(RX) prefetch [RX, 56] +# define PREFETCH_WRITE(RX) prefetchw [RX, 64] +# define LOADX(DST,RX) ldd.ab DST, [RX, 8] +# define STOREX(SRC,RX) std.ab SRC, [RX, 8] +# define ZOLSHFT 5 +# define ZOLAND 0x1F +#else +# define PREFETCH_READ(RX) prefetch [RX, 28] +# define PREFETCH_WRITE(RX) prefetchw [RX, 32] +# define LOADX(DST,RX) ld.ab DST, [RX, 4] +# define STOREX(SRC,RX) st.ab SRC, [RX, 4] +# define ZOLSHFT 4 +# define ZOLAND 0xF +#endif + +ENTRY(memcpy) + prefetch [r1] ; Prefetch the read location + prefetchw [r0] ; Prefetch the write location + mov.f 0, r2 +;;; if size is zero + jz.d [blink] + mov r3, r0 ; don;t clobber ret val + +;;; if size <= 8 + cmp r2, 8 + bls.d @smallchunk + mov.f lp_count, r2 + + and.f r4, r0, 0x03 + rsub lp_count, r4, 4 + lpnz @aligndestination + ;; LOOP BEGIN + ldb.ab r5, [r1,1] + sub r2, r2, 1 + stb.ab r5, [r3,1] +aligndestination: + +;;; Check the alignment of the source + and.f r4, r1, 0x03 + bnz.d @sourceunaligned + +;;; CASE 0: Both source and destination are 32bit aligned +;;; Convert len to Dwords, unfold x4 + lsr.f lp_count, r2, ZOLSHFT + lpnz @copy32_64bytes + ;; LOOP START + LOADX (r6, r1) + PREFETCH_READ (r1) + PREFETCH_WRITE (r3) + LOADX (r8, r1) + LOADX (r10, r1) + LOADX (r4, r1) + STOREX (r6, r3) + STOREX (r8, r3) + STOREX (r10, r3) + STOREX (r4, r3) +copy32_64bytes: + + and.f lp_count, r2, ZOLAND ;Last remaining 31 bytes +smallchunk: + lpnz @copyremainingbytes + ;; LOOP START + ldb.ab r5, [r1,1] + stb.ab r5, [r3,1] +copyremainingbytes: + + j [blink] +;;; END CASE 0 + +sourceunaligned: + cmp r4, 2 + beq.d @unalignedOffby2 + sub r2, r2, 1 + + bhi.d @unalignedOffby3 + ldb.ab r5, [r1, 1] + +;;; CASE 1: The source is unaligned, off by 1 + ;; Hence I need to read 1 byte for a 16bit alignment + ;; and 2bytes to reach 32bit alignment + ldh.ab r6, [r1, 2] + sub r2, r2, 2 + ;; Convert to words, unfold x2 + lsr.f lp_count, r2, 3 + MERGE_1 (r6, r6, 8) + MERGE_2 (r5, r5, 24) + or r5, r5, r6 + + ;; Both src and dst are aligned + lpnz @copy8bytes_1 + ;; LOOP START + ld.ab r6, [r1, 4] + prefetch [r1, 28] ;Prefetch the next read location + ld.ab r8, [r1,4] + prefetchw [r3, 32] ;Prefetch the next write location + + SHIFT_1 (r7, r6, 24) + or r7, r7, r5 + SHIFT_2 (r5, r6, 8) + + SHIFT_1 (r9, r8, 24) + or r9, r9, r5 + SHIFT_2 (r5, r8, 8) + + st.ab r7, [r3, 4] + st.ab r9, [r3, 4] +copy8bytes_1: + + ;; Write back the remaining 16bits + EXTRACT_1 (r6, r5, 16) + sth.ab r6, [r3, 2] + ;; Write back the remaining 8bits + EXTRACT_2 (r5, r5, 16) + stb.ab r5, [r3, 1] + + and.f lp_count, r2, 0x07 ;Last 8bytes + lpnz @copybytewise_1 + ;; LOOP START + ldb.ab r6, [r1,1] + stb.ab r6, [r3,1] +copybytewise_1: + j [blink] + +unalignedOffby2: +;;; CASE 2: The source is unaligned, off by 2 + ldh.ab r5, [r1, 2] + sub r2, r2, 1 + + ;; Both src and dst are aligned + ;; Convert to words, unfold x2 + lsr.f lp_count, r2, 3 +#ifdef __BIG_ENDIAN__ + asl.nz r5, r5, 16 +#endif + lpnz @copy8bytes_2 + ;; LOOP START + ld.ab r6, [r1, 4] + prefetch [r1, 28] ;Prefetch the next read location + ld.ab r8, [r1,4] + prefetchw [r3, 32] ;Prefetch the next write location + + SHIFT_1 (r7, r6, 16) + or r7, r7, r5 + SHIFT_2 (r5, r6, 16) + + SHIFT_1 (r9, r8, 16) + or r9, r9, r5 + SHIFT_2 (r5, r8, 16) + + st.ab r7, [r3, 4] + st.ab r9, [r3, 4] +copy8bytes_2: + +#ifdef __BIG_ENDIAN__ + lsr.nz r5, r5, 16 +#endif + sth.ab r5, [r3, 2] + + and.f lp_count, r2, 0x07 ;Last 8bytes + lpnz @copybytewise_2 + ;; LOOP START + ldb.ab r6, [r1,1] + stb.ab r6, [r3,1] +copybytewise_2: + j [blink] + +unalignedOffby3: +;;; CASE 3: The source is unaligned, off by 3 +;;; Hence, I need to read 1byte for achieve the 32bit alignment + + ;; Both src and dst are aligned + ;; Convert to words, unfold x2 + lsr.f lp_count, r2, 3 +#ifdef __BIG_ENDIAN__ + asl.ne r5, r5, 24 +#endif + lpnz @copy8bytes_3 + ;; LOOP START + ld.ab r6, [r1, 4] + prefetch [r1, 28] ;Prefetch the next read location + ld.ab r8, [r1,4] + prefetch [r3, 32] ;Prefetch the next write location + + SHIFT_1 (r7, r6, 8) + or r7, r7, r5 + SHIFT_2 (r5, r6, 24) + + SHIFT_1 (r9, r8, 8) + or r9, r9, r5 + SHIFT_2 (r5, r8, 24) + + st.ab r7, [r3, 4] + st.ab r9, [r3, 4] +copy8bytes_3: + +#ifdef __BIG_ENDIAN__ + lsr.nz r5, r5, 24 +#endif + stb.ab r5, [r3, 1] + + and.f lp_count, r2, 0x07 ;Last 8bytes + lpnz @copybytewise_3 + ;; LOOP START + ldb.ab r6, [r1,1] + stb.ab r6, [r3,1] +copybytewise_3: + j [blink] + +END(memcpy) diff --git a/arch/arc/lib/memset-archs.S b/arch/arc/lib/memset-archs.S new file mode 100644 index 000000000000..92d573c734b5 --- /dev/null +++ b/arch/arc/lib/memset-archs.S @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> + +#undef PREALLOC_NOT_AVAIL + +#ifdef PREALLOC_NOT_AVAIL +#define PREWRITE(A,B) prefetchw [(A),(B)] +#else +#define PREWRITE(A,B) prealloc [(A),(B)] +#endif + +ENTRY(memset) + prefetchw [r0] ; Prefetch the write location + mov.f 0, r2 +;;; if size is zero + jz.d [blink] + mov r3, r0 ; don't clobber ret val + +;;; if length < 8 + brls.d.nt r2, 8, .Lsmallchunk + mov.f lp_count,r2 + + and.f r4, r0, 0x03 + rsub lp_count, r4, 4 + lpnz @.Laligndestination + ;; LOOP BEGIN + stb.ab r1, [r3,1] + sub r2, r2, 1 +.Laligndestination: + +;;; Destination is aligned + and r1, r1, 0xFF + asl r4, r1, 8 + or r4, r4, r1 + asl r5, r4, 16 + or r5, r5, r4 + mov r4, r5 + + sub3 lp_count, r2, 8 + cmp r2, 64 + bmsk.hi r2, r2, 5 + mov.ls lp_count, 0 + add3.hi r2, r2, 8 + +;;; Convert len to Dwords, unfold x8 + lsr.f lp_count, lp_count, 6 + lpnz @.Lset64bytes + ;; LOOP START + PREWRITE(r3, 64) ;Prefetch the next write location + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] +.Lset64bytes: + + lsr.f lp_count, r2, 5 ;Last remaining max 124 bytes + lpnz .Lset32bytes + ;; LOOP START + prefetchw [r3, 32] ;Prefetch the next write location + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] +.Lset32bytes: + + and.f lp_count, r2, 0x1F ;Last remaining 31 bytes +.Lsmallchunk: + lpnz .Lcopy3bytes + ;; LOOP START + stb.ab r1, [r3, 1] +.Lcopy3bytes: + + j [blink] + +END(memset) + +ENTRY(memzero) + ; adjust bzero args to memset args + mov r2, r1 + b.d memset ;tail call so need to tinker with blink + mov r1, 0 +END(memzero) diff --git a/arch/arc/lib/strcmp-archs.S b/arch/arc/lib/strcmp-archs.S new file mode 100644 index 000000000000..4f338eec3365 --- /dev/null +++ b/arch/arc/lib/strcmp-archs.S @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> + +ENTRY(strcmp) + or r2, r0, r1 + bmsk_s r2, r2, 1 + brne r2, 0, @.Lcharloop + +;;; s1 and s2 are word aligned + ld.ab r2, [r0, 4] + + mov_s r12, 0x01010101 + ror r11, r12 + .align 4 +.LwordLoop: + ld.ab r3, [r1, 4] + ;; Detect NULL char in str1 + sub r4, r2, r12 + ld.ab r5, [r0, 4] + bic r4, r4, r2 + and r4, r4, r11 + brne.d.nt r4, 0, .LfoundNULL + ;; Check if the read locations are the same + cmp r2, r3 + beq.d .LwordLoop + mov.eq r2, r5 + + ;; A match is found, spot it out +#ifdef __LITTLE_ENDIAN__ + swape r3, r3 + mov_s r0, 1 + swape r2, r2 +#else + mov_s r0, 1 +#endif + cmp_s r2, r3 + j_s.d [blink] + bset.lo r0, r0, 31 + + .align 4 +.LfoundNULL: +#ifdef __BIG_ENDIAN__ + swape r4, r4 + swape r2, r2 + swape r3, r3 +#endif + ;; Find null byte + ffs r0, r4 + bmsk r2, r2, r0 + bmsk r3, r3, r0 + swape r2, r2 + swape r3, r3 + ;; make the return value + sub.f r0, r2, r3 + mov.hi r0, 1 + j_s.d [blink] + bset.lo r0, r0, 31 + + .align 4 +.Lcharloop: + ldb.ab r2, [r0, 1] + ldb.ab r3, [r1, 1] + nop + breq r2, 0, .Lcmpend + breq r2, r3, .Lcharloop + + .align 4 +.Lcmpend: + j_s.d [blink] + sub r0, r2, r3 +END(strcmp) -- 1.9.1
WARNING: multiple messages have this Message-ID (diff)
From: Vineet Gupta <Vineet.Gupta1@synopsys.com> To: linux-arch@vger.kernel.org, linux-kernel@vger.kernel.org Cc: arnd@arndb.de, arc-linux-dev@synopsys.com, Claudiu Zissulescu <Claudiu.Zissulescu@synopsys.com>, Vineet Gupta <Vineet.Gupta1@synopsys.com> Subject: [PATCH 09/28] ARCv2: optimised string/mem lib routines Date: Tue, 9 Jun 2015 17:18:09 +0530 [thread overview] Message-ID: <1433850508-26317-10-git-send-email-vgupta@synopsys.com> (raw) In-Reply-To: <1433850508-26317-1-git-send-email-vgupta@synopsys.com> From: Claudiu Zissulescu <claziss@synopsys.com> Signed-off-by: Vineet Gupta <vgupta@synopsys.com> --- arch/arc/lib/Makefile | 6 +- arch/arc/lib/memcpy-archs.S | 236 ++++++++++++++++++++++++++++++++++++++++++++ arch/arc/lib/memset-archs.S | 93 +++++++++++++++++ arch/arc/lib/strcmp-archs.S | 78 +++++++++++++++ 4 files changed, 411 insertions(+), 2 deletions(-) create mode 100644 arch/arc/lib/memcpy-archs.S create mode 100644 arch/arc/lib/memset-archs.S create mode 100644 arch/arc/lib/strcmp-archs.S diff --git a/arch/arc/lib/Makefile b/arch/arc/lib/Makefile index db46e200baba..b1656d156097 100644 --- a/arch/arc/lib/Makefile +++ b/arch/arc/lib/Makefile @@ -5,5 +5,7 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -lib-y := strchr-700.o strcmp.o strcpy-700.o strlen.o -lib-y += memcmp.o memcpy-700.o memset.o +lib-y := strchr-700.o strcpy-700.o strlen.o memcmp.o + +lib-$(CONFIG_ISA_ARCOMPACT) += memcpy-700.o memset.o strcmp.o +lib-$(CONFIG_ISA_ARCV2) += memcpy-archs.o memset-archs.o strcmp-archs.o diff --git a/arch/arc/lib/memcpy-archs.S b/arch/arc/lib/memcpy-archs.S new file mode 100644 index 000000000000..1b2b3acfed52 --- /dev/null +++ b/arch/arc/lib/memcpy-archs.S @@ -0,0 +1,236 @@ +/* + * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> + +#ifdef __LITTLE_ENDIAN__ +# define SHIFT_1(RX,RY,IMM) asl RX, RY, IMM ; << +# define SHIFT_2(RX,RY,IMM) lsr RX, RY, IMM ; >> +# define MERGE_1(RX,RY,IMM) asl RX, RY, IMM +# define MERGE_2(RX,RY,IMM) +# define EXTRACT_1(RX,RY,IMM) and RX, RY, 0xFFFF +# define EXTRACT_2(RX,RY,IMM) lsr RX, RY, IMM +#else +# define SHIFT_1(RX,RY,IMM) lsr RX, RY, IMM ; >> +# define SHIFT_2(RX,RY,IMM) asl RX, RY, IMM ; << +# define MERGE_1(RX,RY,IMM) asl RX, RY, IMM ; << +# define MERGE_2(RX,RY,IMM) asl RX, RY, IMM ; << +# define EXTRACT_1(RX,RY,IMM) lsr RX, RY, IMM +# define EXTRACT_2(RX,RY,IMM) lsr RX, RY, 0x08 +#endif + +#ifdef CONFIG_ARC_HAS_LL64 +# define PREFETCH_READ(RX) prefetch [RX, 56] +# define PREFETCH_WRITE(RX) prefetchw [RX, 64] +# define LOADX(DST,RX) ldd.ab DST, [RX, 8] +# define STOREX(SRC,RX) std.ab SRC, [RX, 8] +# define ZOLSHFT 5 +# define ZOLAND 0x1F +#else +# define PREFETCH_READ(RX) prefetch [RX, 28] +# define PREFETCH_WRITE(RX) prefetchw [RX, 32] +# define LOADX(DST,RX) ld.ab DST, [RX, 4] +# define STOREX(SRC,RX) st.ab SRC, [RX, 4] +# define ZOLSHFT 4 +# define ZOLAND 0xF +#endif + +ENTRY(memcpy) + prefetch [r1] ; Prefetch the read location + prefetchw [r0] ; Prefetch the write location + mov.f 0, r2 +;;; if size is zero + jz.d [blink] + mov r3, r0 ; don;t clobber ret val + +;;; if size <= 8 + cmp r2, 8 + bls.d @smallchunk + mov.f lp_count, r2 + + and.f r4, r0, 0x03 + rsub lp_count, r4, 4 + lpnz @aligndestination + ;; LOOP BEGIN + ldb.ab r5, [r1,1] + sub r2, r2, 1 + stb.ab r5, [r3,1] +aligndestination: + +;;; Check the alignment of the source + and.f r4, r1, 0x03 + bnz.d @sourceunaligned + +;;; CASE 0: Both source and destination are 32bit aligned +;;; Convert len to Dwords, unfold x4 + lsr.f lp_count, r2, ZOLSHFT + lpnz @copy32_64bytes + ;; LOOP START + LOADX (r6, r1) + PREFETCH_READ (r1) + PREFETCH_WRITE (r3) + LOADX (r8, r1) + LOADX (r10, r1) + LOADX (r4, r1) + STOREX (r6, r3) + STOREX (r8, r3) + STOREX (r10, r3) + STOREX (r4, r3) +copy32_64bytes: + + and.f lp_count, r2, ZOLAND ;Last remaining 31 bytes +smallchunk: + lpnz @copyremainingbytes + ;; LOOP START + ldb.ab r5, [r1,1] + stb.ab r5, [r3,1] +copyremainingbytes: + + j [blink] +;;; END CASE 0 + +sourceunaligned: + cmp r4, 2 + beq.d @unalignedOffby2 + sub r2, r2, 1 + + bhi.d @unalignedOffby3 + ldb.ab r5, [r1, 1] + +;;; CASE 1: The source is unaligned, off by 1 + ;; Hence I need to read 1 byte for a 16bit alignment + ;; and 2bytes to reach 32bit alignment + ldh.ab r6, [r1, 2] + sub r2, r2, 2 + ;; Convert to words, unfold x2 + lsr.f lp_count, r2, 3 + MERGE_1 (r6, r6, 8) + MERGE_2 (r5, r5, 24) + or r5, r5, r6 + + ;; Both src and dst are aligned + lpnz @copy8bytes_1 + ;; LOOP START + ld.ab r6, [r1, 4] + prefetch [r1, 28] ;Prefetch the next read location + ld.ab r8, [r1,4] + prefetchw [r3, 32] ;Prefetch the next write location + + SHIFT_1 (r7, r6, 24) + or r7, r7, r5 + SHIFT_2 (r5, r6, 8) + + SHIFT_1 (r9, r8, 24) + or r9, r9, r5 + SHIFT_2 (r5, r8, 8) + + st.ab r7, [r3, 4] + st.ab r9, [r3, 4] +copy8bytes_1: + + ;; Write back the remaining 16bits + EXTRACT_1 (r6, r5, 16) + sth.ab r6, [r3, 2] + ;; Write back the remaining 8bits + EXTRACT_2 (r5, r5, 16) + stb.ab r5, [r3, 1] + + and.f lp_count, r2, 0x07 ;Last 8bytes + lpnz @copybytewise_1 + ;; LOOP START + ldb.ab r6, [r1,1] + stb.ab r6, [r3,1] +copybytewise_1: + j [blink] + +unalignedOffby2: +;;; CASE 2: The source is unaligned, off by 2 + ldh.ab r5, [r1, 2] + sub r2, r2, 1 + + ;; Both src and dst are aligned + ;; Convert to words, unfold x2 + lsr.f lp_count, r2, 3 +#ifdef __BIG_ENDIAN__ + asl.nz r5, r5, 16 +#endif + lpnz @copy8bytes_2 + ;; LOOP START + ld.ab r6, [r1, 4] + prefetch [r1, 28] ;Prefetch the next read location + ld.ab r8, [r1,4] + prefetchw [r3, 32] ;Prefetch the next write location + + SHIFT_1 (r7, r6, 16) + or r7, r7, r5 + SHIFT_2 (r5, r6, 16) + + SHIFT_1 (r9, r8, 16) + or r9, r9, r5 + SHIFT_2 (r5, r8, 16) + + st.ab r7, [r3, 4] + st.ab r9, [r3, 4] +copy8bytes_2: + +#ifdef __BIG_ENDIAN__ + lsr.nz r5, r5, 16 +#endif + sth.ab r5, [r3, 2] + + and.f lp_count, r2, 0x07 ;Last 8bytes + lpnz @copybytewise_2 + ;; LOOP START + ldb.ab r6, [r1,1] + stb.ab r6, [r3,1] +copybytewise_2: + j [blink] + +unalignedOffby3: +;;; CASE 3: The source is unaligned, off by 3 +;;; Hence, I need to read 1byte for achieve the 32bit alignment + + ;; Both src and dst are aligned + ;; Convert to words, unfold x2 + lsr.f lp_count, r2, 3 +#ifdef __BIG_ENDIAN__ + asl.ne r5, r5, 24 +#endif + lpnz @copy8bytes_3 + ;; LOOP START + ld.ab r6, [r1, 4] + prefetch [r1, 28] ;Prefetch the next read location + ld.ab r8, [r1,4] + prefetch [r3, 32] ;Prefetch the next write location + + SHIFT_1 (r7, r6, 8) + or r7, r7, r5 + SHIFT_2 (r5, r6, 24) + + SHIFT_1 (r9, r8, 8) + or r9, r9, r5 + SHIFT_2 (r5, r8, 24) + + st.ab r7, [r3, 4] + st.ab r9, [r3, 4] +copy8bytes_3: + +#ifdef __BIG_ENDIAN__ + lsr.nz r5, r5, 24 +#endif + stb.ab r5, [r3, 1] + + and.f lp_count, r2, 0x07 ;Last 8bytes + lpnz @copybytewise_3 + ;; LOOP START + ldb.ab r6, [r1,1] + stb.ab r6, [r3,1] +copybytewise_3: + j [blink] + +END(memcpy) diff --git a/arch/arc/lib/memset-archs.S b/arch/arc/lib/memset-archs.S new file mode 100644 index 000000000000..92d573c734b5 --- /dev/null +++ b/arch/arc/lib/memset-archs.S @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> + +#undef PREALLOC_NOT_AVAIL + +#ifdef PREALLOC_NOT_AVAIL +#define PREWRITE(A,B) prefetchw [(A),(B)] +#else +#define PREWRITE(A,B) prealloc [(A),(B)] +#endif + +ENTRY(memset) + prefetchw [r0] ; Prefetch the write location + mov.f 0, r2 +;;; if size is zero + jz.d [blink] + mov r3, r0 ; don't clobber ret val + +;;; if length < 8 + brls.d.nt r2, 8, .Lsmallchunk + mov.f lp_count,r2 + + and.f r4, r0, 0x03 + rsub lp_count, r4, 4 + lpnz @.Laligndestination + ;; LOOP BEGIN + stb.ab r1, [r3,1] + sub r2, r2, 1 +.Laligndestination: + +;;; Destination is aligned + and r1, r1, 0xFF + asl r4, r1, 8 + or r4, r4, r1 + asl r5, r4, 16 + or r5, r5, r4 + mov r4, r5 + + sub3 lp_count, r2, 8 + cmp r2, 64 + bmsk.hi r2, r2, 5 + mov.ls lp_count, 0 + add3.hi r2, r2, 8 + +;;; Convert len to Dwords, unfold x8 + lsr.f lp_count, lp_count, 6 + lpnz @.Lset64bytes + ;; LOOP START + PREWRITE(r3, 64) ;Prefetch the next write location + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] +.Lset64bytes: + + lsr.f lp_count, r2, 5 ;Last remaining max 124 bytes + lpnz .Lset32bytes + ;; LOOP START + prefetchw [r3, 32] ;Prefetch the next write location + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] +.Lset32bytes: + + and.f lp_count, r2, 0x1F ;Last remaining 31 bytes +.Lsmallchunk: + lpnz .Lcopy3bytes + ;; LOOP START + stb.ab r1, [r3, 1] +.Lcopy3bytes: + + j [blink] + +END(memset) + +ENTRY(memzero) + ; adjust bzero args to memset args + mov r2, r1 + b.d memset ;tail call so need to tinker with blink + mov r1, 0 +END(memzero) diff --git a/arch/arc/lib/strcmp-archs.S b/arch/arc/lib/strcmp-archs.S new file mode 100644 index 000000000000..4f338eec3365 --- /dev/null +++ b/arch/arc/lib/strcmp-archs.S @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> + +ENTRY(strcmp) + or r2, r0, r1 + bmsk_s r2, r2, 1 + brne r2, 0, @.Lcharloop + +;;; s1 and s2 are word aligned + ld.ab r2, [r0, 4] + + mov_s r12, 0x01010101 + ror r11, r12 + .align 4 +.LwordLoop: + ld.ab r3, [r1, 4] + ;; Detect NULL char in str1 + sub r4, r2, r12 + ld.ab r5, [r0, 4] + bic r4, r4, r2 + and r4, r4, r11 + brne.d.nt r4, 0, .LfoundNULL + ;; Check if the read locations are the same + cmp r2, r3 + beq.d .LwordLoop + mov.eq r2, r5 + + ;; A match is found, spot it out +#ifdef __LITTLE_ENDIAN__ + swape r3, r3 + mov_s r0, 1 + swape r2, r2 +#else + mov_s r0, 1 +#endif + cmp_s r2, r3 + j_s.d [blink] + bset.lo r0, r0, 31 + + .align 4 +.LfoundNULL: +#ifdef __BIG_ENDIAN__ + swape r4, r4 + swape r2, r2 + swape r3, r3 +#endif + ;; Find null byte + ffs r0, r4 + bmsk r2, r2, r0 + bmsk r3, r3, r0 + swape r2, r2 + swape r3, r3 + ;; make the return value + sub.f r0, r2, r3 + mov.hi r0, 1 + j_s.d [blink] + bset.lo r0, r0, 31 + + .align 4 +.Lcharloop: + ldb.ab r2, [r0, 1] + ldb.ab r3, [r1, 1] + nop + breq r2, 0, .Lcmpend + breq r2, r3, .Lcharloop + + .align 4 +.Lcmpend: + j_s.d [blink] + sub r0, r2, r3 +END(strcmp) -- 1.9.1
next prev parent reply other threads:[~2015-06-09 11:53 UTC|newest] Thread overview: 109+ messages / expand[flat|nested] mbox.gz Atom feed top 2015-06-09 11:48 [PATCH 00/28] ARCv2 port to Linux - (B) ISA / Core / platform support Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 01/28] ARCv2: [intc] HS38 core interrupt controller Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 02/28] ARCv2: Support for ARCv2 ISA and HS38x cores Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 03/28] ARCv2: STAR 9000793984: Handle return from intr to Delay Slot Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 04/28] ARCv2: STAR 9000808988: signals involving " Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 05/28] ARCv2: STAR 9000814690: Really Re-enable interrupts to avoid deadlocks Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 06/28] ARCv2: MMUv4: TLB programming Model changes Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 07/28] ARCv2: MMUv4: cache programming model changes Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 08/28] ARCv2: MMUv4: support aliasing icache config Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta [this message] 2015-06-09 11:48 ` [PATCH 09/28] ARCv2: optimised string/mem lib routines Vineet Gupta 2015-06-09 11:48 ` [PATCH 10/28] ARCv2: Adhere to Zero Delay loop restriction Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 11/28] ARCv2: extable: Enable sorting at build time Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-24 5:51 ` Vineet Gupta 2015-06-24 5:51 ` Vineet Gupta 2015-06-29 20:38 ` David Daney 2015-06-30 4:41 ` Vineet Gupta 2015-06-30 4:41 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 12/28] ARCv2: clocksource: Introduce 64bit local RTC counter Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 13/28] ARC: make plat_smp_ops weak to allow over-rides Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 14/28] ARCv2: SMP: ARConnect debug/robustness Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 15/28] ARCv2: SMP: clocksource: Enable Global Real Time counter Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 16/28] ARCv2: SMP: intc: IDU 2nd level intc for dynamic IRQ distribution Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 17/28] ARC: add compiler barrier to LLSC based cmpxchg Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 12:23 ` Peter Zijlstra 2015-06-09 11:48 ` [PATCH 18/28] ARC: add smp barriers around atomics per memory-barrriers.txt Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 12:30 ` Peter Zijlstra 2015-06-10 9:17 ` Vineet Gupta 2015-06-10 10:53 ` Peter Zijlstra 2015-06-11 13:03 ` Vineet Gupta 2015-06-12 12:15 ` [PATCH v2] ARC: add smp barriers around atomics per Documentation/atomic_ops.txt Vineet Gupta 2015-06-12 12:15 ` Vineet Gupta 2015-06-12 13:04 ` Peter Zijlstra 2015-06-12 13:16 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 19/28] arch: conditionally define smp_{mb,rmb,wmb} Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 12:32 ` Peter Zijlstra 2015-06-09 11:48 ` [PATCH 20/28] ARCv2: barriers Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 12:40 ` Peter Zijlstra 2015-06-10 9:34 ` Vineet Gupta 2015-06-10 10:58 ` Peter Zijlstra 2015-06-10 13:01 ` Will Deacon 2015-06-11 12:13 ` Vineet Gupta 2015-06-11 13:39 ` Will Deacon 2015-06-19 13:13 ` Vineet Gupta 2015-06-19 13:13 ` Vineet Gupta 2015-06-19 13:13 ` Vineet Gupta 2015-06-22 13:36 ` Will Deacon 2015-06-22 13:36 ` Will Deacon 2015-06-22 13:36 ` Will Deacon 2015-06-23 7:58 ` [PATCH v2 " Vineet Gupta 2015-06-23 7:58 ` Vineet Gupta 2015-06-23 8:49 ` Will Deacon 2015-06-23 9:03 ` Vineet Gupta 2015-06-23 9:26 ` Will Deacon 2015-06-23 9:52 ` [PATCH v3 22/28] " Vineet Gupta 2015-06-23 9:52 ` Vineet Gupta 2015-06-23 16:28 ` Will Deacon 2015-06-23 9:25 ` [PATCH v2 20/28] " Peter Zijlstra 2015-06-23 8:02 ` [PATCH " Vineet Gupta 2015-06-09 11:48 ` [PATCH 21/28] ARC: Reduce bitops lines of code using macros Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-12 12:20 ` [PATCH v2] " Vineet Gupta 2015-06-12 12:20 ` Vineet Gupta 2015-06-12 13:05 ` Peter Zijlstra 2015-06-09 11:48 ` [PATCH 22/28] ARCv2: STAR 9000837815 workaround hardware exclusive transactions livelock Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 12:35 ` Peter Zijlstra 2015-06-10 10:01 ` Vineet Gupta 2015-06-10 11:02 ` Peter Zijlstra 2015-06-19 9:55 ` [PATCH v2 " Vineet Gupta 2015-06-19 9:55 ` Vineet Gupta 2015-06-19 9:59 ` Will Deacon 2015-06-19 10:09 ` Vineet Gupta 2015-06-23 7:59 ` Vineet Gupta 2015-06-23 7:59 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 23/28] ARCv2: SLC: Handle explcit flush for DMA ops (w/o IO-coherency) Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 24/28] ARCv2: All bits in place, allow ARCv2 builds Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 25/28] ARCv2: [nsim*hs*] Support simulation platforms for HS38x cores Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 26/28] ARC: [axs101] Prepare for AXS103 Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 27/28] ARCv2: [axs103] Support ARC SDP FPGA platform for HS38x cores Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta 2015-06-09 11:48 ` [PATCH 28/28] ARCv2: [vdk] dts files and defconfig for HS38 VDK Vineet Gupta 2015-06-09 11:48 ` Vineet Gupta
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=1433850508-26317-10-git-send-email-vgupta@synopsys.com \ --to=vineet.gupta1@synopsys.com \ --cc=Claudiu.Zissulescu@synopsys.com \ --cc=arc-linux-dev@synopsys.com \ --cc=arnd@arndb.de \ --cc=linux-arch@vger.kernel.org \ --cc=linux-kernel@vger.kernel.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: linkBe sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.