libc: krait: Use performance version of bcopy and memmove
Ported from CM10.2.
bionic-benchmarks on mako:
before:
iterations ns/op
BM_string_memmove/8 50000000 32 243.54 MiB/s
BM_string_memmove/64 20000000 143 446.41 MiB/s
BM_string_memmove/512 2000000 885 578.14 MiB/s
BM_string_memmove/1K 1000000 1733 590.55 MiB/s
BM_string_memmove/8K 200000 13618 601.54 MiB/s
BM_string_memmove/16K 100000 27276 600.66 MiB/s
BM_string_memmove/32K 50000 59115 554.30 MiB/s
BM_string_memmove/64K 10000 118162 554.63 MiB/s
after:
iterations ns/op
BM_string_memmove/8 50000000 20 381.94 MiB/s
BM_string_memmove/64 100000000 17 3636.07 MiB/s
BM_string_memmove/512 50000000 50 10116.80 MiB/s
BM_string_memmove/1K 20000000 98 10429.23 MiB/s
BM_string_memmove/8K 2000000 876 9346.43 MiB/s
BM_string_memmove/16K 1000000 1836 8923.09 MiB/s
BM_string_memmove/32K 500000 4392 7459.79 MiB/s
BM_string_memmove/64K 200000 8562 7653.85 MiB/s
Change-Id: Id64913a71857d9cfdf6bd1bbe2c66cfc49d72748
diff --git a/libc/Android.mk b/libc/Android.mk
index 9610c14..9fc94df 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -364,8 +364,6 @@
# =========================================================
ifeq ($(TARGET_ARCH),arm)
libc_common_src_files += \
- bionic/memmove.c.arm \
- string/bcopy.c \
string/strncmp.c \
string/strncat.c \
string/strncpy.c \
diff --git a/libc/arch-arm/generic/generic.mk b/libc/arch-arm/generic/generic.mk
index e230003..f9a1e48 100644
--- a/libc/arch-arm/generic/generic.mk
+++ b/libc/arch-arm/generic/generic.mk
@@ -2,6 +2,8 @@
$(call libc-add-cpu-variant-src,MEMSET,arch-arm/generic/bionic/memset.S)
$(call libc-add-cpu-variant-src,STRCAT,string/strcat.c)
$(call libc-add-cpu-variant-src,STRCMP,arch-arm/generic/bionic/strcmp.S)
+$(call libc-add-cpu-variant-src,MEMMOVE,bionic/memmove.c.arm)
+$(call libc-add-cpu-variant-src,BCOPY,string/bcopy.c.arm)
$(call libc-add-cpu-variant-src,STRCPY,arch-arm/generic/bionic/strcpy.S)
$(call libc-add-cpu-variant-src,STRLEN,arch-arm/generic/bionic/strlen.c)
$(call libc-add-cpu-variant-src,__STRCAT_CHK,bionic/__strcat_chk.cpp)
diff --git a/libc/arch-arm/krait/bionic/memmove.S b/libc/arch-arm/krait/bionic/memmove.S
new file mode 100644
index 0000000..349c8e3
--- /dev/null
+++ b/libc/arch-arm/krait/bionic/memmove.S
@@ -0,0 +1,209 @@
+/***************************************************************************
+ Copyright (c) 2009-2013 The Linux Foundation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of The Linux Foundation nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+ ***************************************************************************/
+
+/***************************************************************************
+ * Neon memmove: memmove that copies back to front with Neon registers when
+ * Inputs: (the regions overlap with dest above src; else it branches to memcpy)
+ * dest: The destination buffer (r0)
+ * src: The source buffer (r1)
+ * n: The number of bytes to transfer (r2)
+ * Outputs:
+ * r0: dest, restored from the stack on the back-to-front path
+ ***************************************************************************/
+
+#include <machine/cpu-features.h>
+
+/*
+ * These can be overridden in:
+ * device/<vendor>/<board>/BoardConfig.mk
+ * by setting the following:
+ * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
+ * TARGET_USE_KRAIT_PLD_SET := true
+ * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
+ * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
+ * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
+ */
+#ifndef PLDOFFS
+#define PLDOFFS (10) /* prefetch distance of the 64-byte loop, in units of PLDSIZE bytes */
+#endif
+#ifndef PLDTHRESH
+#define PLDTHRESH (PLDOFFS) /* the prefetching loop runs only when there are more 64-byte chunks than this */
+#endif
+#if (PLDOFFS < 5) /* the routine issues plds at (PLDOFFS-5)..(PLDOFFS-1) times PLDSIZE */
+#error Routine does not support offsets less than 5
+#endif
+#if (PLDTHRESH < PLDOFFS) /* the prefetching loop subtracts PLDOFFS from a count that exceeds PLDTHRESH */
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE (64) /* byte stride between successive prefetch addresses */
+#endif
+#define NOP_OPCODE (0xe320f000) /* 32-bit fill word used by the .balignl padding directives */
+
+ .code 32 /* assemble the routine as 32-bit ARM instructions */
+ .align 5 /* 2^5 = 32-byte alignment for the first entry point (bcopy) */
+ .global memmove
+ .type memmove, %function
+
+ .global _memmove_words /* exported label at the same address as memmove */
+ .type _memmove_words, %function
+
+ .global bcopy
+ .type bcopy, %function
+
+bcopy: /* entry with the pointer arguments reversed: swap r0/r1, then fall through into memmove */
+ mov r12, r0 /* r12 is used as scratch for the swap */
+ mov r0, r1
+ mov r1, r12 /* r0 and r1 are now exchanged */
+ .balignl 64, NOP_OPCODE, 4*2 /* pad with NOP_OPCODE words to a 64-byte boundary, only if at most 8 bytes are needed */
+memmove: /* r0 = dest, r1 = src, r2 = n */
+_memmove_words:
+.Lneon_memmove_cmf:
+ subs r12, r0, r1 /* r12 = dest - src; its flags drive the next three instructions */
+ bxeq lr /* dest == src: nothing to copy, r0 is still dest */
+ cmphi r2, r12 /* executed only when dest > src (unsigned): compare n with the distance */
+ bls memcpy /* dest < src, or n <= dest - src: tail-branch to memcpy. NOTE(review): the dest < src overlapping case relies on memcpy copying front to back - confirm */
+
+ push {r0} /* save dest; it is popped back into r0 before returning */
+
+.Lneon_back_to_front_copy: /* reached when dest > src and n > dest - src: copy from the end downwards */
+ add r0, r0, r2 /* r0 = one past the last dest byte */
+ add r1, r1, r2 /* r1 = one past the last src byte */
+ cmp r2, #4
+ bgt .Lneon_b2f_gt4 /* n > 4 (signed compare) takes the chunked paths */
+ cmp r2, #0
+.Lneon_b2f_smallcopy_loop: /* n <= 4: byte-at-a-time copy */
+ beq .Lneon_memmove_done /* Z comes from the cmp above on entry, from the subs below afterwards */
+ ldrb r12, [r1, #-1]! /* pre-decrement r1, then load one byte */
+ subs r2, r2, #1
+ strb r12, [r0, #-1]! /* pre-decrement r0, then store the byte */
+ b .Lneon_b2f_smallcopy_loop
+.Lneon_b2f_gt4: /* pick the widest chunk size not exceeding min(n, dest - src) */
+ sub r3, r0, r1 /* r3 = dest - src (both pointers were advanced by n) */
+ cmp r2, r3
+ movle r12, r2 /* r12 = min(n, dest - src), using signed compares */
+ movgt r12, r3
+ cmp r12, #64
+ bge .Lneon_b2f_copy_64
+ cmp r12, #32
+ bge .Lneon_b2f_copy_32
+ cmp r12, #8
+ bge .Lneon_b2f_copy_8
+ cmp r12, #4
+ bge .Lneon_b2f_copy_4
+ b .Lneon_b2f_copy_1
+.Lneon_b2f_copy_64: /* 64-byte chunks, four q registers per iteration */
+ sub r1, r1, #64 /* Predecrement */
+ sub r0, r0, #64
+ movs r12, r2, lsr #6 /* r12 = number of 64-byte chunks */
+ cmp r12, #PLDTHRESH
+ ble .Lneon_b2f_copy_64_loop_nopld /* not more than PLDTHRESH chunks: copy them all without pld */
+ sub r12, #PLDOFFS /* prefetching loop count; the last PLDOFFS chunks go to the no-pld loop */
+ pld [r1, #-(PLDOFFS-5)*PLDSIZE] /* initial prefetches at (PLDOFFS-5)..(PLDOFFS-1) times PLDSIZE below r1 */
+ pld [r1, #-(PLDOFFS-4)*PLDSIZE]
+ pld [r1, #-(PLDOFFS-3)*PLDSIZE]
+ pld [r1, #-(PLDOFFS-2)*PLDSIZE]
+ pld [r1, #-(PLDOFFS-1)*PLDSIZE]
+ .balignl 64, NOP_OPCODE, 4*2 /* optional loop alignment; any padding words are executed on the way in */
+.Lneon_b2f_copy_64_loop_outer:
+ pld [r1, #-(PLDOFFS)*PLDSIZE] /* prefetch PLDOFFS*PLDSIZE bytes below the current chunk */
+ vld1.32 {q0, q1}, [r1]! /* load the low 32 bytes; writeback advances r1 by 32 */
+ vld1.32 {q2, q3}, [r1] /* load the high 32 bytes */
+ subs r12, r12, #1 /* sets Z for the bne at the end of the loop */
+ vst1.32 {q0, q1}, [r0]! /* store the low 32 bytes; writeback advances r0 by 32 */
+ sub r1, r1, #96 /* Post-fixup and predecrement */
+ vst1.32 {q2, q3}, [r0] /* store the high 32 bytes */
+ sub r0, r0, #96 /* undo the +32 writeback and step down one 64-byte chunk */
+ bne .Lneon_b2f_copy_64_loop_outer /* flags are still those of the subs above */
+ mov r12, #PLDOFFS /* chunks left for the no-pld loop */
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_b2f_copy_64_loop_nopld: /* same 64-byte step as above, without prefetch, using q8-q11 */
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]
+ subs r12, r12, #1
+ vst1.32 {q8, q9}, [r0]!
+ sub r1, r1, #96 /* Post-fixup and predecrement */
+ vst1.32 {q10, q11}, [r0]
+ sub r0, r0, #96
+ bne .Lneon_b2f_copy_64_loop_nopld
+ ands r2, r2, #0x3f /* r2 = bytes left after the 64-byte chunks */
+ beq .Lneon_memmove_done
+ add r1, r1, #64 /* Post-fixup */
+ add r0, r0, #64 /* both pointers are back at the start of the last chunk copied */
+ cmp r2, #32
+ blt .Lneon_b2f_copy_finish /* fewer than 32 bytes left: skip the 32-byte loop */
+.Lneon_b2f_copy_32: /* 32-byte chunks through q0/q1 */
+ mov r12, r2, lsr #5 /* r12 = number of 32-byte chunks */
+.Lneon_b2f_copy_32_loop:
+ sub r1, r1, #32 /* Predecrement */
+ sub r0, r0, #32
+ vld1.32 {q0,q1}, [r1]
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]
+ bne .Lneon_b2f_copy_32_loop
+ ands r2, r2, #0x1f /* r2 = bytes left after the 32-byte chunks */
+ beq .Lneon_memmove_done
+.Lneon_b2f_copy_finish:
+.Lneon_b2f_copy_8: /* 8-byte chunks through d0 */
+ movs r12, r2, lsr #0x3 /* r12 = number of 8-byte chunks */
+ beq .Lneon_b2f_copy_4 /* none: continue with the word copy */
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_b2f_copy_8_loop:
+ sub r1, r1, #8 /* Predecrement */
+ sub r0, r0, #8
+ vld1.32 {d0}, [r1]
+ subs r12, r12, #1
+ vst1.32 {d0}, [r0]
+ bne .Lneon_b2f_copy_8_loop
+ ands r2, r2, #0x7 /* r2 = bytes left after the 8-byte chunks */
+ beq .Lneon_memmove_done
+.Lneon_b2f_copy_4: /* 4-byte words through r3 */
+ movs r12, r2, lsr #0x2 /* r12 = number of words */
+ beq .Lneon_b2f_copy_1 /* none: continue with the byte copy */
+.Lneon_b2f_copy_4_loop:
+ ldr r3, [r1, #-4]! /* pre-decrement r1 by 4, then load a word */
+ subs r12, r12, #1
+ str r3, [r0, #-4]! /* pre-decrement r0 by 4, then store the word */
+ bne .Lneon_b2f_copy_4_loop
+ ands r2, r2, #0x3 /* r2 = bytes left after the words */
+.Lneon_b2f_copy_1: /* trailing bytes, one at a time */
+ cmp r2, #0
+ beq .Lneon_memmove_done
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_b2f_copy_1_loop:
+ ldrb r12, [r1, #-1]!
+ subs r2, r2, #1
+ strb r12, [r0, #-1]!
+ bne .Lneon_b2f_copy_1_loop
+
+.Lneon_memmove_done:
+ pop {r0} /* restore the dest pointer saved on entry to the back-to-front path */
+ bx lr
+
+ .end
+
+
diff --git a/libc/arch-arm/krait/krait.mk b/libc/arch-arm/krait/krait.mk
index 29ab743..6a5a839 100644
--- a/libc/arch-arm/krait/krait.mk
+++ b/libc/arch-arm/krait/krait.mk
@@ -1,6 +1,7 @@
$(call libc-add-cpu-variant-src,MEMCPY,arch-arm/krait/bionic/memcpy.S)
$(call libc-add-cpu-variant-src,MEMSET,arch-arm/krait/bionic/memset.S)
$(call libc-add-cpu-variant-src,STRCMP,arch-arm/krait/bionic/strcmp.S)
+$(call libc-add-cpu-variant-src,MEMMOVE,arch-arm/krait/bionic/memmove.S)
$(call libc-add-cpu-variant-src,__STRCAT_CHK,arch-arm/krait/bionic/__strcat_chk.S)
$(call libc-add-cpu-variant-src,__STRCPY_CHK,arch-arm/krait/bionic/__strcpy_chk.S)
# Use cortex-a15 versions of strcat/strcpy/strlen.
@@ -8,4 +9,4 @@
$(call libc-add-cpu-variant-src,STRCPY,arch-arm/cortex-a15/bionic/strcpy.S)
$(call libc-add-cpu-variant-src,STRLEN,arch-arm/cortex-a15/bionic/strlen.S)
-include bionic/libc/arch-arm/generic/generic.mk
+#include bionic/libc/arch-arm/generic/generic.mk