AArch64: Tune memcpy

* newlib/libc/machine/aarch64/memcpy.S (memcpy):
 	Further tuning for performance.
This commit is contained in:
Wilco Dijkstra 2015-11-06 14:09:20 +00:00 committed by Corinna Vinschen
parent 914620a7e6
commit 3c8636acf6
2 changed files with 45 additions and 32 deletions

View File

@ -1,3 +1,8 @@
2015-11-12 Wilco Dijkstra <wdijkstr@arm.com>
* newlib/libc/machine/aarch64/memcpy.S (memcpy): Further tuning for
performance.
2015-11-12 Joseph Myers <joseph@codesourcery.com> 2015-11-12 Joseph Myers <joseph@codesourcery.com>
* libc/machine/arm/strcmp-arm-tiny.S: Use .cfi_sections * libc/machine/arm/strcmp-arm-tiny.S: Use .cfi_sections

View File

@ -73,6 +73,7 @@
#define A_h x7 #define A_h x7
#define A_hw w7 #define A_hw w7
#define B_l x8 #define B_l x8
#define B_lw w8
#define B_h x9 #define B_h x9
#define C_l x10 #define C_l x10
#define C_h x11 #define C_h x11
@ -104,45 +105,20 @@
*/ */
def_fn memcpy p2align=6 def_fn memcpy p2align=6
prfm PLDL1KEEP, [src]
add srcend, src, count add srcend, src, count
add dstend, dstin, count add dstend, dstin, count
cmp count, 16
b.ls L(copy16)
cmp count, 96 cmp count, 96
b.hi L(copy_long) b.hi L(copy_long)
cmp count, 16
b.hs L(copy_medium)
/* Small copies: 0..16 bytes. */
L(copy16):
tbz count, 3, 1f
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
1:
tbz count, 2, 1f
ldr A_lw, [src]
ldr A_hw, [srcend, -4]
str A_lw, [dstin]
str A_hw, [dstend, -4]
ret
.p2align 4
1:
cbz count, 2f
ldrb A_lw, [src]
tbz count, 1, 1f
ldrh A_hw, [srcend, -2]
strh A_hw, [dstend, -2]
1: strb A_lw, [dstin]
2: ret
.p2align 4
/* Medium copies: 17..96 bytes. */ /* Medium copies: 17..96 bytes. */
L(copy_medium): sub tmp1, count, 1
ldp A_l, A_h, [src] ldp A_l, A_h, [src]
tbnz count, 6, L(copy96) tbnz tmp1, 6, L(copy96)
ldp D_l, D_h, [srcend, -16] ldp D_l, D_h, [srcend, -16]
tbz count, 5, 1f tbz tmp1, 5, 1f
ldp B_l, B_h, [src, 16] ldp B_l, B_h, [src, 16]
ldp C_l, C_h, [srcend, -32] ldp C_l, C_h, [srcend, -32]
stp B_l, B_h, [dstin, 16] stp B_l, B_h, [dstin, 16]
@ -152,6 +128,38 @@ L(copy_medium):
stp D_l, D_h, [dstend, -16] stp D_l, D_h, [dstend, -16]
ret ret
.p2align 4
/* Small copies: 0..16 bytes. */
L(copy16):
cmp count, 8
b.lo 1f
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
.p2align 4
1:
tbz count, 2, 1f
ldr A_lw, [src]
ldr A_hw, [srcend, -4]
str A_lw, [dstin]
str A_hw, [dstend, -4]
ret
/* Copy 0..3 bytes. Use a branchless sequence that copies the same
byte 3 times if count==1, or the 2nd byte twice if count==2. */
1:
cbz count, 2f
lsr tmp1, count, 1
ldrb A_lw, [src]
ldrb A_hw, [srcend, -1]
ldrb B_lw, [src, tmp1]
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
strb A_hw, [dstend, -1]
2: ret
.p2align 4 .p2align 4
/* Copy 64..96 bytes. Copy 64 bytes from the start and /* Copy 64..96 bytes. Copy 64 bytes from the start and
32 bytes from the end. */ 32 bytes from the end. */