AArch64: Tune memcpy
* newlib/libc/machine/aarch64/memcpy.S (memcpy): Further tuning for performance.
This commit is contained in:
parent
914620a7e6
commit
3c8636acf6
|
@ -1,3 +1,8 @@
|
||||||
|
2015-11-12 Wilco Dijkstra <wdijkstr@arm.com>
|
||||||
|
|
||||||
|
* newlib/libc/machine/aarch64/memcpy.S (memcpy): Further tuning for
|
||||||
|
performance.
|
||||||
|
|
||||||
2015-11-12 Joseph Myers <joseph@codesourcery.com>
|
2015-11-12 Joseph Myers <joseph@codesourcery.com>
|
||||||
|
|
||||||
* libc/machine/arm/strcmp-arm-tiny.S: Use .cfi_sections
|
* libc/machine/arm/strcmp-arm-tiny.S: Use .cfi_sections
|
||||||
|
|
|
@ -73,6 +73,7 @@
|
||||||
#define A_h x7
|
#define A_h x7
|
||||||
#define A_hw w7
|
#define A_hw w7
|
||||||
#define B_l x8
|
#define B_l x8
|
||||||
|
#define B_lw w8
|
||||||
#define B_h x9
|
#define B_h x9
|
||||||
#define C_l x10
|
#define C_l x10
|
||||||
#define C_h x11
|
#define C_h x11
|
||||||
|
@ -104,45 +105,20 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
def_fn memcpy p2align=6
|
def_fn memcpy p2align=6
|
||||||
|
prfm PLDL1KEEP, [src]
|
||||||
add srcend, src, count
|
add srcend, src, count
|
||||||
add dstend, dstin, count
|
add dstend, dstin, count
|
||||||
|
cmp count, 16
|
||||||
|
b.ls L(copy16)
|
||||||
cmp count, 96
|
cmp count, 96
|
||||||
b.hi L(copy_long)
|
b.hi L(copy_long)
|
||||||
cmp count, 16
|
|
||||||
b.hs L(copy_medium)
|
|
||||||
|
|
||||||
/* Small copies: 0..16 bytes. */
|
|
||||||
L(copy16):
|
|
||||||
tbz count, 3, 1f
|
|
||||||
ldr A_l, [src]
|
|
||||||
ldr A_h, [srcend, -8]
|
|
||||||
str A_l, [dstin]
|
|
||||||
str A_h, [dstend, -8]
|
|
||||||
ret
|
|
||||||
1:
|
|
||||||
tbz count, 2, 1f
|
|
||||||
ldr A_lw, [src]
|
|
||||||
ldr A_hw, [srcend, -4]
|
|
||||||
str A_lw, [dstin]
|
|
||||||
str A_hw, [dstend, -4]
|
|
||||||
ret
|
|
||||||
.p2align 4
|
|
||||||
1:
|
|
||||||
cbz count, 2f
|
|
||||||
ldrb A_lw, [src]
|
|
||||||
tbz count, 1, 1f
|
|
||||||
ldrh A_hw, [srcend, -2]
|
|
||||||
strh A_hw, [dstend, -2]
|
|
||||||
1: strb A_lw, [dstin]
|
|
||||||
2: ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
/* Medium copies: 17..96 bytes. */
|
/* Medium copies: 17..96 bytes. */
|
||||||
L(copy_medium):
|
sub tmp1, count, 1
|
||||||
ldp A_l, A_h, [src]
|
ldp A_l, A_h, [src]
|
||||||
tbnz count, 6, L(copy96)
|
tbnz tmp1, 6, L(copy96)
|
||||||
ldp D_l, D_h, [srcend, -16]
|
ldp D_l, D_h, [srcend, -16]
|
||||||
tbz count, 5, 1f
|
tbz tmp1, 5, 1f
|
||||||
ldp B_l, B_h, [src, 16]
|
ldp B_l, B_h, [src, 16]
|
||||||
ldp C_l, C_h, [srcend, -32]
|
ldp C_l, C_h, [srcend, -32]
|
||||||
stp B_l, B_h, [dstin, 16]
|
stp B_l, B_h, [dstin, 16]
|
||||||
|
@ -152,6 +128,38 @@ L(copy_medium):
|
||||||
stp D_l, D_h, [dstend, -16]
|
stp D_l, D_h, [dstend, -16]
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
.p2align 4
|
||||||
|
/* Small copies: 0..16 bytes. */
|
||||||
|
L(copy16):
|
||||||
|
cmp count, 8
|
||||||
|
b.lo 1f
|
||||||
|
ldr A_l, [src]
|
||||||
|
ldr A_h, [srcend, -8]
|
||||||
|
str A_l, [dstin]
|
||||||
|
str A_h, [dstend, -8]
|
||||||
|
ret
|
||||||
|
.p2align 4
|
||||||
|
1:
|
||||||
|
tbz count, 2, 1f
|
||||||
|
ldr A_lw, [src]
|
||||||
|
ldr A_hw, [srcend, -4]
|
||||||
|
str A_lw, [dstin]
|
||||||
|
str A_hw, [dstend, -4]
|
||||||
|
ret
|
||||||
|
|
||||||
|
/* Copy 0..3 bytes. Use a branchless sequence that copies the same
|
||||||
|
byte 3 times if count==1, or the 2nd byte twice if count==2. */
|
||||||
|
1:
|
||||||
|
cbz count, 2f
|
||||||
|
lsr tmp1, count, 1
|
||||||
|
ldrb A_lw, [src]
|
||||||
|
ldrb A_hw, [srcend, -1]
|
||||||
|
ldrb B_lw, [src, tmp1]
|
||||||
|
strb A_lw, [dstin]
|
||||||
|
strb B_lw, [dstin, tmp1]
|
||||||
|
strb A_hw, [dstend, -1]
|
||||||
|
2: ret
|
||||||
|
|
||||||
.p2align 4
|
.p2align 4
|
||||||
/* Copy 64..96 bytes. Copy 64 bytes from the start and
|
/* Copy 65..96 bytes. Copy 64 bytes from the start and
|
||||||
32 bytes from the end. */
|
32 bytes from the end. */
|
||||||
|
|
Loading…
Reference in New Issue