113 lines
2.8 KiB
ArmAsm
113 lines
2.8 KiB
ArmAsm
/* a-memcpy.s -- memcpy, optimised for m68k asm
|
|
*
|
|
* Copyright (c) 2007 mocom software GmbH & Co KG)
|
|
*
|
|
* The authors hereby grant permission to use, copy, modify, distribute,
|
|
* and license this software and its documentation for any purpose, provided
|
|
* that existing copyright notices are retained in all copies and that this
|
|
* notice is included verbatim in any distributions. No written agreement,
|
|
* license, or royalty fee is required for any of the authorized uses.
|
|
* Modifications to this software may be copyrighted by their authors
|
|
* and need not follow the licensing terms described here, provided that
|
|
* the new terms are clearly indicated on the first page of each file where
|
|
* they apply.
|
|
*/
|
|
|
|
#include "m68kasm.h"
|
|
|
|
/* MISALIGNED_OK is 1 when the target CPU can perform word and long word
   accesses at addresses that are not a multiple of the access size, and
   0 when such accesses have to be avoided.

   The MC68000 and the MC68010 both raise an address error exception for a
   word or long word access at an odd address, so __mc68010__ must not be
   listed here: the fast path selected by MISALIGNED_OK only aligns the
   destination pointer and would read long words through a possibly odd
   source pointer.  */
#if defined (__mcoldfire__) || defined (__mc68020__) || defined (__mc68030__) || defined (__mc68040__) || defined (__mc68060__)
# define MISALIGNED_OK 1
#else
# define MISALIGNED_OK 0
#endif
|
|
|
|
/* Assemble what follows into the text (code) section. */
.text
|
|
/* Alignment request for the code that follows.  NOTE(review): whether the
   operand 4 is a byte count or a power of two depends on the assembler
   target -- confirm against the GAS documentation for m68k. */
.align 4
|
|
|
|
/* Make the memcpy symbol visible outside this object file.  SYM() comes
   from m68kasm.h and is not visible here. */
.globl SYM(memcpy)
|
|
/* Mark the symbol as a function in the object file's symbol table. */
.type SYM(memcpy), @function
|
|
|
|
/* memcpy, optimised
|
|
*
|
|
* strategy:
|
|
* - no argument testing (the original memcpy from the GNU lib does
|
|
* no checking either)
|
|
* - make sure the destination pointer (the write pointer) is long word
|
|
* aligned. This is the best you can do, because writing to unaligned
|
|
* addresses can be the most costly thing you could do.
|
|
* - Once you have figured that out, we do a little loop unrolling
|
|
* to further improve speed.
|
|
*/
|
|
|
|
/* void *memcpy (void *dest, const void *src, size_t len)
 *
 * In:    4(sp) = dest, 8(sp) = src, 12(sp) = len
 * Out:   d0 = dest
 * Uses:  d0, d1, a0, a1 and the condition codes; nothing is saved or
 *        restored and the stack is only read.
 *
 * Register roles:
 *   a0 = write pointer, a1 = read pointer
 *   d1 = bytes left for the byte loop at .Lresidue
 *   d0 = scratch, then the count of 16-byte chunks, finally the result
 */
SYM(memcpy):
	move.l	4(sp),a0		| a0 = dest ptr
	move.l	8(sp),a1		| a1 = src ptr
	move.l	12(sp),d1		| d1 = len
	cmp.l	#8,d1			| if fewer than 8 bytes to transfer,
	blo	.Lresidue		| do not optimise

#if !MISALIGNED_OK
	/* Word and long word accesses must be aligned on this CPU.  Unless
	   both pointers are already 4-byte aligned, hand the whole length
	   (still in d1) to the byte loop at .Lresidue.  */
	move.l	a0,d0
	and.l	#3,d0
	bne	.Lresidue
	move.l	a1,d0
	and.l	#3,d0
	bne	.Lresidue
#else /* MISALIGNED_OK */
	/* Align dest.  d0 = (-dest) & 3 is the number of bytes (0..3) needed
	   to reach the next long word boundary.  len >= 8 at this point, so
	   taking them off the length cannot underflow.  */
	move.l	a0,d0			| copy of dest
	neg.l	d0
	and.l	#3,d0			| look for the lower two only
	beq	.Laligned		| is aligned?
	sub.l	d0,d1
	lsr.l	#1,d0			| bit 0 -> carry: single byte needed?
	bcc	.Lalign_word
	move.b	(a1)+,(a0)+
.Lalign_word:
	lsr.l	#1,d0			| bit 1 -> carry: word needed?
	bcc	.Laligned
	move.w	(a1)+,(a0)+
.Laligned:
#endif /* !MISALIGNED_OK */

	/* Long word transfers.  Split the remaining length:
	     d1 = len & 3    bytes left over for the byte loop
	     d0 = len >> 4   number of 16-byte chunks
	   Bits 2 and 3 of len fall out of the shifts through the carry
	   flag and select an extra 4-byte and 8-byte transfer.  */
	move.l	d1,d0
	and.l	#3,d1			| byte residue
	lsr.l	#3,d0
	bcc	.Lno_long		| carry set for 4-byte residue
	move.l	(a1)+,(a0)+
.Lno_long:
	lsr.l	#1,d0			| number of 16-byte transfers
	bcc	.Lcopy			| carry set for 8-byte residue
	bra	.Lcopy8

.Lcopy16:
	move.l	(a1)+,(a0)+
	move.l	(a1)+,(a0)+
.Lcopy8:
	move.l	(a1)+,(a0)+
	move.l	(a1)+,(a0)+
.Lcopy:
#if !defined (__mcoldfire__)
	dbra	d0,.Lcopy16		| dbra counts the low word of d0 only
	sub.l	#0x10000,d0		| so borrow from the high word by hand
#else
	subq.l	#1,d0
#endif
	bpl	.Lcopy16		| negative once every chunk is done
	bra	.Lresidue

.Lbyte:
	move.b	(a1)+,(a0)+		| move residue bytes

.Lresidue:
	/* d1 = number of bytes still to copy.  This is below 8 on the short
	   path and below 4 after the long word loop, but it is the complete
	   length when the !MISALIGNED_OK checks above bail out, so the
	   counter has to be treated as a full 32-bit value: dbra alone
	   would drop everything above the low 16 bits.  */
#if !defined (__mcoldfire__)
	dbra	d1,.Lbyte		| dbra counts the low word of d1 only
	sub.l	#0x10000,d1		| so borrow from the high word by hand
#else
	subq.l	#1,d1
#endif
	bpl	.Lbyte			| loop until done
	move.l	4(sp),d0		| return value
	rts
|