mirror of
git://sourceware.org/git/newlib-cygwin.git
synced 2025-02-14 21:19:17 +08:00
This brings to newlib a performance improvement that we developed in Bionic libc. That change has been submitted for review to Bionic libc: https://android-review.googlesource.com/418279 A similar patch has been submitted for review in glibc: https://sourceware.org/ml/libc-alpha/2017-06/msg01143.html Patch written by Vikas Sinha and Sebastian Pop. The performance was measured on the bionic-benchmarks on a hikey (aarch64 8xA53) board. There was no performance change to the existing benchmark and a performance improvement on the new benchmark for memcmp on the unaligned side. The new benchmark has been submitted for review at https://android-review.googlesource.com/414860 The overall performance improves by 18% for the small data set 8 and the performance improves by 450% for the large data set 64k. The base is with the libc from /system/lib64. The bionic libc with this patch is in /data. hikey:/data # export LD_LIBRARY_PATH=/system/lib64 hikey:/data # ./bionic-benchmarks --benchmark_filter='BM_string_memcmp*' Run on (8 X 2.4 MHz CPU s) Benchmark Time CPU Iterations ---------------------------------------------------------------------- BM_string_memcmp/8 30 ns 30 ns 22955680 251.07MB/s BM_string_memcmp/64 57 ns 57 ns 12349184 1076.99MB/s BM_string_memcmp/512 305 ns 305 ns 2297163 1.56496GB/s BM_string_memcmp/1024 571 ns 571 ns 1225211 1.66912GB/s BM_string_memcmp/8k 4307 ns 4306 ns 162562 1.77177GB/s BM_string_memcmp/16k 8676 ns 8675 ns 80676 1.75887GB/s BM_string_memcmp/32k 19233 ns 19230 ns 36394 1.58695GB/s BM_string_memcmp/64k 36986 ns 36984 ns 18952 1.65029GB/s BM_string_memcmp_aligned/8 199 ns 199 ns 3519166 38.3336MB/s BM_string_memcmp_aligned/64 386 ns 386 ns 1810734 158.073MB/s BM_string_memcmp_aligned/512 1735 ns 1734 ns 403981 281.525MB/s BM_string_memcmp_aligned/1024 3200 ns 3200 ns 218838 305.151MB/s BM_string_memcmp_aligned/8k 25084 ns 25080 ns 28180 311.507MB/s BM_string_memcmp_aligned/16k 51730 ns 51729 ns 13521 302.057MB/s 
BM_string_memcmp_aligned/32k 103228 ns 103228 ns 6782 302.727MB/s BM_string_memcmp_aligned/64k 207117 ns 207087 ns 3450 301.806MB/s BM_string_memcmp_unaligned/8 339 ns 339 ns 2070998 22.5302MB/s BM_string_memcmp_unaligned/64 1392 ns 1392 ns 502796 43.8454MB/s BM_string_memcmp_unaligned/512 9194 ns 9194 ns 76133 53.1104MB/s BM_string_memcmp_unaligned/1024 18325 ns 18323 ns 38206 53.2963MB/s BM_string_memcmp_unaligned/8k 148579 ns 148574 ns 4713 52.5831MB/s BM_string_memcmp_unaligned/16k 298169 ns 298120 ns 2344 52.4118MB/s BM_string_memcmp_unaligned/32k 598813 ns 598797 ns 1085 52.188MB/s BM_string_memcmp_unaligned/64k 1196079 ns 1196083 ns 540 52.2539MB/s hikey:/data # export LD_LIBRARY_PATH=/data hikey:/data # ./bionic-benchmarks --benchmark_filter='BM_string_memcmp*' Run on (8 X 2.4 MHz CPU s) Benchmark Time CPU Iterations ---------------------------------------------------------------------- BM_string_memcmp/8 30 ns 30 ns 23209918 252.802MB/s BM_string_memcmp/64 57 ns 57 ns 12348447 1076.95MB/s BM_string_memcmp/512 305 ns 305 ns 2296878 1.56471GB/s BM_string_memcmp/1024 572 ns 571 ns 1224426 1.6689GB/s BM_string_memcmp/8k 4309 ns 4308 ns 162491 1.77109GB/s BM_string_memcmp/16k 9348 ns 9345 ns 74894 1.63285GB/s BM_string_memcmp/32k 18329 ns 18322 ns 38249 1.6656GB/s BM_string_memcmp/64k 36992 ns 36981 ns 18952 1.65045GB/s BM_string_memcmp_aligned/8 199 ns 199 ns 3513925 38.3162MB/s BM_string_memcmp_aligned/64 386 ns 386 ns 1814038 158.192MB/s BM_string_memcmp_aligned/512 1735 ns 1735 ns 402279 281.502MB/s BM_string_memcmp_aligned/1024 3204 ns 3202 ns 218761 304.941MB/s BM_string_memcmp_aligned/8k 25577 ns 25569 ns 27406 305.548MB/s BM_string_memcmp_aligned/16k 52143 ns 52123 ns 13522 299.769MB/s BM_string_memcmp_aligned/32k 105169 ns 105127 ns 6637 297.26MB/s BM_string_memcmp_aligned/64k 206508 ns 206383 ns 3417 302.835MB/s BM_string_memcmp_unaligned/8 282 ns 282 ns 2482953 27.062MB/s BM_string_memcmp_unaligned/64 542 ns 541 ns 1298317 112.77MB/s 
BM_string_memcmp_unaligned/512 2152 ns 2152 ns 325267 226.915MB/s BM_string_memcmp_unaligned/1024 4025 ns 4025 ns 173904 242.622MB/s BM_string_memcmp_unaligned/8k 32276 ns 32271 ns 21818 242.09MB/s BM_string_memcmp_unaligned/16k 65970 ns 65970 ns 10554 236.851MB/s BM_string_memcmp_unaligned/32k 131241 ns 131242 ns 5129 238.11MB/s BM_string_memcmp_unaligned/64k 266159 ns 266160 ns 2661 234.821MB/s
221 lines
6.0 KiB
ArmAsm
221 lines
6.0 KiB
ArmAsm
/* memcmp - compare memory
|
|
|
|
Copyright (c) 2013, Linaro Limited
|
|
Copyright (c) 2017, Samsung Austin R&D Center
|
|
All rights reserved.
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are met:
|
|
* Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
* Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in the
|
|
documentation and/or other materials provided with the distribution.
|
|
* Neither the name of the Linaro nor the
|
|
names of its contributors may be used to endorse or promote products
|
|
derived from this software without specific prior written permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
|
|
|
|
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
|
|
/* See memcmp-stub.c */
|
|
#else
|
|
/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 */
|
|
|
|
/* def_fn NAME [p2align=N]

   Begin the definition of function NAME: switch to .text, align the
   location counter to a 2^N-byte boundary (N defaults to 0, i.e. no
   extra alignment), export NAME and give it function type, then emit
   the NAME: label itself.  The caller is responsible for the matching
   .size directive at the end of the function.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

/* Parameters and result.
   src1 and result share x0: the incoming first pointer is dead by the
   time the return value is written.  src1, src2 and limit are all
   modified while the comparison runs.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables.
   data1w/data2w are the 32-bit views of data1/data2 and are used for
   the byte-at-a-time loads.  has_nul is not referenced by the memcmp
   code in this file; the alias is retained unchanged.  */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define endloop		x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define pos		x11
#define limit_wd	x12
#define mask		x13

/* int memcmp (const void *src1, const void *src2, size_t limit)

   In:    x0 = src1, x1 = src2, x2 = limit (number of bytes to compare).
   Out:   x0 = 0 when limit is 0 or all compared bytes are equal,
	  otherwise (first differing byte of src1) - (first differing
	  byte of src2), both taken as unsigned bytes.  The subtraction
	  is done on 64-bit registers; the int result is the low 32 bits.
   Clobb: x1-x4, x6-x13 and the NZCV flags.  No stack is used and no
	  calls are made.

   Three strategies are used:
     - src1 and src2 have the same offset within a dword: compare
       aligned dwords (.Lloop_aligned); .Lmutual_align first rounds both
       pointers down and neutralises the bytes before the start.
     - different offsets, limit >= 8: compare one unaligned dword, then
       loop with src1 aligned (src2 stays unaligned), and finish with an
       overlapping 8-byte load that ends exactly at the limit.
     - different offsets, limit < 8: compare byte by byte.  */
def_fn memcmp p2align=6
	cbz	limit, .Lret0
	/* The low three bits of src1 ^ src2 are non-zero when the two
	   pointers sit at different offsets within a dword.  */
	eor	tmp1, src1, src2
	tst	tmp1, #7
	b.ne	.Lmisaligned8
	/* tmp1 = offset of src1 within its dword; used by .Lmutual_align.  */
	ands	tmp1, src1, #7
	b.ne	.Lmutual_align
	/* limit_wd = number of dwords to examine, rounded up.  */
	add	limit_wd, limit, #7
	lsr	limit_wd, limit_wd, #3
	/* Start of performance-critical section  -- one 64B cache line.  */
.Lloop_aligned:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
.Lstart_realigned:
	subs	limit_wd, limit_wd, #1
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	/* endloop = diff while dwords remain (NE after the subs), or all
	   ones once the counter reaches zero, which forces the exit.  */
	csinv	endloop, diff, xzr, ne	/* Last Dword or differences.  */
	cbz	endloop, .Lloop_aligned
	/* End of performance-critical section  -- one 64B cache line.  */

	/* Not reached the limit, must have found a diff.  */
	cbnz	limit_wd, .Lnot_limit

	/* Limit % 8 == 0 => all bytes significant.  */
	ands	limit, limit, #7
	b.eq	.Lnot_limit

	/* Only the first limit % 8 bytes of the last dword count.  Build a
	   mask covering the bytes beyond the limit, clear those bytes in
	   both data words and set them in DIFF so that the search below
	   stops at the end of the significant data at the latest.  */
	lsl	limit, limit, #3	/* Bytes -> bits.  */
	mov	mask, #~0
#ifdef __AARCH64EB__
	lsr	mask, mask, limit
#else
	lsl	mask, mask, limit
#endif
	bic	data1, data1, mask
	bic	data2, data2, mask

	orr	diff, diff, mask
.Lnot_limit:

#ifndef __AARCH64EB__
	/* Little-endian: byte-reverse so that the first byte in memory
	   becomes the most significant byte.  */
	rev	diff, diff
	rev	data1, data1
	rev	data2, data2
#endif
	/* The MS-non-zero bit of DIFF marks either the first bit
	   that is different, or the end of the significant data.
	   Shifting left now will bring the critical information into the
	   top bits.  The misaligned path can arrive here with DIFF == 0
	   (everything equal): the shifts then leave two identical values
	   and the result below is 0.  */
	clz	pos, diff
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed subtraction.  Both operands are in 0..255 after
	   the shift right by 56, so the low 32 bits of the 64-bit
	   difference are the int result.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret

.Lmutual_align:
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.  */
	bic	src1, src1, #7
	bic	src2, src2, #7
	add	limit, limit, tmp1	/* Adjust the limit for the extra.  */
	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
	ldr	data1, [src1], #8
	neg	tmp1, tmp1		/* Bits to alignment -64.  */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#endif
	add	limit_wd, limit, #7
	/* Force the bytes before the start point to all ones in both
	   words so they always compare equal.  */
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	lsr	limit_wd, limit_wd, #3
	b	.Lstart_realigned

.Lret0:
	mov	result, #0
	ret

	.p2align 6
.Lmisaligned8:

	cmp	limit, #8
	b.lo	.LmisalignedLt8

.LunalignedGe8:

	/* Load the first dword with both src potentially unaligned.  */
	ldr	data1, [src1]
	ldr	data2, [src2]

	eor	diff, data1, data2	/* Non-zero if differences found.  */
	cbnz	diff, .Lnot_limit

	/* Sources are not aligned: align one of the sources.  */

	/* pos = 8 - (src1 & 7), a value in 1..8.  All of those bytes were
	   covered by the dword compared above.  */
	and	tmp1, src1, #0x7
	orr	tmp3, xzr, #0x8
	sub	pos, tmp3, tmp1

	/* Increment SRC pointers by POS so SRC1 is word-aligned.  */
	add	src1, src1, pos
	add	src2, src2, pos

	/* limit >= 8 >= pos here, so this cannot wrap.  */
	sub	limit, limit, pos
	lsr	limit_wd, limit, #3	/* Whole dwords still to compare.  */

	/* These flags are consumed by the b.ne at .Lstart_part_realigned;
	   none of the instructions in between writes NZCV.  */
	cmp	limit_wd, #0

	/* save #bytes to go back to be able to read 8byte at end
	   pos=negative offset position to read 8 bytes when len%8 != 0  */
	and	limit, limit, #7
	sub	pos, limit, #8

	b	.Lstart_part_realigned

	.p2align 5
.Lloop_part_aligned:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
	subs	limit_wd, limit_wd, #1
.Lstart_part_realigned:
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	cbnz	diff, .Lnot_limit
	b.ne	.Lloop_part_aligned	/* More whole dwords remain.  */

	/* process leftover bytes: read the leftover bytes, starting with
	   negative offset - so we can load 8 bytes.  The load ends exactly
	   at the limit and overlaps bytes already found equal.  */
	ldr	data1, [src1, pos]
	ldr	data2, [src2, pos]
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	b	.Lnot_limit

.LmisalignedLt8:
	/* Fewer than 8 bytes (and at least 1): byte loop.  LIMIT is
	   pre-decremented so that the subs below borrows (C clear) exactly
	   when the byte pair just loaded is the last one.  */
	sub	limit, limit, #1
1:
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	/* If bytes remain (C set) compare the pair; otherwise force
	   NZCV = 0 so that b.eq falls through and the loop ends.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	1b
	sub	result, data1, data2
	ret
	.size	memcmp, . - memcmp

#endif
|