/*
* Copyright 2013 Harm Hanemaaijer <fgenfb@yahoo.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*/
#ifdef CONFIG_THUMB
#define W(instr) instr.w
#define THUMB(instr...) instr
#define ARM(instr...)
#else
#define W(instr) instr
#define THUMB(instr...)
#define ARM(instr...) instr
#endif
/*
* In practice, because of the way NEON is configured on most systems,
* specifying alignment hints for NEON instructions does not seem to
* improve performance, and can even degrade it in some cases.
* However, actually having the address aligned to an element
* boundary or greater is beneficial.
*/
#define NEON_ALIGN(n)
/* #define NEON_ALIGN(n) :n */
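/*
 * When the second definition is used, NEON_ALIGN(n) expands to an
 * address alignment qualifier, so that, for example,
 *
 *     vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
 *
 * becomes
 *
 *     vst1.64 {d0-d3}, [r0 :256]!
 *
 * With the empty definition the qualifier is simply omitted.
 */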
/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
.text
.syntax unified
.arch armv7a
.fpu neon
.macro asm_function function_name
.global \function_name
.func \function_name
.type \function_name, function
ARM( .p2align 5 )
THUMB( .p2align 2 )
\function_name:
.endm
/*
* The following memcpy implementation is optimized with a fast path
* for common, word-aligned cases and optionally uses unaligned accesses
* for small sizes.
*
* - line_size is the cache line size used for prefetches. Must be 64 or 32.
* - prefetch_distance is the number of cache lines to look ahead and must be
* >= 2.
* - write_align is the write alignment enforced before the main loop for larger
* sizes (word aligned case) and must be 0, 16, 32, or 64.
* - aligned_access must be 0 or 1. When enabled, no unaligned memory accesses
* will occur. The two small-size thresholds for unaligned access are not used
* in this case.
*/
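/*
 * For example, the variants instantiated near the end of this file
 * include:
 *
 *     memcpy_variant 64, 3, 32, 0   (64-byte lines, preload 192 bytes
 *                                    ahead, 32-byte write alignment)
 *     memcpy_variant 32, 3, 8, 0    (32-byte lines, preload 96 bytes
 *                                    ahead, 8-byte write alignment)
 */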
/* The threshold size for using the fast path for the word-aligned case. */
#define FAST_PATH_THRESHOLD 256
/* The threshold size for using the small size path for the word-aligned case. */
#define SMALL_SIZE_THRESHOLD 15
/*
* The threshold size for using the small size path for the unaligned case.
* Unaligned memory accesses will be generated for requests smaller than or equal to
* this size.
*/
#define UNALIGNED_SMALL_SIZE_THRESHOLD 64
/*
* The threshold size for using the small size path when both the source and
* the destination are unaligned. Unaligned memory accesses will be generated
* for requests smaller than or equal to this size.
*/
#define BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD 32
/*
* For a code-reduced version, define all four of the above constants to 0,
* eliminating the fast path and small size special cases. With Thumb2
* enabled, this resulted in a reduction in code size from 1150 to 824 bytes,
* at the cost of lower performance for smaller sizes.
*/
// #define FAST_PATH_THRESHOLD 0
// #define SMALL_SIZE_THRESHOLD 0
// #define UNALIGNED_SMALL_SIZE_THRESHOLD 0
// #define BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD 0
/*
* EARLY_PREFETCHES is used in the fast path implementation.
* The optimal value for EARLY_PREFETCHES was determined empirically.
* It is equal to prefetch_distance + 1 for line_size 32,
* and prefetch_distance - 1 for line_size 64.
*/
#define EARLY_PREFETCHES (\prefetch_distance - (\line_size / 32) * 2 + 3)
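/*
 * Working the expression out: for line_size 32 this gives
 * prefetch_distance - 2 + 3 = prefetch_distance + 1, and for
 * line_size 64 it gives prefetch_distance - 4 + 3 =
 * prefetch_distance - 1, matching the description above.
 */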
#if FAST_PATH_THRESHOLD > 0
#define FAST_PATH(instr...) instr
#define NO_FAST_PATH(instr...)
#else
#define FAST_PATH(instr...)
#define NO_FAST_PATH(instr...) instr
#endif
/* Helper macro for the fast-path implementation. */
.macro copy_16_bytes bytes_to_go, line_size, prefetch_distance
#ifdef CONFIG_THUMB
/*
* When Thumb2 mode is enabled, the ldmia/stmia instructions
* will be 16-bit, and the preload instruction will be
* 32-bit, so we only need one 32-bit wide nop instruction
* when there's no preload, for a total size of two words.
*/
.if \bytes_to_go >= (EARLY_PREFETCHES * \line_size) && \
(\bytes_to_go % \line_size) == 0
pld [r1, ip]
ldmia r1!, {r3, r4, r5, r6}
stmia r0!, {r3, r4, r5, r6}
.else
ldmia r1!, {r3, r4, r5, r6}
W( nop )
stmia r0!, {r3, r4, r5, r6}
.endif
#else
/*
* When ARM mode is enabled, every instruction is one word,
* so make sure the entire block is four instructions.
*/
.if \bytes_to_go >= (EARLY_PREFETCHES * \line_size) && \
(\bytes_to_go % \line_size) == 0
pld [r1, ip]
.else
nop
.endif
ldmia r1!, {r3, r4, r5, r6}
nop
stmia r0!, {r3, r4, r5, r6}
#endif
.endm
/* Helper macro implementing unaligned copy. */
.macro unaligned_copy shift, line_size, prefetch_distance, write_align, \
aligned_access
/*
* ip is the aligned source base address.
* r3 is a word of data from the source.
*/
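/*
 * Sketch of the merging scheme used below (assuming the usual
 * little-endian configuration): with shift equal to 8, 16 or 24
 * depending on the source misalignment, each output word is
 *
 *     out = (prev_word >> shift) | (next_word << (32 - shift));
 *
 * where prev_word is the previously loaded source word (kept in r3)
 * and next_word is the word loaded next.
 */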
.if \write_align > 0
cmp r2, #(32 + \write_align - 4)
.else
cmp r2, #32
.endif
push {r5}
blt 55f
subs r2, r2, #32
/* Handle write alignment. */
.if \write_align > 0
.if \write_align == 8
tst r0, #4
mov r4, r3, lsr #\shift
ldrne r3, [r1], #4
subne r2, r2, #4
orrne r4, r4, r3, lsl #(32 - \shift)
strne r4, [r0], #4
.else
ands r5, r0, #(\write_align - 1)
rsb r5, r5, #\write_align
beq 59f
sub r2, r2, r5
58: movs r4, r3, lsr #\shift
ldr r3, [r1], #4
subs r5, r5, #4
orr r4, r4, r3, lsl #(32 - \shift)
str r4, [r0], #4
bgt 58b
59:
.endif
.endif
/*
* Assume a preload at aligned base + line_size will
* be useful.
*/
pld [ip, #\line_size]
push {r6-r11}
mov r11, r3
mov r4, ip
add r5, r1, #(\prefetch_distance * \line_size)
subs r2, r2, #(\prefetch_distance * \line_size)
bic r3, r5, #31
add r4, r4, #(2 * \line_size)
blt 54f
cmp r4, r3
sub ip, r3, r1
/*
* "Catch-up" the early preloads (which have been performed up
* to aligned source address + line_size) to the preload offset
* used in the main loop.
*/
bge 52f
51: adds r4, r4, #\line_size /* Thumb16 */
cmp r4, r3
pld [r4, #(- \line_size)]
blt 51b
52:
/*
* Note that when L1_CACHE_BYTES is 64, we are
* prefetching every 32 bytes. Although this is not optimal,
* there does not seem to be a big penalty for the extra
* preload instructions, and it avoids greater
* code size and complexity.
*/
53: pld [r1, ip]
54:
ldmia r1!, {r4-r7}
mov r3, r11, lsr #\shift
ldmia r1!, {r8-r11}
orr r3, r3, r4, lsl #(32 - \shift)
movs r4, r4, lsr #\shift /* Thumb16 */
orr r4, r4, r5, lsl #(32 - \shift)
movs r5, r5, lsr #\shift /* Thumb16 */
orr r5, r5, r6, lsl #(32 - \shift)
movs r6, r6, lsr #\shift /* Thumb16 */
orr r6, r6, r7, lsl #(32 - \shift)
movs r7, r7, lsr #\shift /* Thumb16 */
orr r7, r7, r8, lsl #(32 - \shift)
mov r8, r8, lsr #\shift
orr r8, r8, r9, lsl #(32 - \shift)
mov r9, r9, lsr #\shift
orr r9, r9, r10, lsl #(32 - \shift)
mov r10, r10, lsr #\shift
orr r10, r10, r11, lsl #(32 - \shift)
subs r2, r2, #32
stmia r0!, {r3-r10}
bge 53b
cmn r2, #(\prefetch_distance * \line_size)
bge 54b
/* Correct the count. */
adds r2, r2, #(\prefetch_distance * \line_size + 32)
mov r3, r11
pop {r6-r11}
55: bics r5, r2, #3
beq 57f
56: movs r4, r3, lsr #\shift
ldr r3, [r1], #4
subs r5, r5, #4
orr r4, r4, r3, lsl #(32 - \shift)
str r4, [r0], #4
bgt 56b
57: pop {r5}
pop {r4}
subs r1, r1, #((32 - \shift) / 8)
.if \aligned_access == 1
b 7b
.else
b 3b
.endif
.endm
/* The main memcpy function macro. */
.macro memcpy_variant line_size, prefetch_distance, write_align, \
aligned_access
.if \aligned_access == 1
cmp r2, #3
.else
NO_FAST_PATH( cmp r2, #3 )
.endif
orr r3, r0, r1
.if \aligned_access == 1
push {r0}
ble 7f
.else
NO_FAST_PATH( push {r0} )
NO_FAST_PATH( ble 3f )
.endif
bic ip, r1, #(\line_size - 1)
tst r3, #3
pld [ip]
.if \aligned_access == 1
FAST_PATH( bne 30f )
.else
FAST_PATH( push {r0} )
FAST_PATH( bne 7f ) /* Unaligned source or destination. */
.endif
FAST_PATH( cmp r2, #FAST_PATH_THRESHOLD )
FAST_PATH( bgt 10f )
NO_FAST_PATH( bne 30f )
#if FAST_PATH_THRESHOLD == 0
/*
* When the fast path is disabled, check whether there are
* enough bytes for alignment, and jump to the main handling
* code for larger sizes.
*/
.if \write_align > 0
cmp r2, #(\write_align - 4)
bge 10f
.endif
push {r4}
b 18f
#endif
/*
* Fast path for aligned copies of size <= FAST_PATH_THRESHOLD.
*/
#if FAST_PATH_THRESHOLD > 0
#if SMALL_SIZE_THRESHOLD == 15
bics r3, r2, #15
pld [ip, #\line_size]
/* Jump for small sizes <= 15 bytes. */
beq 5f
#else
cmp r2, #SMALL_SIZE_THRESHOLD
pld [ip, #\line_size]
/* Jump for small sizes <= SMALL_SIZE_THRESHOLD bytes. */
ble 5f
bic r3, r2, #15
#endif
9: /*
* This is the entry-point into the fast path from
* an unaligned request that has been aligned.
*/
push {r4, r5, r6}
/*
* Use a heuristic to determine whether the preload
* at aligned_base + 2 * line_size will be useful.
*/
.if EARLY_PREFETCHES >= 3
cmp r2, #(2 * \line_size - \line_size / 2)
.endif
add r5, ip, #(EARLY_PREFETCHES * \line_size)
.if EARLY_PREFETCHES >= 3
blt 1f
.endif
.if EARLY_PREFETCHES == 3
pld [ip, #(2 * \line_size)]
.endif
.if EARLY_PREFETCHES == 4
cmp r2, #(3 * \line_size - \line_size / 2)
pld [ip, #(2 * \line_size)]
blt 1f
pld [ip, #(3 * \line_size)]
.endif
.if EARLY_PREFETCHES == 5
cmp r2, #(3 * \line_size - \line_size / 2)
pld [ip, #(2 * \line_size)]
blt 1f
cmp r2, #(4 * \line_size - \line_size / 2)
pld [ip, #(3 * \line_size)]
blt 1f
pld [ip, #(4 * \line_size)]
.endif
1: /*
* Set r5 so that the next preload will occur
* exactly at aligned_base + EARLY_PREFETCHES *
* line_size. For example, if line_size is 64
* and the number of bytes is 240, the next preload
* will occur after processing 48 bytes, which is derived
* from the formula r3 & (line_size - 1),
* where r3 is equal to number_of_bytes & (~15).
*/
rsb r4, r3, #256
subs r5, r5, r1
and ip, r3, #(\line_size - 1)
subs r2, r2, r3 /* Thumb16 */
THUMB( lsrs r4, r4, #1 ) /* Thumb16 */
sub ip, r5, ip
add pc, pc, r4
nop
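/*
 * The add pc, pc, r4 above is a computed branch into the sequence of
 * copy_16_bytes blocks below. r4 holds 256 minus the rounded-down
 * byte count, which is the number of code bytes to skip in ARM mode
 * (each block is four words); for Thumb2 it has been halved since
 * each block is only two words. The trailing nop fills the slot
 * implied by pc reading ahead of the add instruction.
 */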
/* >= 256 bytes to go. */
copy_16_bytes 256, \line_size, \prefetch_distance
/* >= 240 bytes to go. */
copy_16_bytes 240, \line_size, \prefetch_distance
/* >= 224 bytes to go. */
copy_16_bytes 224, \line_size, \prefetch_distance
/* >= 208 bytes to go. */
copy_16_bytes 208, \line_size, \prefetch_distance
/* >= 192 bytes to go. */
copy_16_bytes 192, \line_size, \prefetch_distance
/* >= 176 bytes to go. */
copy_16_bytes 176, \line_size, \prefetch_distance
/* >= 160 bytes to go. */
copy_16_bytes 160, \line_size, \prefetch_distance
/* >= 144 bytes to go. */
copy_16_bytes 144, \line_size, \prefetch_distance
/* >= 128 bytes to go. */
copy_16_bytes 128, \line_size, \prefetch_distance
/* >= 112 bytes to go. */
copy_16_bytes 112, \line_size, \prefetch_distance
/* >= 96 bytes to go. */
copy_16_bytes 96, \line_size, \prefetch_distance
/* >= 80 bytes to go. */
copy_16_bytes 80, \line_size, \prefetch_distance
/* >= 64 bytes to go. */
copy_16_bytes 64, \line_size, \prefetch_distance
/* >= 48 bytes to go. */
copy_16_bytes 48, \line_size, \prefetch_distance
/* >= 32 bytes to go. */
copy_16_bytes 32, \line_size, \prefetch_distance
/* At this point there are 16 to 31 bytes to go. */
tst r2, #15
ldmia r1!, {r3, r4, r5, r6}
cmpne r2, #8
/*
* If r2 == 8, we need to clear the eq flag while
* making sure carry remains set.
*/
tsteq r2, #15
stmia r0!, {r3, r4, r5, r6}
/*
* The equal flag is set if there are no bytes left.
* The carry flag is set if there are >= 8 bytes left.
*/
pop {r4, r5, r6}
beq 4f
2:
/*
* ARM mode imposes restrictions on the registers used
* in double-word loads and stores, so we have to use
* single-word operations.
*/
.if \aligned_access == 0
ARM( ldrcs r3, [r1], #4 )
ARM( ldrcs ip, [r1], #4 )
ARM( strcs r3, [r0], #4 )
ARM( strcs ip, [r0], #4 )
THUMB( ldrdcs r3, ip, [r1], #8 )
THUMB( strdcs r3, ip, [r0], #8 )
.else
ldrcs r3, [r1], #4
ldrcs ip, [r1], #4
strcs r3, [r0], #4
strcs ip, [r0], #4
.endif
tst r2, #4
ldrne ip, [r1], #4
strne ip, [r0], #4
tst r2, #3
popeq {r0}
bxeq lr
/*
* Handle the last up to three bytes. Unaligned access
* may take place if the source or destination is not
* half-word aligned.
*/
3: movs r2, r2, lsl #31
ldrhcs r3, [r1], #2
strhcs r3, [r0], #2
ldrbne r3, [r1], #1
strbne r3, [r0], #1
4: pop {r0}
bx lr
5: /*
* Sizes <= SMALL_SIZE_THRESHOLD bytes, both source and
* destination aligned.
*/
#if SMALL_SIZE_THRESHOLD <= 15
cmp r2, #8 /* cs if r2 >= 8. */
b 2b
#else
101: tst r2, #4
ldrne r3, [r1], #4
subne r2, r2, #4
strne r3, [r0], #4
cmp r2, #8
blt 3b
6: cmp r2, #16
ldr r3, [r1], #4
ldr ip, [r1], #4
str r3, [r0], #4
sub r2, r2, #8
str ip, [r0], #4
bge 6b
cmp r2, #0
popeq {r0}
bxeq lr
b 3b
#endif
#endif /* FAST_PATH_THRESHOLD > 0 */
.if \aligned_access == 1
/*
* Handle the last up to three bytes avoiding
* unaligned memory access.
*/
7: movs r2, r2, lsl #31
ldrbcs r3, [r1], #1
ldrbcs ip, [r1], #1
strbcs r3, [r0], #1
strbcs ip, [r0], #1
ldrbne r3, [r1], #1
strbne r3, [r0], #1
pop {r0}
bx lr
.endif
#if FAST_PATH_THRESHOLD > 0
.if \aligned_access == 0
7: /*
* Unaligned source or destination. There are separate small-size
* thresholds for the case where both source and destination are
* unaligned and the case where only one of them is.
*/
tst r0, #3
mov r3, #UNALIGNED_SMALL_SIZE_THRESHOLD
tstne r1, #3
movne r3, #BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD
cmp r2, r3
bgt 30f
/* Small sizes, unaligned case. Use single word load/stores. */
#if SMALL_SIZE_THRESHOLD >= 16
/* Use the identical code path already defined above. */
b 101b
#else
tst r2, #4
ldrne r3, [r1], #4
subne r2, r2, #4
strne r3, [r0], #4
cmp r2, #8
blt 3b
8: cmp r2, #16
ldr r3, [r1], #4
ldr ip, [r1], #4
str r3, [r0], #4
sub r2, r2, #8
str ip, [r0], #4
bge 8b
b 3b
#endif
.endif
#endif /* FAST_PATH_THRESHOLD > 0 */
10: /*
* This is the start of the handling of larger sizes for
* aligned copies.
*
* Size > FAST_PATH_THRESHOLD (256).
* ip is the line_sized aligned source address for preloads.
*/
.if \write_align >= 16
ands r3, r0, #(\write_align - 1)
push {r4}
rsb r3, r3, #\write_align
beq 17f
push {lr}
bl 20f
pop {lr}
17:
.elseif \write_align == 8
/*
* For write alignment of 8, it is quickest to do a simple
* conditional load/store.
*/
tst r0, #4
push {r4}
ldrne r3, [r1], #4
subne r2, r2, #4
strne r3, [r0], #4
.else
push {r4}
.endif
18:
.if (FAST_PATH_THRESHOLD - (\write_align - 4)) < \line_size
cmp r2, #\line_size
blt 15f
.endif
subs r2, r2, #\line_size
16: /*
* This is the entry-point when source and destination were
* initially unaligned but are now aligned because they had
* the same alignment within a word. Write alignment and
* the size check have already been handled.
*/
push {r5-r11}
/*
* Assume a preload at aligned base + line_size will
* be useful.
*/
mov r4, ip
pld [ip, #\line_size]
add r5, r1, #(\prefetch_distance * \line_size)
subs r2, r2, #(\prefetch_distance * \line_size)
bic r3, r5, #(\line_size - 1)
add r4, r4, #(2 * \line_size)
blt 14f
cmp r4, r3
sub ip, r3, r1
/*
* "Catch-up" the early preloads (which have been performed up
* to aligned source address + line_size) to the preload offset
* used in the main loop.
*/
bge 12f
11: adds r4, r4, #\line_size /* Thumb16 */
cmp r4, r3
pld [r4, #(- \line_size)]
blt 11b
12:
/*
* The main loop for large sizes. Copy 32 bytes at a time
* using ldmia/stmia while prefetching a 32-byte aligned
* address for line size 32, or 64 bytes at a time while
* prefetching a 64-byte aligned address for line size 64.
*/
13: pld [r1, ip]
14:
.if \line_size == 32
ldmia r1!, {r4-r7}
subs r2, r2, #32
ldmia r1!, {r8-r11}
stmia r0!, {r4-r7}
stmia r0!, {r8-r11}
.else
ldmia r1!, {r4-r11}
subs r2, r2, #64
stmia r0!, {r4-r11}
ldmia r1!, {r4-r11}
stmia r0!, {r4-r11}
.endif
bge 13b
cmn r2, #(\prefetch_distance * \line_size)
bge 14b
/* Correct the count. */
adds r2, r2, #((\prefetch_distance + 1) * \line_size)
pop {r5-r11}
15: ands r3, r2, #60
.if \write_align <= 8
/*
* When the subroutine is not used for write alignment, it
* will only be called once, so branch without
* linking.
*/
bne 20f
19:
.else
mov ip, lr
blne 20f
mov lr, ip
.endif
pop {r4}
#if FAST_PATH_THRESHOLD > 0
cmp r2, #0
bne 3b
#else
ARM( cmp r2, #0 )
ARM( beq 4f )
THUMB( cbz r2, 4f )
/* Handle the last up to three bytes. */
3: movs r2, r2, lsl #31
ldrhcs r3, [r1], #2
strhcs r3, [r0], #2
ldrbne r3, [r1], #1
strbne r3, [r0], #1
4:
#endif
pop {r0}
bx lr
/*
* Subroutine that copies r3 bytes, where r3 is a multiple of 4
* from 0 up to 32 or 64. r2 is decremented by the
* number of bytes copied.
*/
20: tst r3, #4
sub r2, r2, r3
ldrne r4, [r1], #4
subne r3, r3, #4
strne r4, [r0], #4
.if \write_align <= 32 && \line_size == 32
rsb r3, r3, #32
.else
rsb r3, r3, #64
.endif
/*
* These ldmia/stmia instructions are 16-bit on Thumb2,
* 32-bit on ARM.
*/
THUMB( lsrs r3, r3, #1 )
add pc, pc, r3
nop
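/*
 * The add pc, pc, r3 above is again a computed branch, this time
 * into the ldmia/stmia pairs below. r3 holds the number of code
 * bytes to skip so that exactly the remaining multiple of 8 bytes
 * is copied: each pair copies 8 bytes and occupies two words in ARM
 * mode, or two halfwords in Thumb2 (hence the extra shift above).
 */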
ldmia r1!, {r3, r4}
stmia r0!, {r3, r4}
ldmia r1!, {r3, r4}
stmia r0!, {r3, r4}
ldmia r1!, {r3, r4}
stmia r0!, {r3, r4}
ldmia r1!, {r3, r4}
stmia r0!, {r3, r4}
.if \write_align > 32 || \line_size > 32
ldmia r1!, {r3, r4}
stmia r0!, {r3, r4}
ldmia r1!, {r3, r4}
stmia r0!, {r3, r4}
ldmia r1!, {r3, r4}
stmia r0!, {r3, r4}
ldmia r1!, {r3, r4}
stmia r0!, {r3, r4}
.endif
.if \write_align <= 8
b 19b
.else
mov pc, lr
.endif
30: /*
* Unaligned case. Align the destination.
* Number of bytes is > UNALIGNED_SMALL_SIZE_THRESHOLD.
* Note: This may use unaligned access.
* ip is the line_size aligned source address for preloads.
*/
ands r3, r0, #3
push {r4}
andeq r3, r1, #3
beq 40f /* Destination is aligned but source is not. */
/* Align the destination. */
cmp r3, #2
.if \aligned_access == 1
ldrble r4, [r1], #1
ldrble r3, [r1], #1
suble r2, r2, #2
strble r4, [r0], #1
strble r3, [r0], #1
.else
ldrhle r4, [r1], #2
suble r2, r2, #2
strhle r4, [r0], #2
.endif
ldrbne r4, [r1], #1
subne r2, r2, #1
strbne r4, [r0], #1
ands r3, r1, #3
bne 40f /* Destination is aligned but source is not. */
#if 0 && FAST_PATH_THRESHOLD > 0
/*
* Source and destination are now aligned.
* Now recreate the situation of a word-aligned memcpy
* with the current source and destination,
* which may require an extra preload instruction.
*
* This path is currently disabled in favour of the one
* below, which does write alignment and
* jumps into the main loop for larger sizes.
*/
bic r3, r1, #(\line_size - 1)
pop {r4}
cmp r3, ip
THUMB( pldne [r3] )
THUMB( cmp r2, #FAST_PATH_THRESHOLD )
THUMB( mov ip, r3 )
ARM( beq 31f )
ARM( pld [r3] )
ARM( mov ip, r3 )
31: ARM( cmp r2, #FAST_PATH_THRESHOLD )
bgt 10b
/*
* Recreate the fast path small size check here,
* but only if it is necessary.
*/
.if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) <= SMALL_SIZE_THRESHOLD || \
\aligned_access == 1
cmp r2, #SMALL_SIZE_THRESHOLD
pld [ip, #\line_size]
/* Jump for small sizes <= SMALL_SIZE_THRESHOLD bytes. */
ble 5b
.else
pld [ip, #\line_size]
.endif
bic r3, r2, #15
b 9b
#else
/*
* Source and destination are now aligned. Check carefully
* whether there are enough bytes to do alignment.
*/
.if \write_align > 0
.if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) < (\write_align - 4) \
|| \aligned_access == 1
cmp r2, #(\write_align - 4)
blt 31f
.endif
.if \write_align == 8
/*
* For write alignment of 8, it is quickest to do a simple
* conditional load/store.
*/
tst r0, #4
ldrne r3, [r1], #4
subne r2, r2, #4
strne r3, [r0], #4
.else
ands r3, r0, #(\write_align - 1)
rsb r3, r3, #\write_align
beq 31f
push {lr}
bl 20b
pop {lr}
.endif
31: /*
* Check whether there are enough bytes to do one iteration
* of the main loop.
*/
.if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3 - (\write_align - 4)) < \line_size \
|| \aligned_access == 1
cmp r2, #\line_size
blt 15b
.endif
subs r2, r2, #\line_size
.else
/*
* No write alignment. Only have to check for enough bytes to
* do one iteration of the main loop.
*/
.if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) < \line_size \
|| \aligned_access == 1
cmp r2, #\line_size
blt 15b
.endif
subs r2, r2, #\line_size
.endif
b 16b
#endif
40: /*
* Unaligned case. Size is > SMALL_SIZE_THRESHOLD - 3.
*/
bic r1, r1, #3
cmp r3, #2
ldr r3, [r1], #4
beq 41f
bgt 42f
unaligned_copy 8, \line_size, \prefetch_distance, \
\write_align, \aligned_access
41: unaligned_copy 16, \line_size, \prefetch_distance, \
\write_align, \aligned_access
42: unaligned_copy 24, \line_size, \prefetch_distance, \
\write_align, \aligned_access
.endm
/*
* The following is a NEON-based memcpy implementation that may use unaligned
* access, but NEON instruction addresses are always at least element aligned.
* It is optimized for both Thumb2 (CONFIG_THUMB) and ARM mode.
*
* - line_size is the cache line size used for prefetches. Must be 64 or 32.
* - prefetch_distance is the number of cache lines to look ahead and must be
* >= 2, or 0 to disable prefetching in the main copying loop.
* - early_prefetch indicates whether to perform early preloads. Must be 0 or 1.
* When prefetch_distance > 0, early_prefetch should be 1. To remove all PLD
* instructions altogether, set both prefetch_distance and early_prefetch
* to 0.
*/
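/*
 * For example, the NEON variants instantiated near the end of this
 * file include:
 *
 *     neon_memcpy_variant 64, 3, 1   (64-byte lines, preload 192
 *                                     bytes ahead)
 *     neon_memcpy_variant 32, 0, 1   (32-byte lines, early preloads
 *                                     only, no preloads in the main
 *                                     loop)
 */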
.macro neon_memcpy_variant line_size, prefetch_distance, early_prefetch
cmp r2, #3
.if \prefetch_distance > 0 || \early_prefetch == 1
push {r0}
.else
mov ip, r0
.endif
orr r3, r0, r1
ble 8f
.if \prefetch_distance > 0 || \early_prefetch == 1
bic ip, r1, #(\line_size - 1)
.endif
tst r3, #3
.if \early_prefetch == 1
pld [ip]
.endif
bne 10f /* Unaligned source or destination. */
push {r4}
/* Aligned source and destination. */
1: cmp r2, #256
/*
* Jump to the word-aligned NEON fast path for <= 256 bytes.
*/
ble 18f
subs r2, r2, #\line_size
/* Align to a 32-byte boundary. */
#ifdef CONFIG_THUMB
/*
* Use conditional NEON instructions when
* available (Thumb2 mode)
*/
ands r4, r0, #31
rsb r4, r4, #32
beq 31f
tst r4, #4
sub r2, r2, r4
ldrne r3, [r1 :32], #4
strne r3, [r0 :32], #4
tst r4, #8
vld1ne.32 {d0}, [r1]!
vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]!
cmp r4, #16
vld1ge.32 {d2, d3}, [r1]!
vst1ge.64 {d2, d3}, [r0 NEON_ALIGN(128)]!
#else
/*
* Otherwise, branch into a series of single
* loads/stores.
*/
ands r4, r0, #31
beq 31f
rsb r3, r4, #32
lsl r4, r4, #1
sub r2, r2, r3
add pc, pc, r4
nop
ldr r3, [r1], #4
str r3, [r0], #4
ldr r4, [r1], #4
str r4, [r0], #4
ldr r3, [r1], #4
str r3, [r0], #4
ldr r4, [r1], #4
str r4, [r0], #4
ldr r3, [r1], #4
str r3, [r0], #4
ldr r4, [r1], #4
str r4, [r0], #4
ldr r3, [r1], #4
str r3, [r0], #4
ldr r4, [r1], #4
str r4, [r0], #4
#endif
cmp r2, #0
addlt r2, r2, #\line_size
blt 6f
31:
.if \early_prefetch == 1
pld [ip, #\line_size]
.endif
.if \prefetch_distance > 0
/*
* Assume a preload at aligned base + line_size will
* be useful.
*/
push {r5}
mov r4, ip
add r5, r1, #(\prefetch_distance * \line_size)
subs r2, r2, #(\prefetch_distance * \line_size)
bic r3, r5, #(\line_size - 1)
add r4, r4, #(2 * \line_size)
blt 5f
cmp r4, r3
sub ip, r3, r1
/*
* "Catch-up" the early preloads (which have been performed up
* to aligned source address + line_size) to the preload offset
* used in the main loop.
*/
bge 3f
2: adds r4, r4, #\line_size /* Thumb16 */
cmp r4, r3
pld [r4, #(- \line_size)]
blt 2b
3:
.endif
sub ip, ip, #\line_size
4:
/*
* Since the destination is 32-byte aligned,
* specify 256-bit alignment for the NEON stores.
*/
.if \line_size == 32
vld1.32 {d0-d3}, [r1]!
subs r2, r2, #32
.if \prefetch_distance > 0
pld [r1, ip]
.endif
vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
.else /* line_size == 64 */
vld1.32 {d0-d3}, [r1]!
vld1.32 {d4-d7}, [r1]!
.if \prefetch_distance > 0
pld [r1, ip]
.endif
vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
subs r2, r2, #64
vst1.64 {d4-d7}, [r0 NEON_ALIGN(256)]!
.endif
bge 4b
.if \prefetch_distance > 0
5:
.if \line_size == 32
vld1.32 {d0-d3}, [r1]!
subs r2, r2, #32
vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
.else /* line_size == 64 */
vld1.32 {d0-d3}, [r1]!
vld1.32 {d4-d7}, [r1]!
vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
subs r2, r2, #64
vst1.64 {d4-d7}, [r0 NEON_ALIGN(256)]!
.endif
cmn r2, #(\prefetch_distance * \line_size)
bge 5b
.endif
/* Correct the count. */
23: adds r2, r2, #((\prefetch_distance + 1) * \line_size)
.if \prefetch_distance > 0
pop {r5}
.endif
/*
* Process the last 0-(line_size - 1) bytes, destination
* 32-byte aligned, source word aligned.
*/
6:
#ifdef CONFIG_THUMB
/*
* Use conditional NEON instructions when
* available (Thumb2 mode).
*/
.if \line_size == 64
cmp r2, #32
vld1ge.32 {d0-d3}, [r1]!
vst1ge.64 {d0-d3}, [r0 NEON_ALIGN(128)]!
tst r2, #16
vld1ne.32 {d0, d1}, [r1]!
vst1ne.64 {d0, d1}, [r0 NEON_ALIGN(128)]!
.else
cmp r2, #16
vld1ge.32 {d0, d1}, [r1]!
vst1ge.64 {d0, d1}, [r0 NEON_ALIGN(128)]!
.endif
tst r2, #8
vld1ne.32 {d2}, [r1]!
vst1ne.64 {d2}, [r0 NEON_ALIGN(64)]!
tst r2, #4
ldrne r3, [r1], #4
strne r3, [r0 :32], #4
pop {r4}
#else
/*
* Just use the word-aligned tail code if we
* don't have Thumb2.
*/
b 17f
#endif
/*
* Handle the last up to three bytes. Unaligned access
* may take place if source or destination is not
* half-word aligned.
*/
8: movs r2, r2, lsl #31
ldrhcs r3, [r1], #2
strhcs r3, [r0], #2
ldrbne r3, [r1], #1
strbne r3, [r0]
9:
.if \prefetch_distance > 0 || \early_prefetch == 1
pop {r0}
.else
mov r0, ip
.endif
bx lr
10: /*
* Unaligned case. Align the destination.
* Number of bytes is > 3.
* Note: This may use unaligned access.
* ip is the line_size aligned source address for preloads.
*/
cmp r2, #64
push {r4}
/* For small sizes < 64 bytes just use the unaligned tail code. */
blt 16f
ands r3, r0, #3
beq 11f /* Destination is aligned but source is not. */
/* Align the destination. */
cmp r3, #2
ldrbne r4, [r1], #1
subne r2, r2, #1
strbne r4, [r0], #1
ldrhle r4, [r1], #2
suble r2, r2, #2
strhle r4, [r0], #2
tst r1, #3
beq 1b /* Destination and source are now aligned. */
/* Destination is now aligned to a word boundary. */
11:
cmp r2, #64
/*
* Jump to non-aligned NEON tail code for <= 64 bytes.
*/
ble 16f
subs r2, r2, #\line_size
/* Align destination to a 32-byte boundary. */
ands r4, r0, #31
rsb r4, r4, #32
beq 20f
tst r4, #4
sub r2, r2, r4
ldrne r3, [r1 :8], #4 /* Unaligned access. */
strne r3, [r0 :32], #4
tst r4, #8
#ifdef CONFIG_THUMB
/*
* Use conditional NEON instructions when
* available (Thumb2 mode)
*/
vld1ne.8 {d0}, [r1]!
vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]!
cmp r4, #16
vld1ge.8 {d2, d3}, [r1]!
vst1ge.64 {d2, d3}, [r0 NEON_ALIGN(128)]!
#else
beq 31f
vld1.8 {d0}, [r1]!
vst1.64 {d0}, [r0 NEON_ALIGN(64)]!
31: cmp r4, #16
blt 32f
vld1.8 {d2, d3}, [r1]!
vst1.64 {d2, d3}, [r0 NEON_ALIGN(128)]!
32:
#endif
cmp r2, #0
addlt r2, r2, #\line_size
blt 16f
20:
.if \early_prefetch == 1
pld [ip, #\line_size]
.endif
.if \prefetch_distance > 0
/*
* Assume a preload at aligned base + line_size will
* be useful.
*/
push {r5}
mov r4, ip
add r5, r1, #(\prefetch_distance * \line_size)
subs r2, r2, #(\prefetch_distance * \line_size)
bic r3, r5, #(\line_size - 1)
add r4, r4, #(2 * \line_size)
blt 15f
cmp r4, r3
sub ip, r3, r1
/*
* "Catch-up" the early preloads (which have been performed up
* to aligned source address + line_size) to the preload offset
* used in the main loop.
*/
bge 13f
12: adds r4, r4, #\line_size /* Thumb16 */
cmp r4, r3
pld [r4, #(- \line_size)]
blt 12b
.endif
13:
/*
* Process 64 unaligned bytes from source at a time and copy
* them to the 32-byte aligned destination.
*/
14:
.if \prefetch_distance > 0
pld [r1, ip]
.endif
15:
.if \line_size == 32
vld1.8 {d0-d3}, [r1]!
subs r2, r2, #32
vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
.else /* line_size == 64 */
vld1.8 {d0-d3}, [r1]!
vld1.8 {d4-d7}, [r1]!
vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
subs r2, r2, #64
vst1.64 {d4-d7}, [r0 NEON_ALIGN(256)]!
.endif
bge 14b
.if \prefetch_distance > 0
cmn r2, #(\prefetch_distance * \line_size)
bge 15b
.endif
/* Correct the count. */
adds r2, r2, #((\prefetch_distance + 1) * \line_size)
.if \prefetch_distance > 0
pop {r5}
.endif
/*
* Handle last 0-(line_size - 1) bytes (destination 32-byte
* aligned, source unaligned).
*/
#ifdef CONFIG_THUMB
/*
* Use conditional NEON instructions when
* available (Thumb2 mode)
*/
.if \line_size == 64
cmp r2, #32
vld1ge.8 {d0-d3}, [r1]!
vst1ge.64 {d0-d3}, [r0 NEON_ALIGN(128)]!
tst r2, #16
vld1ne.8 {d0, d1}, [r1]!
vst1ne.64 {d0, d1}, [r0 NEON_ALIGN(128)]!
.else
cmp r2, #16
vld1ge.8 {d0, d1}, [r1]!
vst1ge.64 {d0, d1}, [r0 NEON_ALIGN(128)]!
.endif
tst r2, #8
vld1ne.8 {d2}, [r1]!
vst1ne.64 {d2}, [r0 NEON_ALIGN(64)]!
tst r2, #4
ldrne r3, [r1], #4
strne r3, [r0 :32], #4
pop {r4}
b 8b
#else
/*
* Fall through to the code below. It is not entirely
* optimal because it does not indicate the destination
* is word aligned.
*/
#endif
/* Handle small size of 0-63 bytes, unaligned. */
16: bic r3, r2, #7
rsb r4, r3, #64
tst r2, #7
add pc, pc, r4
nop
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0]!
vld1.8 {d1}, [r1]!
vst1.8 {d1}, [r0]!
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0]!
vld1.8 {d1}, [r1]!
vst1.8 {d1}, [r0]!
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0]!
vld1.8 {d1}, [r1]!
vst1.8 {d1}, [r0]!
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0]!
vld1.8 {d1}, [r1]!
vst1.8 {d1}, [r0]!
pop {r4}
beq 9b
tst r2, #4
ldrne r3, [r1 :8], #4 /* Unaligned access. */
strne r3, [r0], #4
b 8b
/* Handle small size of 0-63 bytes, word aligned. */
17:
#ifdef CONFIG_THUMB
cmp r2, #32
vld1ge.32 {d0-d3}, [r1]!
vst1ge.32 {d0-d3}, [r0]!
tst r2, #16
vld1ne.32 {d0, d1}, [r1]!
vst1ne.32 {d0, d1}, [r0]!
tst r2, #8
vld1ne.32 {d2}, [r1]!
vst1ne.32 {d2}, [r0]!
tst r2, #7
#else
bic r3, r2, #7
rsb r4, r3, #64
tst r2, #7
add pc, pc, r4
nop
vld1.32 {d0}, [r1]!
vst1.32 {d0}, [r0]!
vld1.32 {d1}, [r1]!
vst1.32 {d1}, [r0]!
vld1.32 {d0}, [r1]!
vst1.32 {d0}, [r0]!
vld1.32 {d1}, [r1]!
vst1.32 {d1}, [r0]!
vld1.32 {d0}, [r1]!
vst1.32 {d0}, [r0]!
vld1.32 {d1}, [r1]!
vst1.32 {d1}, [r0]!
vld1.32 {d0}, [r1]!
vst1.32 {d0}, [r0]!
vld1.32 {d1}, [r1]!
vst1.32 {d1}, [r0]!
#endif
pop {r4}
beq 9b
tst r2, #4
ldrne r3, [r1], #4
strne r3, [r0], #4
b 8b
/*
* Fast path for <= 256 bytes, word aligned.
* This is hardcoded for a preload offset of 128 bytes,
* which seems to work well in practice for small sizes.
*/
18: bics r3, r2, #31
.if \early_prefetch == 1
pld [ip, #32]
beq 21f
pld [ip, #64]
pld [ip, #96]
.endif
rsb r4, r3, #256
ands r2, r2, #31
/*
* Each code block handling 32 bytes is
* 12 bytes long.
*/
lsr r4, r4, #2
add ip, ip, #128
add r4, r4, r4, lsr #1
sub ip, ip, r1
add pc, pc, r4
nop
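/*
 * Worked example of the computed branch above: each of the eight
 * blocks below copies 32 bytes in 12 bytes of code. For r3 = 192
 * (six blocks needed), r4 becomes (256 - 192) / 4 = 16 plus
 * 16 >> 1 = 8, i.e. 24 code bytes, which skips exactly the two
 * blocks that are not needed.
 */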
pld [r1, ip]
vld1.32 {d0-d3}, [r1]!
vst1.32 {d0-d3}, [r0]!
pld [r1, ip]
vld1.32 {d4-d7}, [r1]!
vst1.32 {d4-d7}, [r0]!
pld [r1, ip]
vld1.32 {d0-d3}, [r1]!
vst1.32 {d0-d3}, [r0]!
pld [r1, ip]
vld1.32 {d4-d7}, [r1]!
vst1.32 {d4-d7}, [r0]!
pld [r1, ip]
vld1.32 {d0-d3}, [r1]!
vst1.32 {d0-d3}, [r0]!
W(nop)
vld1.32 {d4-d7}, [r1]!
vst1.32 {d4-d7}, [r0]!
W(nop)
vld1.32 {d0-d3}, [r1]!
vst1.32 {d0-d3}, [r0]!
W(nop)
vld1.32 {d4-d7}, [r1]!
vst1.32 {d4-d7}, [r0]!
beq 19f
21:
#ifdef CONFIG_THUMB
cmp r2, #16
vld1ge.32 {d0-d1}, [r1]!
vst1ge.32 {d0-d1}, [r0]!
tst r2, #8
vld1ne.32 {d0}, [r1]!
vst1ne.32 {d0}, [r0]!
#else
cmp r2, #16
ldmiage r1!, {r3, r4}
stmiage r0!, {r3, r4}
ldmiage r1!, {r3, r4}
stmiage r0!, {r3, r4}
tst r2, #8
ldmiane r1!, {r3, r4}
stmiane r0!, {r3, r4}
#endif
tst r2, #4
pop {r4}
ldrne r3, [r1], #4
strne r3, [r0 :32], #4
and r2, r2, #3
b 8b
19:
pop {r4}
.if \prefetch_distance > 0 || \early_prefetch == 1
pop {r0}
.else
mov r0, ip
.endif
bx lr
.endm
#if defined(MEMCPY_REPLACEMENT_RPI) || defined(MEMCPY_REPLACEMENT_ARMV7_32) \
|| defined(MEMCPY_REPLACEMENT_ARMV7_64) || defined(MEMCPY_REPLACEMENT_NEON_32) \
|| defined(MEMCPY_REPLACEMENT_NEON_64)
#ifdef MEMCPY_REPLACEMENT_RPI
asm_function memcpy
memcpy_variant 32, 3, 8, 0
.endfunc
#endif
#ifdef MEMCPY_REPLACEMENT_ARMV7_32
asm_function memcpy
memcpy_variant 32, 6, 0, 0
.endfunc
#endif
#ifdef MEMCPY_REPLACEMENT_ARMV7_64
asm_function memcpy
memcpy_variant 64, 3, 0, 0
.endfunc
#endif
#ifdef MEMCPY_REPLACEMENT_NEON_32
asm_function memcpy
neon_memcpy_variant 32, 6, 1
.endfunc
#endif
#ifdef MEMCPY_REPLACEMENT_NEON_64
asm_function memcpy
neon_memcpy_variant 64, 3, 1
.endfunc
#endif
#ifdef MEMCPY_REPLACEMENT_NEON_AUTO
asm_function memcpy
neon_memcpy_variant 32, 0, 1
.endfunc
#endif
#else
asm_function memcpy_new_line_size_64_preload_192
memcpy_variant 64, 3, 0, 0
.endfunc
asm_function memcpy_new_line_size_64_preload_192_align_32
memcpy_variant 64, 3, 32, 0
.endfunc
asm_function memcpy_new_line_size_64_preload_192_aligned_access
memcpy_variant 64, 3, 0, 1
.endfunc
asm_function memcpy_new_line_size_32_preload_192
memcpy_variant 32, 6, 0, 0
.endfunc
asm_function memcpy_new_line_size_32_preload_192_align_32
memcpy_variant 32, 6, 32, 0
.endfunc
asm_function memcpy_new_line_size_32_preload_96
memcpy_variant 32, 3, 8, 0
.endfunc
asm_function memcpy_new_line_size_32_preload_96_aligned_access
memcpy_variant 32, 3, 8, 1
.endfunc
asm_function memcpy_new_neon_line_size_64
neon_memcpy_variant 64, 3, 1
.endfunc
asm_function memcpy_new_neon_line_size_32
neon_memcpy_variant 32, 6, 1
.endfunc
asm_function memcpy_new_neon_line_size_32_auto
neon_memcpy_variant 32, 0, 1
.endfunc
#endif
/*
* Macro for memset replacement.
* write_align must be 0, 8, or 32.
* use_neon must be 0 or 1.
*/
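/*
 * For example, the NEON memset variant instantiated near the end of
 * this file is
 *
 *     memset_variant 32, 1
 *
 * and the plain ARMv7 variant is memset_variant 8, 0.
 */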
.macro memset_variant write_align, use_neon
.if \use_neon == 1
.fpu neon
.endif
ands r3, r0, #3
mov ip, r0
bne 7f
/* Destination is word aligned. */
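/*
 * The two interleaved orr instructions below replicate the fill
 * byte across the whole word (e.g. 0x000000AB -> 0x0000ABAB ->
 * 0xABABABAB), so that full words, and NEON registers when enabled,
 * can be stored.
 */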
1: orr r1, r1, r1, lsl #8
.if \use_neon == 1
cmp r2, #16
.else
cmp r2, #8
.endif
orr r1, r1, r1, lsl #16
.if \use_neon == 1
blt 13f
vmov d0, r1, r1
vmov d1, r1, r1
.else
blt 5f
mov r3, r1
.endif
cmp r2, #64
push {r4}
.if \use_neon == 1
blt 10f
.else
ble 10f
.endif
.if \write_align > 0
ands r4, r0, #(\write_align - 1)
.if \use_neon == 1
#ifndef CONFIG_THUMB
add r3, r4, #7
#endif
.endif
/* Let r4 be equal to the number of bytes to align. */
rsb r4, r4, #\write_align
/*
* At this point r4 contains the number of bytes to align
* if eq is not set. The eq flag is set if there are no bytes
* to align.
*/
.if \write_align == 8
subne r2, r2, r4
strne r1, [r0], #4
.elseif \write_align == 32
beq 2f
tst r4, #4
sub r2, r2, r4
strne r1, [r0], #4
.if \use_neon == 1
#ifdef CONFIG_THUMB
tst r4, #8
vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]!
cmp r4, #16
vst1ge.64 {d0, d1}, [r0 NEON_ALIGN(128)]!
#else
bic r4, r3, #7
lsr r4, r4, #1
add pc, pc, r4
nop
vst1.64 {d0}, [r0 NEON_ALIGN(64)]!
vst1.64 {d0}, [r0 NEON_ALIGN(64)]!
vst1.64 {d0}, [r0 NEON_ALIGN(64)]!
vst1.64 {d0}, [r0 NEON_ALIGN(64)]!
#endif
.else
tst r4, #8
stmiane r0!, {r1, r3}
cmp r4, #16
stmiage r0!, {r1, r3}
stmiage r0!, {r1, r3}
.endif
.endif /* \write_align == 32 */
cmp r2, #64
blt 4f
.endif /* \write_align > 0 */
2:
.if \use_neon == 1
/*
* When NEON is enabled, \write_align is
* equal to 32 so specify 256-bit alignment in the
* NEON store instructions.
*/
subs r2, r2, #64
vmov q1, q0
3: vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
subs r2, r2, #64
vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
bge 3b
adds r2, r2, #64
.else
mov r4, r1
subs r2, r2, #64
push {r5}
mov r5, r1
3: stmia r0!, {r1, r3, r4, r5}
subs r2, r2, #64 /* Thumb16 */
stmia r0!, {r1, r3, r4, r5}
stmia r0!, {r1, r3, r4, r5}
stmia r0!, {r1, r3, r4, r5}
bge 3b
adds r2, r2, #64 /* Thumb16 */
pop {r5}
.endif
/* Early exit if there are 0 bytes left. */
/* THUMB( cbz r2, 9f ) */
THUMB( cmp r2, #0 )
THUMB( beq 9f )
ARM( teq r2, #0 )
ARM( beq 9f )
/*
* Handle 8-64 bytes (or 16-63 bytes in case of NEON).
* In case of NEON, destination must be 8-byte aligned.
*/
4:
.if \use_neon == 1
#ifdef CONFIG_THUMB
vmov q1, q0
cmp r2, #32
vst1ge.64 {d0-d3}, [r0 NEON_ALIGN(64)]!
tst r2, #16
vst1ne.64 {d0, d1}, [r0 NEON_ALIGN(64)]!
tst r2, #8
vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]!
and r2, r2, #7
#else
bic r4, r2, #15
subs r2, r2, r4
rsb r4, r4, #64
/*
* When using NEON, the vst instruction
* (storing 16 bytes) is always 32-bit.
*/
lsr r4, r4, #2
add pc, pc, r4
nop
vst1.64 {d0, d1}, [r0 NEON_ALIGN(64)]!
vst1.64 {d0, d1}, [r0 NEON_ALIGN(64)]!
vst1.64 {d0, d1}, [r0 NEON_ALIGN(64)]!
vst1.64 {d0, d1}, [r0 NEON_ALIGN(64)]!
cmp r2, #8
strge r1, [r0], #4
strge r1, [r0], #4
subge r2, r2, #8
#endif
.else /* use_neon == 0 */
bic r4, r2, #7
subs r2, r2, r4
rsb r4, r4, #64
/*
* The stmia instruction (storing 8 bytes) is 32-bit for ARM,
* 16-bit for Thumb2.
*/
THUMB( lsrs r4, r4, #2 )
ARM( lsr r4, r4, #1 )
add pc, pc, r4
nop
stmia r0!, {r1, r3}
stmia r0!, {r1, r3}
stmia r0!, {r1, r3}
stmia r0!, {r1, r3}
stmia r0!, {r1, r3}
stmia r0!, {r1, r3}
stmia r0!, {r1, r3}
stmia r0!, {r1, r3}
.endif
14: pop {r4}
5: cmp r2, #4
strge r1, [r0], #4
/* Early exit for multiple of 4 size. */
ands r2, r2, #3
moveq r0, ip
bxeq lr
/*
* At this point there are 1, 2 or 3 bytes,
* and the destination is aligned.
*/
6: cmp r2, #2
strhge r1, [r0], #2
strbne r1, [r0]
mov r0, ip
bx lr
.if \use_neon == 1
/* 0-15 bytes left, word aligned. */
13: cmp r2, #8
strge r1, [r0]
strge r1, [r0, #4]
addge r0, r0, #8
subge r2, r2, #8
b 5b
.endif
/* Unaligned case. */
7: cmp r2, #4
blt 8f
#ifdef CONFIG_THUMB
.if \use_neon == 1
/*
* When Thumb2 is enabled with NEON, use the optimized
* unaligned NEON code path for small sizes.
*/
cmp r2, #64
blt 11f
.endif
#endif
/* Align the destination. */
cmp r3, #2
sub r2, r2, #4
strble r1, [r0]
strble r1, [r0, #1]
addle r0, r0, #2
add r2, r2, r3
strbne r1, [r0], #1
b 1b
/* 0 to 3 bytes left. */
8: cmp r2, #2
strbge r1, [r0]
strbge r1, [r0, #1]
addge r0, r0, #2
tst r2, #1
strbne r1, [r0]
mov r0, ip
bx lr
9: pop {r4}
mov r0, ip
bx lr
/*
* Word aligned 8 <= size <= 64
* (16 <= size <= 63 in case of NEON).
*/
10:
/* Align the destination to an 8 byte boundary. */
tst r0, #4
strne r1, [r0], #4
subne r2, r2, #4
.if \use_neon == 1
cmp r2, #16
poplt {r4}
blt 13b
.else
cmp r2, #8
blt 14b
.endif
b 4b
#ifdef CONFIG_THUMB
.if \use_neon == 1
/*
* Handle 4 <= size <= 63 bytes, unaligned.
* Use unaligned NEON instructions with Thumb2.
*/
11:
orr r1, r1, r1, lsl #8
tst r2, #8
orr r1, r1, r1, lsl #16
vmov d0, r1, r1
vst1ne.8 {d0}, [r0]!
vmov d1, r1, r1
tst r2, #16
vst1ne.8 {d0, d1}, [r0]!
vmov q1, q0
cmp r2, #32
and r2, r2, #7
vst1ge.8 {d0-d3}, [r0]!
cmp r2, #4
/* The following store is unaligned. */
strge r1, [r0], #4
subge r2, r2, #4
b 8b
.endif
#endif
.endm
#if defined(MEMSET_REPLACEMENT_RPI) || defined(MEMSET_REPLACEMENT_ARMV7_32) \
|| defined(MEMSET_REPLACEMENT_ARMV7_64) || defined(MEMSET_REPLACEMENT_NEON_32) \
|| defined(MEMSET_REPLACEMENT_NEON_64)
#ifdef MEMSET_REPLACEMENT_RPI
asm_function memset
memset_variant 32, 0
.endfunc
#endif
#if defined(MEMSET_REPLACEMENT_ARMV7_32) || defined(MEMSET_REPLACEMENT_ARMV7_64)
asm_function memset
memset_variant 8, 0
.endfunc
#endif
#if defined(MEMSET_REPLACEMENT_NEON_32) || defined(MEMSET_REPLACEMENT_NEON_64)
asm_function memset
memset_variant 32, 1
.endfunc
#endif
#else
asm_function memset_new_align_0
memset_variant 0, 0
.endfunc
asm_function memset_new_align_8
memset_variant 8, 0
.endfunc
asm_function memset_new_align_32
memset_variant 32, 0
.endfunc
asm_function memset_neon
memset_variant 32, 1
.endfunc
#endif