/*
 * Copyright 2013 Harm Hanemaaijer
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#ifdef CONFIG_THUMB
#define W(instr) instr.w
#define THUMB(instr...) instr
#define ARM(instr...)
#else
#define W(instr) instr
#define THUMB(instr...)
#define ARM(instr...) instr
#endif

/*
 * In practice, because of the way NEON is configured on most systems,
 * specifying alignment hints for NEON instructions does not seem to improve
 * performance, and it may even degrade performance in some cases. However,
 * actually having the address aligned to an element boundary or greater is
 * beneficial.
 */
#define NEON_ALIGN(n)
/* #define NEON_ALIGN(n) :n */

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

.text
.syntax unified
.arch armv7a
.fpu neon

.macro asm_function function_name
.global \function_name
.func \function_name
.type \function_name, function
ARM(    .p2align 5 )
THUMB(  .p2align 2 )
\function_name:
.endm

/*
 * The following memcpy implementation is optimized with a fast path for
 * common, word-aligned cases, and it optionally uses unaligned accesses for
 * small sizes.
 *
 * - line_size is the cache line size used for prefetches. Must be 64 or 32.
 * - prefetch_distance is the number of cache lines to look ahead and must be
 *   >= 2.
 * - write_align is the write alignment enforced before the main loop for
 *   larger sizes (word-aligned case) and must be 0, 8, 16, 32, or 64.
 * - aligned_access must be 0 or 1. When enabled, no unaligned memory accesses
 *   will occur. The two small-size thresholds for unaligned access are not
 *   used in this case.
 */

/* The threshold size for using the fast path for the word-aligned case. */
#define FAST_PATH_THRESHOLD 256
/* The threshold size for using the small size path for the word-aligned case. */
#define SMALL_SIZE_THRESHOLD 15
/*
 * The threshold size for using the small size path for the unaligned case.
 * Unaligned memory accesses will be generated for requests smaller than or
 * equal to this size.
 */
#define UNALIGNED_SMALL_SIZE_THRESHOLD 64
/*
 * The threshold size for using the small size path when both the source and
 * the destination are unaligned. Unaligned memory accesses will be generated
 * for requests smaller than or equal to this size.
 */
#define BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD 32
/*
 * For a code-reduced version, define all four of the above constants to 0,
 * eliminating the fast path and small size special cases. With Thumb2
 * enabled, this resulted in a reduction in code size from 1150 to 824 bytes,
 * at the cost of lower performance for smaller sizes.
 */
// #define FAST_PATH_THRESHOLD 0
// #define SMALL_SIZE_THRESHOLD 0
// #define UNALIGNED_SMALL_SIZE_THRESHOLD 0
// #define BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD 0
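/*
 * For example, with the default thresholds above, a copy of 200 bytes with
 * both pointers word-aligned takes the fast path (200 <= FAST_PATH_THRESHOLD),
 * a word-aligned copy of 12 bytes takes the small size path
 * (12 <= SMALL_SIZE_THRESHOLD), and a word-aligned copy of 500 bytes falls
 * through to the main copying loop.
 */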
/*
 * EARLY_PREFETCHES is used in the fast path implementation.
 * The optimal value for EARLY_PREFETCHES was determined empirically.
 * It is equal to prefetch_distance + 1 for line_size 32, and
 * prefetch_distance - 1 for line_size 64. For example, with line_size 64
 * and prefetch_distance 3 the formula below yields 3 - 4 + 3 = 2.
 */
#define EARLY_PREFETCHES (\prefetch_distance - (\line_size / 32) * 2 + 3)

#if FAST_PATH_THRESHOLD > 0
#define FAST_PATH(instr...) instr
#define NO_FAST_PATH(instr...)
#else
#define FAST_PATH(instr...)
#define NO_FAST_PATH(instr...) instr
#endif

/* Helper macro for the fast-path implementation. */

.macro copy_16_bytes bytes_to_go, line_size, prefetch_distance
#ifdef CONFIG_THUMB
        /*
         * When Thumb2 mode is enabled, the ldmia/stmia instructions
         * will be 16-bit, and the preload instruction will be
         * 32-bit, so we only need one 32-bit wide nop instruction
         * when there's no preload, for a total size of two words.
         */
        .if \bytes_to_go >= (EARLY_PREFETCHES * \line_size) && \
            (\bytes_to_go % \line_size) == 0
        pld [r1, ip]
        ldmia r1!, {r3, r4, r5, r6}
        stmia r0!, {r3, r4, r5, r6}
        .else
        ldmia r1!, {r3, r4, r5, r6}
        W( nop )
        stmia r0!, {r3, r4, r5, r6}
        .endif
#else
        /*
         * When ARM mode is enabled, every instruction is one word,
         * so make sure the entire block is four instructions.
         */
        .if \bytes_to_go >= (EARLY_PREFETCHES * \line_size) && \
            (\bytes_to_go % \line_size) == 0
        pld [r1, ip]
        .else
        nop
        .endif
        ldmia r1!, {r3, r4, r5, r6}
        nop
        stmia r0!, {r3, r4, r5, r6}
#endif
.endm

/* Helper macro implementing unaligned copy. */

.macro unaligned_copy shift, line_size, prefetch_distance, write_align, \
aligned_access
        /*
         * ip is the aligned source base address.
         * r3 is a word of data from the source.
         */
        .if \write_align > 0
        cmp r2, #(32 + \write_align - 4)
        .else
        cmp r2, #32
        .endif
        push {r5}
        blt 55f
        subs r2, r2, #32
        /* Handle write alignment. */
        .if \write_align > 0
        .if \write_align == 8
        tst r0, #4
        mov r4, r3, lsr #\shift
        ldrne r3, [r1], #4
        subne r2, r2, #4
        orrne r4, r4, r3, lsl #(32 - \shift)
        strne r4, [r0], #4
        .else
        ands r5, r0, #(\write_align - 1)
        rsb r5, r5, #\write_align
        beq 59f
        sub r2, r2, r5
58:     movs r4, r3, lsr #\shift
        ldr r3, [r1], #4
        subs r5, r5, #4
        orr r4, r4, r3, lsl #(32 - \shift)
        str r4, [r0], #4
        bgt 58b
59:
        .endif
        .endif
        /*
         * Assume a preload at aligned base + line_size will
         * be useful.
         */
        pld [ip, #\line_size]
        push {r6-r11}
        mov r11, r3
        mov r4, ip
        add r5, r1, #(\prefetch_distance * \line_size)
        subs r2, r2, #(\prefetch_distance * \line_size)
        bic r3, r5, #31
        add r4, r4, #(2 * \line_size)
        blt 54f
        cmp r4, r3
        sub ip, r3, r1
        /*
         * "Catch-up" the early preloads (which have been performed up
         * to aligned source address + line_size) to the preload offset
         * used in the main loop.
         */
        bge 52f
51:     adds r4, r4, #\line_size        /* Thumb16 */
        cmp r4, r3
        pld [r4, #(- \line_size)]
        blt 51b
52:
        /*
         * Note that when L1_CACHE_BYTES is 64, we are prefetching
         * every 32 bytes. Although not optimal, there does not seem
         * to be a big penalty for the extra preload instructions,
         * and it avoids greater code size and complexity.
         */
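        /*
         * Main copying loop for the unaligned-source case. Each iteration
         * loads eight words from the word-aligned source and reassembles
         * each destination word from an adjacent pair, roughly equivalent
         * to the C expression
         *
         *     dst_word = (prev >> shift) | (next << (32 - shift));
         *
         * where shift is the macro argument and prev/next are consecutive
         * source words.
         */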
53:     pld [r1, ip]
54:     ldmia r1!, {r4-r7}
        mov r3, r11, lsr #\shift
        ldmia r1!, {r8-r11}
        orr r3, r3, r4, lsl #(32 - \shift)
        movs r4, r4, lsr #\shift        /* Thumb16 */
        orr r4, r4, r5, lsl #(32 - \shift)
        movs r5, r5, lsr #\shift        /* Thumb16 */
        orr r5, r5, r6, lsl #(32 - \shift)
        movs r6, r6, lsr #\shift        /* Thumb16 */
        orr r6, r6, r7, lsl #(32 - \shift)
        movs r7, r7, lsr #\shift        /* Thumb16 */
        orr r7, r7, r8, lsl #(32 - \shift)
        mov r8, r8, lsr #\shift
        orr r8, r8, r9, lsl #(32 - \shift)
        mov r9, r9, lsr #\shift
        orr r9, r9, r10, lsl #(32 - \shift)
        mov r10, r10, lsr #\shift
        orr r10, r10, r11, lsl #(32 - \shift)
        subs r2, r2, #32
        stmia r0!, {r3-r10}
        bge 53b
        cmn r2, #(\prefetch_distance * \line_size)
        bge 54b
        /* Correct the count. */
        adds r2, r2, #(\prefetch_distance * \line_size + 32)
        mov r3, r11
        pop {r6-r11}
55:     bics r5, r2, #3
        beq 57f
56:     movs r4, r3, lsr #\shift
        ldr r3, [r1], #4
        subs r5, r5, #4
        orr r4, r4, r3, lsl #(32 - \shift)
        str r4, [r0], #4
        bgt 56b
57:     pop {r5}
        pop {r4}
        subs r1, r1, #((32 - \shift) / 8)
        .if \aligned_access == 1
        b 7b
        .else
        b 3b
        .endif
.endm

/* The main memcpy function macro. */

.macro memcpy_variant line_size, prefetch_distance, write_align, \
aligned_access
        .if \aligned_access == 1
        cmp r2, #3
        .else
NO_FAST_PATH( cmp r2, #3 )
        .endif
        orr r3, r0, r1
        .if \aligned_access == 1
        push {r0}
        ble 7f
        .else
NO_FAST_PATH( push {r0} )
NO_FAST_PATH( ble 3f )
        .endif
        bic ip, r1, #(\line_size - 1)
        tst r3, #3
        pld [ip]
        .if \aligned_access == 1
FAST_PATH( bne 30f )
        .else
FAST_PATH( push {r0} )
FAST_PATH( bne 7f )     /* Unaligned source or destination. */
        .endif
FAST_PATH( cmp r2, #FAST_PATH_THRESHOLD )
FAST_PATH( bgt 10f )
NO_FAST_PATH( bne 30f )
#if FAST_PATH_THRESHOLD == 0
        /*
         * When the fast path is disabled, check whether there are
         * enough bytes for alignment, and jump to the main handling
         * code for larger sizes.
         */
        .if \write_align > 0
        cmp r2, #(\write_align - 4)
        bge 10f
        .endif
        push {r4}
        b 18f
#endif

/*
 * Fast path for aligned copies of size <= FAST_PATH_THRESHOLD.
 */
#if FAST_PATH_THRESHOLD > 0
#if SMALL_SIZE_THRESHOLD == 15
        bics r3, r2, #15
        pld [ip, #\line_size]
        /* Jump for small sizes <= 15 bytes. */
        beq 5f
#else
        cmp r2, #SMALL_SIZE_THRESHOLD
        pld [ip, #\line_size]
        /* Jump for small sizes <= SMALL_SIZE_THRESHOLD bytes. */
        ble 5f
        bic r3, r2, #15
#endif
9:
        /*
         * This is the entry-point into the fast path from
         * an unaligned request that has been aligned.
         */
        push {r4, r5, r6}
        /*
         * Use a heuristic to determine whether the preload
         * at aligned_base + 2 * line_size will be useful.
         */
        .if EARLY_PREFETCHES >= 3
        cmp r2, #(2 * \line_size - \line_size / 2)
        .endif
        add r5, ip, #(EARLY_PREFETCHES * \line_size)
        .if EARLY_PREFETCHES >= 3
        blt 1f
        .endif
        .if EARLY_PREFETCHES == 3
        pld [ip, #(2 * \line_size)]
        .endif
        .if EARLY_PREFETCHES == 4
        cmp r2, #(3 * \line_size - \line_size / 2)
        pld [ip, #(2 * \line_size)]
        blt 1f
        pld [ip, #(3 * \line_size)]
        .endif
        .if EARLY_PREFETCHES == 5
        cmp r2, #(3 * \line_size - \line_size / 2)
        pld [ip, #(2 * \line_size)]
        blt 1f
        cmp r2, #(4 * \line_size - \line_size / 2)
        pld [ip, #(3 * \line_size)]
        blt 1f
        pld [ip, #(4 * \line_size)]
        .endif
1:
        /*
         * Set r5 so that the next preload will occur
         * exactly at aligned_base + EARLY_PREFETCHES * line_size.
         * For example, if line_size is 64 and the number of bytes
         * is 240, the next preload will occur after processing
         * 48 bytes, which is derived from the formula
         * r3 & (line_size - 1), where r3 is equal to
         * number_of_bytes & (~15).
         */
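        /*
         * The add pc, pc, r4 below is a computed branch into the unrolled
         * sequence of copy_16_bytes blocks: r4 is proportional to the number
         * of 16-byte blocks to skip. Each block occupies two words in Thumb2
         * mode (hence the THUMB lsrs #1 below) and four words in ARM mode,
         * as laid out by the copy_16_bytes macro.
         */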
        rsb r4, r3, #256
        subs r5, r5, r1
        and ip, r3, #(\line_size - 1)
        subs r2, r2, r3                 /* Thumb16 */
        THUMB( lsrs r4, r4, #1 )        /* Thumb16 */
        sub ip, r5, ip
        add pc, pc, r4
        nop
        /* >= 256 bytes to go. */
        copy_16_bytes 256, \line_size, \prefetch_distance
        /* >= 240 bytes to go. */
        copy_16_bytes 240, \line_size, \prefetch_distance
        /* >= 224 bytes to go. */
        copy_16_bytes 224, \line_size, \prefetch_distance
        /* >= 208 bytes to go. */
        copy_16_bytes 208, \line_size, \prefetch_distance
        /* >= 192 bytes to go. */
        copy_16_bytes 192, \line_size, \prefetch_distance
        /* >= 176 bytes to go. */
        copy_16_bytes 176, \line_size, \prefetch_distance
        /* >= 160 bytes to go. */
        copy_16_bytes 160, \line_size, \prefetch_distance
        /* >= 144 bytes to go. */
        copy_16_bytes 144, \line_size, \prefetch_distance
        /* >= 128 bytes to go. */
        copy_16_bytes 128, \line_size, \prefetch_distance
        /* >= 112 bytes to go. */
        copy_16_bytes 112, \line_size, \prefetch_distance
        /* >= 96 bytes to go. */
        copy_16_bytes 96, \line_size, \prefetch_distance
        /* >= 80 bytes to go. */
        copy_16_bytes 80, \line_size, \prefetch_distance
        /* >= 64 bytes to go. */
        copy_16_bytes 64, \line_size, \prefetch_distance
        /* >= 48 bytes to go. */
        copy_16_bytes 48, \line_size, \prefetch_distance
        /* >= 32 bytes to go. */
        copy_16_bytes 32, \line_size, \prefetch_distance
        /* At this point there are 16 to 31 bytes to go. */
        tst r2, #15
        ldmia r1!, {r3, r4, r5, r6}
        cmpne r2, #8
        /*
         * If r2 == 8, we need to clear the eq flag while
         * making sure carry remains set.
         */
        tsteq r2, #15
        stmia r0!, {r3, r4, r5, r6}
        /*
         * The equal flag is set if there are no bytes left.
         * The carry flag is set if there are >= 8 bytes left.
         */
        pop {r4, r5, r6}
        beq 4f
2:
        /*
         * ARM mode imposes restrictions on the registers used
         * in double-word loads and stores, so we have to use
         * single-word operations.
         */
        .if \aligned_access == 0
ARM(    ldrcs r3, [r1], #4 )
ARM(    ldrcs ip, [r1], #4 )
ARM(    strcs r3, [r0], #4 )
ARM(    strcs ip, [r0], #4 )
THUMB(  ldrdcs r3, ip, [r1], #8 )
THUMB(  strdcs r3, ip, [r0], #8 )
        .else
        ldrcs r3, [r1], #4
        ldrcs ip, [r1], #4
        strcs r3, [r0], #4
        strcs ip, [r0], #4
        .endif
        tst r2, #4
        ldrne ip, [r1], #4
        strne ip, [r0], #4
        tst r2, #3
        popeq {r0}
        bxeq lr
        /*
         * Handle the last up to three bytes. Unaligned access
         * may take place if source or destination is not
         * half-word aligned.
         */
3:      movs r2, r2, lsl #31
        ldrhcs r3, [r1], #2
        strhcs r3, [r0], #2
        ldrbne r3, [r1], #1
        strbne r3, [r0], #1
4:      pop {r0}
        bx lr

5:
        /*
         * Sizes <= SMALL_SIZE_THRESHOLD bytes, both source and
         * destination aligned.
         */
#if SMALL_SIZE_THRESHOLD <= 15
        cmp r2, #8      /* cs if r2 >= 8. */
        b 2b
#else
101:    tst r2, #4
        ldrne r3, [r1], #4
        subne r2, r2, #4
        strne r3, [r0], #4
        cmp r2, #8
        blt 3b
6:      cmp r2, #16
        ldr r3, [r1], #4
        ldr ip, [r1], #4
        str r3, [r0], #4
        sub r2, r2, #8
        str ip, [r0], #4
        bge 6b
        cmp r2, #0
        popeq {r0}
        bxeq lr
        b 3b
#endif
#endif /* FAST_PATH_THRESHOLD > 0 */

        .if \aligned_access == 1
        /*
         * Handle the last up to three bytes avoiding
         * unaligned memory access.
         */
7:      movs r2, r2, lsl #31
        ldrbcs r3, [r1], #1
        ldrbcs ip, [r1], #1
        strbcs r3, [r0], #1
        strbcs ip, [r0], #1
        ldrbne r3, [r1], #1
        strbne r3, [r0], #1
        pop {r0}
        bx lr
        .endif

#if FAST_PATH_THRESHOLD > 0
        .if \aligned_access == 0
7:
        /*
         * Unaligned source or destination. There are separate small
         * size thresholds for the case when both source and destination
         * are unaligned and the other case.
         */
        tst r0, #3
        mov r3, #UNALIGNED_SMALL_SIZE_THRESHOLD
        tstne r1, #3
        movne r3, #BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD
        cmp r2, r3
        bgt 30f
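        /*
         * For example, with the default thresholds a 48-byte request with
         * only one of the two pointers unaligned stays on the small size
         * path below (48 <= UNALIGNED_SMALL_SIZE_THRESHOLD), whereas a
         * 48-byte request with both pointers unaligned branches to the
         * alignment code at label 30 (48 > BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD).
         */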
        /* Small sizes, unaligned case. Use single-word loads/stores. */
#if SMALL_SIZE_THRESHOLD >= 16
        /* Use the identical code path already defined above. */
        b 101b
#else
        tst r2, #4
        ldrne r3, [r1], #4
        subne r2, r2, #4
        strne r3, [r0], #4
        cmp r2, #8
        blt 3b
8:      cmp r2, #16
        ldr r3, [r1], #4
        ldr ip, [r1], #4
        str r3, [r0], #4
        sub r2, r2, #8
        str ip, [r0], #4
        bge 8b
        b 3b
#endif
        .endif
#endif /* FAST_PATH_THRESHOLD > 0 */

10:
        /*
         * This is the start of the handling of larger sizes for
         * aligned copies.
         *
         * Size > FAST_PATH_THRESHOLD (256).
         * ip is the line_size aligned source address for preloads.
         */
        .if \write_align >= 16
        ands r3, r0, #(\write_align - 1)
        push {r4}
        rsb r3, r3, #\write_align
        beq 17f
        push {lr}
        bl 20f
        pop {lr}
17:
        .elseif \write_align == 8
        /*
         * For write alignment of 8, it is quickest to do a simple
         * conditional load/store.
         */
        tst r0, #4
        push {r4}
        ldrne r3, [r1], #4
        subne r2, r2, #4
        strne r3, [r0], #4
        .else
        push {r4}
        .endif
18:
        .if (FAST_PATH_THRESHOLD - (\write_align - 4)) < \line_size
        cmp r2, #\line_size
        blt 15f
        .endif
        subs r2, r2, #\line_size
16:
        /*
         * This is the entry-point when source and destination were
         * initially unaligned but are now aligned because they had
         * the same alignment within a word. The write alignment and
         * size check have already been handled.
         */
        push {r5-r11}
        /*
         * Assume a preload at aligned base + line_size will
         * be useful.
         */
        mov r4, ip
        pld [ip, #\line_size]
        add r5, r1, #(\prefetch_distance * \line_size)
        subs r2, r2, #(\prefetch_distance * \line_size)
        bic r3, r5, #(\line_size - 1)
        add r4, r4, #(2 * \line_size)
        blt 14f
        cmp r4, r3
        sub ip, r3, r1
        /*
         * "Catch-up" the early preloads (which have been performed up
         * to aligned source address + line_size) to the preload offset
         * used in the main loop.
         */
        bge 12f
11:     adds r4, r4, #\line_size        /* Thumb16 */
        cmp r4, r3
        pld [r4, #(- \line_size)]
        blt 11b
12:
        /*
         * The main loop for large sizes. Copy 32 bytes at a time
         * using ldmia/stmia while prefetching a 32-byte aligned
         * address for line size 32, or 64 bytes at a time while
         * prefetching a 64-byte aligned address for line size 64.
         */
13:     pld [r1, ip]
14:
        .if \line_size == 32
        ldmia r1!, {r4-r7}
        subs r2, r2, #32
        ldmia r1!, {r8-r11}
        stmia r0!, {r4-r7}
        stmia r0!, {r8-r11}
        .else
        ldmia r1!, {r4-r11}
        subs r2, r2, #64
        stmia r0!, {r4-r11}
        ldmia r1!, {r4-r11}
        stmia r0!, {r4-r11}
        .endif
        bge 13b
        cmn r2, #(\prefetch_distance * \line_size)
        bge 14b
        /* Correct the count. */
        adds r2, r2, #((\prefetch_distance + 1) * \line_size)
        pop {r5-r11}
15:     ands r3, r2, #60
        .if \write_align <= 8
        /*
         * When the subroutine is not used for write alignment, it
         * will only be called once, so branch without linking.
         */
        bne 20f
19:
        .else
        mov ip, lr
        blne 20f
        mov lr, ip
        .endif
        pop {r4}
#if FAST_PATH_THRESHOLD > 0
        cmp r2, #0
        bne 3b
#else
ARM(    cmp r2, #0 )
ARM(    beq 4f )
THUMB(  cbz r2, 4f )
        /* Handle the last up to three bytes. */
3:      movs r2, r2, lsl #31
        ldrhcs r3, [r1], #2
        strhcs r3, [r0], #2
        ldrbne r3, [r1], #1
        strbne r3, [r0], #1
4:
#endif
        pop {r0}
        bx lr

        /*
         * Subroutine that copies a multiple of 4 bytes; the size in
         * r3 ranges from 0 to 64 or 32 bytes. r2 is decremented by
         * the number of bytes copied.
         */
20:     tst r3, #4
        sub r2, r2, r3
        ldrne r4, [r1], #4
        subne r3, r3, #4
        strne r4, [r0], #4
        .if \write_align <= 32 && \line_size == 32
        rsb r3, r3, #32
        .else
        rsb r3, r3, #64
        .endif
        /*
         * These ldmia/stmia instructions are 16-bit on Thumb2,
         * 32-bit on ARM.
         */
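        /*
         * For example, if the subroutine at label 20 is entered with
         * r3 = 20, the conditional single-word copy above handles 4 bytes
         * (leaving r3 = 16), and the computed branch below then skips all
         * but two of the ldmia/stmia pairs, which copy the remaining
         * 16 bytes.
         */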
THUMB(  lsrs r3, r3, #1 )
        add pc, pc, r3
        nop
        ldmia r1!, {r3, r4}
        stmia r0!, {r3, r4}
        ldmia r1!, {r3, r4}
        stmia r0!, {r3, r4}
        ldmia r1!, {r3, r4}
        stmia r0!, {r3, r4}
        ldmia r1!, {r3, r4}
        stmia r0!, {r3, r4}
        .if \write_align > 32 || \line_size > 32
        ldmia r1!, {r3, r4}
        stmia r0!, {r3, r4}
        ldmia r1!, {r3, r4}
        stmia r0!, {r3, r4}
        ldmia r1!, {r3, r4}
        stmia r0!, {r3, r4}
        ldmia r1!, {r3, r4}
        stmia r0!, {r3, r4}
        .endif
        .if \write_align <= 8
        b 19b
        .else
        mov pc, lr
        .endif

30:
        /*
         * Unaligned case. Align the destination.
         * Number of bytes is > UNALIGNED_SMALL_SIZE_THRESHOLD.
         * Note: This may use unaligned access.
         * ip is the line_size aligned source address for preloads.
         */
        ands r3, r0, #3
        push {r4}
        andeq r3, r1, #3
        beq 40f         /* Destination is aligned but source is not. */
        /* Align the destination. */
        cmp r3, #2
        .if \aligned_access == 1
        ldrble r4, [r1], #1
        ldrble r3, [r1], #1
        suble r2, r2, #2
        strble r4, [r0], #1
        strble r3, [r0], #1
        .else
        ldrhle r4, [r1], #2
        suble r2, r2, #2
        strhle r4, [r0], #2
        .endif
        ldrbne r4, [r1], #1
        subne r2, r2, #1
        strbne r4, [r0], #1
        ands r3, r1, #3
        bne 40f         /* Destination is aligned but source is not. */
#if 0 && FAST_PATH_THRESHOLD > 0
        /*
         * Source and destination are now aligned.
         * Now recreate the situation of a word-aligned memcpy
         * with the current source and destination,
         * which may require an extra preload instruction.
         *
         * This path is currently disabled in favour of the one
         * below it, which does write alignment and jumps into
         * the main loop for larger sizes.
         */
        bic r3, r1, #(\line_size - 1)
        pop {r4}
        cmp r3, ip
THUMB(  pldne [r3] )
THUMB(  cmp r2, #FAST_PATH_THRESHOLD )
THUMB(  mov ip, r3 )
ARM(    beq 31f )
ARM(    pld [r3] )
ARM(    mov ip, r3 )
31:
ARM(    cmp r2, #FAST_PATH_THRESHOLD )
        bgt 10b
        /*
         * Recreate the fast path small size check here,
         * but only if it is necessary.
         */
        .if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) <= SMALL_SIZE_THRESHOLD || \aligned_access == 1
        cmp r2, #SMALL_SIZE_THRESHOLD
        pld [ip, #\line_size]
        /* Jump for small sizes <= SMALL_SIZE_THRESHOLD bytes. */
        ble 5b
        .else
        pld [ip, #\line_size]
        .endif
        bic r3, r2, #15
        b 9b
#else
        /*
         * Source and destination are now aligned. Check carefully
         * whether there are enough bytes to do alignment.
         */
        .if \write_align > 0
        .if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) < (\write_align - 4) \
            || \aligned_access == 1
        cmp r2, #(\write_align - 4)
        blt 31f
        .endif
        .if \write_align == 8
        /*
         * For write alignment of 8, it is quickest to do a simple
         * conditional load/store.
         */
        tst r0, #4
        ldrne r3, [r1], #4
        subne r2, r2, #4
        strne r3, [r0], #4
        .else
        ands r3, r0, #(\write_align - 1)
        rsb r3, r3, #\write_align
        beq 31f
        push {lr}
        bl 20b
        pop {lr}
        .endif
31:
        /*
         * Check whether there are enough bytes to do one iteration
         * of the main loop.
         */
        .if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3 - (\write_align - 4)) < \line_size \
            || \aligned_access == 1
        cmp r2, #\line_size
        blt 15b
        .endif
        subs r2, r2, #\line_size
        .else
        /*
         * No write alignment. Only have to check for enough bytes to
         * do one iteration of the main loop.
         */
        .if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) < \line_size \
            || \aligned_access == 1
        cmp r2, #\line_size
        blt 15b
        .endif
        subs r2, r2, #\line_size
        .endif
        b 16b
#endif

40:
        /* Unaligned case. Size is > SMALL_SIZE_THRESHOLD - 3. */
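        /*
         * r3 holds the source misalignment within a word (1, 2 or 3). The
         * compare and branches below select the unaligned_copy instantiation
         * with the matching shift of 8, 16 or 24 bits.
         */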
        bic r1, r1, #3
        cmp r3, #2
        ldr r3, [r1], #4
        beq 41f
        bgt 42f
        unaligned_copy 8, \line_size, \prefetch_distance, \
                \write_align, \aligned_access
41:     unaligned_copy 16, \line_size, \prefetch_distance, \
                \write_align, \aligned_access
42:     unaligned_copy 24, \line_size, \prefetch_distance, \
                \write_align, \aligned_access
.endm

/*
 * The following is a NEON-based memcpy implementation that may use unaligned
 * access, but NEON instruction addresses are always at least element aligned.
 * It is optimized for both Thumb2 (CONFIG_THUMB) and ARM mode.
 *
 * - line_size is the cache line size used for prefetches. Must be 64 or 32.
 * - prefetch_distance is the number of cache lines to look ahead and must be
 *   >= 2, or 0 to disable prefetching in the main copying loop.
 * - early_prefetch indicates whether to perform early preloads. Must be 0 or 1.
 *   When prefetch_distance > 0, early_prefetch should be 1. To remove all PLD
 *   instructions altogether, set both prefetch_distance and early_prefetch
 *   to 0.
 */

.macro neon_memcpy_variant line_size, prefetch_distance, early_prefetch
        cmp r2, #3
        .if \prefetch_distance > 0 || \early_prefetch == 1
        push {r0}
        .else
        mov ip, r0
        .endif
        orr r3, r0, r1
        ble 8f
        .if \prefetch_distance > 0 || \early_prefetch == 1
        bic ip, r1, #(\line_size - 1)
        .endif
        tst r3, #3
        .if \early_prefetch == 1
        pld [ip]
        .endif
        bne 10f         /* Unaligned source or destination. */
        push {r4}
        /* Aligned source and destination. */
1:      cmp r2, #256
        /*
         * Jump to the word-aligned NEON fast path for <= 256 bytes.
         */
        ble 18f
        subs r2, r2, #\line_size
        /* Align to a 32-byte boundary. */
#ifdef CONFIG_THUMB
        /*
         * Use conditional NEON instructions when
         * available (Thumb2 mode).
         */
        ands r4, r0, #31
        rsb r4, r4, #32
        beq 31f
        tst r4, #4
        sub r2, r2, r4
        ldrne r3, [r1 :32], #4
        strne r3, [r0 :32], #4
        tst r4, #8
        vld1ne.32 {d0}, [r1]!
        vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]!
        cmp r4, #16
        vld1ge.32 {d2, d3}, [r1]!
        vst1ge.64 {d2, d3}, [r0 NEON_ALIGN(128)]!
#else
        /*
         * Otherwise, branch into a series of single
         * loads/stores.
         */
        ands r4, r0, #31
        beq 31f
        rsb r3, r4, #32
        lsl r4, r4, #1
        sub r2, r2, r3
        add pc, pc, r4
        nop
        ldr r3, [r1], #4
        str r3, [r0], #4
        ldr r4, [r1], #4
        str r4, [r0], #4
        ldr r3, [r1], #4
        str r3, [r0], #4
        ldr r4, [r1], #4
        str r4, [r0], #4
        ldr r3, [r1], #4
        str r3, [r0], #4
        ldr r4, [r1], #4
        str r4, [r0], #4
        ldr r3, [r1], #4
        str r3, [r0], #4
        ldr r4, [r1], #4
        str r4, [r0], #4
#endif
        cmp r2, #0
        addlt r2, r2, #\line_size
        blt 6f
31:
        .if \early_prefetch == 1
        pld [ip, #\line_size]
        .endif
        .if \prefetch_distance > 0
        /*
         * Assume a preload at aligned base + line_size will
         * be useful.
         */
        push {r5}
        mov r4, ip
        add r5, r1, #(\prefetch_distance * \line_size)
        subs r2, r2, #(\prefetch_distance * \line_size)
        bic r3, r5, #(\line_size - 1)
        add r4, r4, #(2 * \line_size)
        blt 5f
        cmp r4, r3
        sub ip, r3, r1
        /*
         * "Catch-up" the early preloads (which have been performed up
         * to aligned source address + line_size) to the preload offset
         * used in the main loop.
         */
        bge 3f
2:      adds r4, r4, #\line_size        /* Thumb16 */
        cmp r4, r3
        pld [r4, #(- \line_size)]
        blt 2b
3:
        .endif
        sub ip, ip, #\line_size
4:
        /*
         * Since the destination is 32-byte aligned,
         * specify 256-bit alignment for the NEON stores.
         */
        .if \line_size == 32
        vld1.32 {d0-d3}, [r1]!
        subs r2, r2, #32
        .if \prefetch_distance > 0
        pld [r1, ip]
        .endif
        vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
        .else /* line_size == 64 */
        vld1.32 {d0-d3}, [r1]!
        vld1.32 {d4-d7}, [r1]!
        .if \prefetch_distance > 0
        pld [r1, ip]
        .endif
        vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
        subs r2, r2, #64
        vst1.64 {d4-d7}, [r0 NEON_ALIGN(256)]!
        .endif
        bge 4b
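        /*
         * The loop below mirrors the main loop above but omits the preload.
         * It drains the tail corresponding to the last prefetch_distance
         * cache lines (r2 was biased by -prefetch_distance * line_size
         * above), where issuing further preloads would not be useful.
         */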
        .if \prefetch_distance > 0
5:
        .if \line_size == 32
        vld1.32 {d0-d3}, [r1]!
        subs r2, r2, #32
        vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
        .else /* line_size == 64 */
        vld1.32 {d0-d3}, [r1]!
        vld1.32 {d4-d7}, [r1]!
        vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
        subs r2, r2, #64
        vst1.64 {d4-d7}, [r0 NEON_ALIGN(256)]!
        .endif
        cmn r2, #(\prefetch_distance * \line_size)
        bge 5b
        .endif
        /* Correct the count. */
23:     adds r2, r2, #((\prefetch_distance + 1) * \line_size)
        .if \prefetch_distance > 0
        pop {r5}
        .endif
        /*
         * Process the last 0-(line_size - 1) bytes, destination
         * 32-byte aligned, source word aligned.
         */
6:
#ifdef CONFIG_THUMB
        /*
         * Use conditional NEON instructions when
         * available (Thumb2 mode).
         */
        .if \line_size == 64
        cmp r2, #32
        vld1ge.32 {d0-d3}, [r1]!
        vst1ge.64 {d0-d3}, [r0 NEON_ALIGN(128)]!
        tst r2, #16
        vld1ne.32 {d0, d1}, [r1]!
        vst1ne.64 {d0, d1}, [r0 NEON_ALIGN(128)]!
        .else
        cmp r2, #16
        vld1ge.32 {d0, d1}, [r1]!
        vst1ge.64 {d0, d1}, [r0 NEON_ALIGN(128)]!
        .endif
        tst r2, #8
        vld1ne.32 {d2}, [r1]!
        vst1ne.64 {d2}, [r0 NEON_ALIGN(64)]!
        tst r2, #4
        ldrne r3, [r1], #4
        strne r3, [r0 :32], #4
        pop {r4}
#else
        /*
         * Just use the word-aligned tail code if we
         * don't have Thumb2.
         */
        b 17f
#endif
        /*
         * Handle the last up to three bytes. Unaligned access
         * may take place if source or destination is not
         * half-word aligned.
         */
8:      movs r2, r2, lsl #31
        ldrhcs r3, [r1], #2
        strhcs r3, [r0], #2
        ldrbne r3, [r1], #1
        strbne r3, [r0]
9:
        .if \prefetch_distance > 0 || \early_prefetch == 1
        pop {r0}
        .else
        mov r0, ip
        .endif
        bx lr

10:
        /*
         * Unaligned case. Align the destination.
         * Number of bytes is > 3.
         * Note: This may use unaligned access.
         * ip is the line_size aligned source address for preloads.
         */
        cmp r2, #64
        push {r4}
        /* For small sizes < 64 bytes just use the unaligned tail code. */
        blt 16f
        ands r3, r0, #3
        beq 11f         /* Destination is aligned but source is not. */
        /* Align the destination. */
        cmp r3, #2
        ldrbne r4, [r1], #1
        subne r2, r2, #1
        strbne r4, [r0], #1
        ldrhle r4, [r1], #2
        suble r2, r2, #2
        strhle r4, [r0], #2
        tst r1, #3
        beq 1b          /* Destination and source are now aligned. */
        /* Destination is now aligned to a word boundary. */
11:     cmp r2, #64
        /*
         * Jump to the non-aligned NEON tail code for <= 64 bytes.
         */
        ble 16f
        subs r2, r2, #\line_size
        /* Align destination to a 32-byte boundary. */
        ands r4, r0, #31
        rsb r4, r4, #32
        beq 20f
        tst r4, #4
        sub r2, r2, r4
        ldrne r3, [r1 :8], #4   /* Unaligned access. */
        strne r3, [r0 :32], #4
        tst r4, #8
#ifdef CONFIG_THUMB
        /*
         * Use conditional NEON instructions when
         * available (Thumb2 mode).
         */
        vld1ne.8 {d0}, [r1]!
        vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]!
        cmp r4, #16
        vld1ge.8 {d2, d3}, [r1]!
        vst1ge.64 {d2, d3}, [r0 NEON_ALIGN(128)]!
#else
        beq 31f
        vld1.8 {d0}, [r1]!
        vst1.64 {d0}, [r0 NEON_ALIGN(64)]!
31:     cmp r4, #16
        blt 32f
        vld1.8 {d2, d3}, [r1]!
        vst1.64 {d2, d3}, [r0 NEON_ALIGN(128)]!
32:
#endif
        cmp r2, #0
        addlt r2, r2, #\line_size
        blt 16f
20:
        .if \early_prefetch == 1
        pld [ip, #\line_size]
        .endif
        .if \prefetch_distance > 0
        /*
         * Assume a preload at aligned base + line_size will
         * be useful.
         */
        push {r5}
        mov r4, ip
        add r5, r1, #(\prefetch_distance * \line_size)
        subs r2, r2, #(\prefetch_distance * \line_size)
        bic r3, r5, #(\line_size - 1)
        add r4, r4, #(2 * \line_size)
        blt 15f
        cmp r4, r3
        sub ip, r3, r1
        /*
         * "Catch-up" the early preloads (which have been performed up
         * to aligned source address + line_size) to the preload offset
         * used in the main loop.
         */
        bge 13f
12:     adds r4, r4, #\line_size        /* Thumb16 */
        cmp r4, r3
        pld [r4, #(- \line_size)]
        blt 12b
        .endif
13:
        /*
         * Process 64 unaligned bytes from source at a time and copy
         * them to the 32-byte aligned destination.
         */
14:
        .if \prefetch_distance > 0
        pld [r1, ip]
        .endif
15:
        .if \line_size == 32
        vld1.8 {d0-d3}, [r1]!
        subs r2, r2, #32
        vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
        .else /* line_size == 64 */
        vld1.8 {d0-d3}, [r1]!
        vld1.8 {d4-d7}, [r1]!
        vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
        subs r2, r2, #64
        vst1.64 {d4-d7}, [r0 NEON_ALIGN(256)]!
        .endif
        bge 14b
        .if \prefetch_distance > 0
        cmn r2, #(\prefetch_distance * \line_size)
        bge 15b
        .endif
        /* Correct the count. */
        adds r2, r2, #((\prefetch_distance + 1) * \line_size)
        .if \prefetch_distance > 0
        pop {r5}
        .endif
        /*
         * Handle the last 0-(line_size - 1) bytes (destination 32-byte
         * aligned, source unaligned).
         */
#ifdef CONFIG_THUMB
        /*
         * Use conditional NEON instructions when
         * available (Thumb2 mode).
         */
        .if \line_size == 64
        cmp r2, #32
        vld1ge.8 {d0-d3}, [r1]!
        vst1ge.64 {d0-d3}, [r0 NEON_ALIGN(128)]!
        tst r2, #16
        vld1ne.8 {d0, d1}, [r1]!
        vst1ne.64 {d0, d1}, [r0 NEON_ALIGN(128)]!
        .else
        cmp r2, #16
        vld1ge.8 {d0, d1}, [r1]!
        vst1ge.64 {d0, d1}, [r0 NEON_ALIGN(128)]!
        .endif
        tst r2, #8
        vld1ne.8 {d2}, [r1]!
        vst1ne.64 {d2}, [r0 NEON_ALIGN(64)]!
        tst r2, #4
        ldrne r3, [r1], #4
        strne r3, [r0 :32], #4
        pop {r4}
        b 8b
#else
        /*
         * Fall through to the code below. It is not entirely
         * optimal because it does not indicate that the destination
         * is word aligned.
         */
#endif
        /* Handle small size of 0-63 bytes, unaligned. */
16:     bic r3, r2, #7
        rsb r4, r3, #64
        tst r2, #7
        add pc, pc, r4
        nop
        vld1.8 {d0}, [r1]!
        vst1.8 {d0}, [r0]!
        vld1.8 {d1}, [r1]!
        vst1.8 {d1}, [r0]!
        vld1.8 {d0}, [r1]!
        vst1.8 {d0}, [r0]!
        vld1.8 {d1}, [r1]!
        vst1.8 {d1}, [r0]!
        vld1.8 {d0}, [r1]!
        vst1.8 {d0}, [r0]!
        vld1.8 {d1}, [r1]!
        vst1.8 {d1}, [r0]!
        vld1.8 {d0}, [r1]!
        vst1.8 {d0}, [r0]!
        vld1.8 {d1}, [r1]!
        vst1.8 {d1}, [r0]!
        pop {r4}
        beq 9b
        tst r2, #4
        ldrne r3, [r1 :8], #4   /* Unaligned access. */
        strne r3, [r0], #4
        b 8b
        /* Handle small size of 0-63 bytes, word aligned. */
17:
#ifdef CONFIG_THUMB
        cmp r2, #32
        vld1ge.32 {d0-d3}, [r1]!
        vst1ge.32 {d0-d3}, [r0]!
        tst r2, #16
        vld1ne.32 {d0, d1}, [r1]!
        vst1ne.32 {d0, d1}, [r0]!
        tst r2, #8
        vld1ne.32 {d2}, [r1]!
        vst1ne.32 {d2}, [r0]!
        tst r2, #7
#else
        bic r3, r2, #7
        rsb r4, r3, #64
        tst r2, #7
        add pc, pc, r4
        nop
        vld1.32 {d0}, [r1]!
        vst1.32 {d0}, [r0]!
        vld1.32 {d1}, [r1]!
        vst1.32 {d1}, [r0]!
        vld1.32 {d0}, [r1]!
        vst1.32 {d0}, [r0]!
        vld1.32 {d1}, [r1]!
        vst1.32 {d1}, [r0]!
        vld1.32 {d0}, [r1]!
        vst1.32 {d0}, [r0]!
        vld1.32 {d1}, [r1]!
        vst1.32 {d1}, [r0]!
        vld1.32 {d0}, [r1]!
        vst1.32 {d0}, [r0]!
        vld1.32 {d1}, [r1]!
        vst1.32 {d1}, [r0]!
#endif
        pop {r4}
        beq 9b
        tst r2, #4
        ldrne r3, [r1], #4
        strne r3, [r0], #4
        b 8b

        /*
         * Fast path for <= 256 bytes, word aligned.
         * This is hardcoded for a preload offset of 128 bytes,
         * which seems to work well in practice for small sizes.
         */
18:     bics r3, r2, #31
        .if \early_prefetch == 1
        pld [ip, #32]
        beq 21f
        pld [ip, #64]
        pld [ip, #96]
        .endif
        rsb r4, r3, #256
        ands r2, r2, #31
        /*
         * Each code block handling 32 bytes is 12 bytes long,
         * so the branch offset below is computed as 3/8 of the
         * number of bytes to skip (r4 / 4 + r4 / 8).
         */
        lsr r4, r4, #2
        add ip, ip, #128
        add r4, r4, r4, lsr #1
        sub ip, ip, r1
        add pc, pc, r4
        nop
        pld [r1, ip]
        vld1.32 {d0-d3}, [r1]!
        vst1.32 {d0-d3}, [r0]!
        pld [r1, ip]
        vld1.32 {d4-d7}, [r1]!
        vst1.32 {d4-d7}, [r0]!
        pld [r1, ip]
        vld1.32 {d0-d3}, [r1]!
        vst1.32 {d0-d3}, [r0]!
        pld [r1, ip]
        vld1.32 {d4-d7}, [r1]!
        vst1.32 {d4-d7}, [r0]!
        pld [r1, ip]
        vld1.32 {d0-d3}, [r1]!
        vst1.32 {d0-d3}, [r0]!
        W(nop)
        vld1.32 {d4-d7}, [r1]!
        vst1.32 {d4-d7}, [r0]!
        W(nop)
        vld1.32 {d0-d3}, [r1]!
        vst1.32 {d0-d3}, [r0]!
        W(nop)
        vld1.32 {d4-d7}, [r1]!
        vst1.32 {d4-d7}, [r0]!
        beq 19f
21:
#ifdef CONFIG_THUMB
        cmp r2, #16
        vld1ge.32 {d0-d1}, [r1]!
        vst1ge.32 {d0-d1}, [r0]!
        tst r2, #8
        vld1ne.32 {d0}, [r1]!
        vst1ne.32 {d0}, [r0]!
#else
        cmp r2, #16
        ldmiage r1!, {r3, r4}
        stmiage r0!, {r3, r4}
        ldmiage r1!, {r3, r4}
        stmiage r0!, {r3, r4}
        tst r2, #8
        ldmiane r1!, {r3, r4}
        stmiane r0!, {r3, r4}
#endif
        tst r2, #4
        pop {r4}
        ldrne r3, [r1], #4
        strne r3, [r0 :32], #4
        and r2, r2, #3
        b 8b
19:     pop {r4}
        .if \prefetch_distance > 0 || \early_prefetch == 1
        pop {r0}
        .else
        mov r0, ip
        .endif
        bx lr
.endm

#if defined(MEMCPY_REPLACEMENT_RPI) || defined(MEMCPY_REPLACEMENT_ARMV7_32) \
    || defined(MEMCPY_REPLACEMENT_ARMV7_64) || defined(MEMCPY_REPLACEMENT_NEON_32) \
    || defined(MEMCPY_REPLACEMENT_NEON_64)

#ifdef MEMCPY_REPLACEMENT_RPI
asm_function memcpy
        memcpy_variant 32, 3, 8, 0
.endfunc
#endif

#ifdef MEMCPY_REPLACEMENT_ARMV7_32
asm_function memcpy
        memcpy_variant 32, 6, 0, 0
.endfunc
#endif

#ifdef MEMCPY_REPLACEMENT_ARMV7_64
asm_function memcpy
        memcpy_variant 64, 3, 0, 0
.endfunc
#endif

#ifdef MEMCPY_REPLACEMENT_NEON_32
asm_function memcpy
        neon_memcpy_variant 32, 6, 1
.endfunc
#endif

#ifdef MEMCPY_REPLACEMENT_NEON_64
asm_function memcpy
        neon_memcpy_variant 64, 3, 1
.endfunc
#endif

#ifdef MEMCPY_REPLACEMENT_NEON_AUTO
asm_function memcpy
        neon_memcpy_variant 32, 0, 1
.endfunc
#endif

#else

asm_function memcpy_new_line_size_64_preload_192
        memcpy_variant 64, 3, 0, 0
.endfunc

asm_function memcpy_new_line_size_64_preload_192_align_32
        memcpy_variant 64, 3, 32, 0
.endfunc

asm_function memcpy_new_line_size_64_preload_192_aligned_access
        memcpy_variant 64, 3, 0, 1
.endfunc

asm_function memcpy_new_line_size_32_preload_192
        memcpy_variant 32, 6, 0, 0
.endfunc

asm_function memcpy_new_line_size_32_preload_192_align_32
        memcpy_variant 32, 6, 32, 0
.endfunc

asm_function memcpy_new_line_size_32_preload_96
        memcpy_variant 32, 3, 8, 0
.endfunc

asm_function memcpy_new_line_size_32_preload_96_aligned_access
        memcpy_variant 32, 3, 8, 1
.endfunc

asm_function memcpy_new_neon_line_size_64
        neon_memcpy_variant 64, 3, 1
.endfunc

asm_function memcpy_new_neon_line_size_32
        neon_memcpy_variant 32, 6, 1
.endfunc

asm_function memcpy_new_neon_line_size_32_auto
        neon_memcpy_variant 32, 0, 1
.endfunc

#endif

/*
 * Macro for memset replacement.
 * write_align must be 0, 8, or 32.
 * use_neon must be 0 or 1.
 */

.macro memset_variant write_align, use_neon
        .if \use_neon == 1
        .fpu neon
        .endif
        ands r3, r0, #3
        mov ip, r0
        bne 7f
        /* Destination is word aligned. */
1:      orr r1, r1, r1, lsl #8
        .if \use_neon == 1
        cmp r2, #16
        .else
        cmp r2, #8
        .endif
        orr r1, r1, r1, lsl #16
        .if \use_neon == 1
        blt 13f
        vmov d0, r1, r1
        vmov d1, r1, r1
        .else
        blt 5f
        mov r3, r1
        .endif
        cmp r2, #64
        push {r4}
        .if \use_neon == 1
        blt 10f
        .else
        ble 10f
        .endif
        .if \write_align > 0
        ands r4, r0, #(\write_align - 1)
        .if \use_neon == 1
#ifndef CONFIG_THUMB
        add r3, r4, #7
#endif
        .endif
        /* Let r4 be equal to the number of bytes to align. */
        rsb r4, r4, #\write_align
        /*
         * At this point r4 contains the number of bytes to align
         * if eq is not set. The eq flag is set if there are no bytes
         * to align.
         */
        .if \write_align == 8
        subne r2, r2, r4
        strne r1, [r0], #4
        .elseif \write_align == 32
        beq 2f
        tst r4, #4
        sub r2, r2, r4
        strne r1, [r0], #4
        .if \use_neon == 1
#ifdef CONFIG_THUMB
        tst r4, #8
        vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]!
        cmp r4, #16
        vst1ge.64 {d0, d1}, [r0 NEON_ALIGN(128)]!
#else
        bic r4, r3, #7
        lsr r4, r4, #1
        add pc, pc, r4
        nop
        vst1.64 {d0}, [r0 NEON_ALIGN(64)]!
        vst1.64 {d0}, [r0 NEON_ALIGN(64)]!
        vst1.64 {d0}, [r0 NEON_ALIGN(64)]!
        vst1.64 {d0}, [r0 NEON_ALIGN(64)]!
#endif
        .else
        tst r4, #8
        stmiane r0!, {r1, r3}
        cmp r4, #16
        stmiage r0!, {r1, r3}
        stmiage r0!, {r1, r3}
        .endif
        .endif /* \write_align == 32 */
        cmp r2, #64
        blt 4f
        .endif /* \write_align > 0 */
2:
        .if \use_neon == 1
        /*
         * When NEON is enabled, \write_align is
         * equal to 32, so specify 256-bit alignment in the
         * NEON store instructions.
         */
        subs r2, r2, #64
        vmov q1, q0
3:      vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
        subs r2, r2, #64
        vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]!
        bge 3b
        adds r2, r2, #64
        .else
        mov r4, r1
        subs r2, r2, #64
        push {r5}
        mov r5, r1
3:      stmia r0!, {r1, r3, r4, r5}
        subs r2, r2, #64                /* Thumb16 */
        stmia r0!, {r1, r3, r4, r5}
        stmia r0!, {r1, r3, r4, r5}
        stmia r0!, {r1, r3, r4, r5}
        bge 3b
        adds r2, r2, #64                /* Thumb16 */
        pop {r5}
        .endif
        /* Early exit if there are 0 bytes left. */
        /* THUMB( cbz r2, 9f ) */
THUMB(  cmp r2, #0 )
THUMB(  beq 9f )
ARM(    teq r2, #0 )
ARM(    beq 9f )
        /*
         * Handle 8-64 bytes (or 16-63 bytes in case of NEON).
         * In case of NEON, the destination must be 8-byte aligned.
         */
4:
        .if \use_neon == 1
#ifdef CONFIG_THUMB
        vmov q1, q0
        cmp r2, #32
        vst1ge.64 {d0-d3}, [r0 NEON_ALIGN(64)]!
        tst r2, #16
        vst1ne.64 {d0, d1}, [r0 NEON_ALIGN(64)]!
        tst r2, #8
        vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]!
        and r2, r2, #7
#else
        bic r4, r2, #15
        subs r2, r2, r4
        rsb r4, r4, #64
        /*
         * When using NEON, the vst instruction
         * (storing 16 bytes) is always 32-bit.
         */
        lsr r4, r4, #2
        add pc, pc, r4
        nop
        vst1.64 {d0, d1}, [r0 NEON_ALIGN(64)]!
        vst1.64 {d0, d1}, [r0 NEON_ALIGN(64)]!
        vst1.64 {d0, d1}, [r0 NEON_ALIGN(64)]!
        vst1.64 {d0, d1}, [r0 NEON_ALIGN(64)]!
        cmp r2, #8
        strge r1, [r0], #4
        strge r1, [r0], #4
        subge r2, r2, #8
#endif
        .else /* use_neon == 0 */
        bic r4, r2, #7
        subs r2, r2, r4
        rsb r4, r4, #64
        /*
         * The stmia instruction (storing 8 bytes) is 32-bit for ARM,
         * 16-bit for Thumb2.
         */
THUMB(  lsrs r4, r4, #2 )
ARM(    lsr r4, r4, #1 )
        add pc, pc, r4
        nop
        stmia r0!, {r1, r3}
        stmia r0!, {r1, r3}
        stmia r0!, {r1, r3}
        stmia r0!, {r1, r3}
        stmia r0!, {r1, r3}
        stmia r0!, {r1, r3}
        stmia r0!, {r1, r3}
        stmia r0!, {r1, r3}
        .endif
14:     pop {r4}
5:      cmp r2, #4
        strge r1, [r0], #4
        /* Early exit if the size is a multiple of 4. */
        ands r2, r2, #3
        moveq r0, ip
        bxeq lr
        /*
         * At this point there are 1, 2 or 3 bytes,
         * and the destination is aligned.
         */
6:      cmp r2, #2
        strhge r1, [r0], #2
        strbne r1, [r0]
        mov r0, ip
        bx lr
        .if \use_neon == 1
        /* 0-15 bytes left, word aligned. */
13:     cmp r2, #8
        strge r1, [r0]
        strge r1, [r0, #4]
        addge r0, r0, #8
        subge r2, r2, #8
        b 5b
        .endif
        /* Unaligned case. */
7:      cmp r2, #4
        blt 8f
#ifdef CONFIG_THUMB
        .if \use_neon == 1
        /*
         * When Thumb2 is enabled with NEON, use the optimized
         * unaligned NEON code path for small sizes.
         */
        cmp r2, #64
        blt 11f
        .endif
#endif
        /* Align the destination. */
        cmp r3, #2
        sub r2, r2, #4
        strble r1, [r0]
        strble r1, [r0, #1]
        addle r0, r0, #2
        add r2, r2, r3
        strbne r1, [r0], #1
        b 1b
        /* 0 to 3 bytes left. */
8:      cmp r2, #2
        strbge r1, [r0]
        strbge r1, [r0, #1]
        addge r0, r0, #2
        tst r2, #1
        strbne r1, [r0]
        mov r0, ip
        bx lr
9:      pop {r4}
        mov r0, ip
        bx lr
        /*
         * Word aligned, 8 <= size <= 64
         * (16 <= size <= 63 in case of NEON).
         */
10:
        /* Align the destination to an 8-byte boundary. */
        tst r0, #4
        strne r1, [r0], #4
        subne r2, r2, #4
        .if \use_neon == 1
        cmp r2, #16
        poplt {r4}
        blt 13b
        .else
        cmp r2, #8
        blt 14b
        .endif
        b 4b
#ifdef CONFIG_THUMB
        .if \use_neon == 1
        /*
         * Handle 4 <= size <= 63 bytes, unaligned.
         * Use unaligned NEON instructions with Thumb2.
         */
11:     orr r1, r1, r1, lsl #8
        tst r2, #8
        orr r1, r1, r1, lsl #16
        vmov d0, r1, r1
        vst1ne.8 {d0}, [r0]!
        vmov d1, r1, r1
        tst r2, #16
        vst1ne.8 {d0, d1}, [r0]!
        vmov q1, q0
        cmp r2, #32
        and r2, r2, #7
        vst1ge.8 {d0-d3}, [r0]!
        cmp r2, #4
        /* The following store is unaligned. */
        strge r1, [r0], #4
        subge r2, r2, #4
        b 8b
        .endif
#endif
.endm

#if defined(MEMSET_REPLACEMENT_RPI) || defined(MEMSET_REPLACEMENT_ARMV7_32) \
    || defined(MEMSET_REPLACEMENT_ARMV7_64) || defined(MEMSET_REPLACEMENT_NEON_32) \
    || defined(MEMSET_REPLACEMENT_NEON_64)

#ifdef MEMSET_REPLACEMENT_RPI
asm_function memset
        memset_variant 32, 0
.endfunc
#endif

#if defined(MEMSET_REPLACEMENT_ARMV7_32) || defined(MEMSET_REPLACEMENT_ARMV7_64)
asm_function memset
        memset_variant 8, 0
.endfunc
#endif

#if defined(MEMSET_REPLACEMENT_NEON_32) || defined(MEMSET_REPLACEMENT_NEON_64)
asm_function memset
        memset_variant 32, 1
.endfunc
#endif

#else

asm_function memset_new_align_0
        memset_variant 0, 0
.endfunc

asm_function memset_new_align_8
        memset_variant 8, 0
.endfunc

asm_function memset_new_align_32
        memset_variant 32, 0
.endfunc

asm_function memset_neon
        memset_variant 32, 1
.endfunc

#endif
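/*
 * Illustrative usage sketch (assumed C declarations, not part of the
 * original file): when none of the replacement macros are defined, the
 * variants above are exported under their own names and follow the standard
 * memcpy/memset calling convention, e.g.:
 *
 *     extern void *memcpy_new_neon_line_size_32(void *dest, const void *src,
 *                                               size_t n);
 *     extern void *memset_new_align_32(void *s, int c, size_t n);
 */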