/* libc/arch-arm/denver/bionic/memcpy_base.S - nest-cam/4320010/libc - Git at Google */

 /*
  * Copyright (C) 2008 The Android Open Source Project
  * All rights reserved.
  * Copyright (c) 2013-2014, NVIDIA Corporation.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *  * Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  *  * Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the
  *    distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */

 /* Cache line size in bytes; the unit used for the pld offsets below. */
 #define CACHE_LINE_SIZE         (64)
 /* Byte offset ahead of r1 that the 128-byte copy loops preload (6 lines). */
 #define PREFETCH_DISTANCE       (CACHE_LINE_SIZE*6)

 /*
  * MEMCPY_BASE: copy r2 bytes from [r1] to [r0], walking both pointers
  * upwards, using integer and NEON loads/stores.
  *
  * Register usage as seen in this code:
  *   r0      destination pointer, post-incremented as data is stored
  *   r1      source pointer, post-incremented as data is loaded
  *   r2      number of bytes still to copy
  *   r3      number of bytes needed to bring r0 to a 64-byte boundary
  *   ip      scratch for 1/2/4-byte copies and for flag-setting shifts
  *   q0, q1  NEON scratch for the 8/16/32-byte transfers
  *
  * Every exit is `pop {r0, pc}` and the CFI directives describe r0 and lr
  * as already saved on the stack, so both must have been pushed before
  * control reaches this code.
  * NOTE(review): presumably the file that includes this one performs that
  * push - confirm against the caller.
  * NOTE(review): ENTRY_PRIVATE, END and MEMCPY_BASE are not defined in the
  * visible source.
  * NOTE(review): a second ENTRY_PRIVATE(MEMCPY_BASE) block with the same
  * labels appears later in this file - confirm whether both are intended.
  */
 ENTRY_PRIVATE(MEMCPY_BASE)
         /*
          * Unwind info: the CFA is 8 bytes above the CFA register, with r0
          * saved at offset 0 and lr at offset 4 from that register.
          */
         .cfi_def_cfa_offset 8
         .cfi_rel_offset r0, 0
         .cfi_rel_offset lr, 4

         /* nothing to copy when the length is zero or dest == src */
         cmp         r2, #0
         beq         .L_memcpy_done
         cmp         r0, r1
         beq         .L_memcpy_done

         /* preload next cache line */
         pld         [r1, #CACHE_LINE_SIZE*1]

         /* Deal with very small blocks (< 32bytes) asap */
         cmp         r2, #32
         blo         .L_memcpy_lt_32bytes
         /* no need to align if len < 128 bytes */
         cmp         r2, #128
         blo         .L_memcpy_lt_128bytes

         /* large copy, align dest to 64 byte boundary */
         /*
          * r3 = (-r0) & 0x3F, the byte count up to the next 64-byte boundary.
          * The pld between ands and beq does not change the flags, so beq
          * tests the ands result: zero means r0 is already aligned.
          */
         pld         [r1, #CACHE_LINE_SIZE*2]
         rsb         r3, r0, #0
         ands        r3, r3, #0x3F
         pld         [r1, #CACHE_LINE_SIZE*3]
         beq         .L_memcpy_dispatch
         /* the r3 alignment bytes are copied below; take them off the count */
         sub         r2, r2, r3
         /* copy 1 byte */
         /*
          * lsl #31 leaves bit 0 of r3 in N and bit 1 in C.  The conditional
          * loads into ip do not alter the flags, so the `itt cs` block below
          * still acts on the C flag produced by this shift.
          */
         movs        ip, r3, lsl #31
         itt         mi
         ldrbmi      ip, [r1], #1
         strbmi      ip, [r0], #1
         /* copy 2 bytes */
         itt         cs
         ldrhcs      ip, [r1], #2
         strhcs      ip, [r0], #2
         /* copy 4 bytes */
         /* lsl #29 leaves bit 2 of r3 in N and bit 3 in C */
         movs        ip, r3, lsl #29
         itt         mi
         ldrmi       ip, [r1], #4
         strmi       ip, [r0], #4
         /* copy 8 bytes */
         /* C (bit 3 of r3) is still valid here; r0 is 8-byte aligned by now */
         bcc         1f
         vld1.8      {d0}, [r1]!
         vst1.8      {d0}, [r0, :64]!
 1:      /* copy 16 bytes */
         /* lsl #27 leaves bit 4 of r3 in N and bit 5 in C */
         movs        ip, r3, lsl #27
         bpl         1f
         vld1.8      {q0}, [r1]!
         vst1.8      {q0}, [r0, :128]!
 1:      /* copy 32 bytes */
         /* the NEON copy above leaves C untouched, so bcc tests bit 5 */
         bcc         .L_memcpy_dispatch
         vld1.8      {q0, q1}, [r1]!
         vst1.8      {q0, q1}, [r0, :256]!

 .L_memcpy_dispatch:
         /* r0 is 64-byte aligned from here on */
         // pre-decrement by 128 to detect nearly-done condition easily, but
         // also need to check if we have less than 128 bytes left at this
         // point due to alignment code above
         subs        r2, r2, #128
         blo         .L_memcpy_lt_128presub

         // Denver does better if both source and dest are aligned so
         // we'll special-case that even though the code is virtually identical
         tst         r1, #0xF
         bne         .L_memcpy_neon_unalign_src_pld

         // DRAM memcpy should be throttled slightly to get full bandwidth
         //
         // r2 already has 128 subtracted at this point; when it is above
         // 32768 the loop without the source alignment qualifier is used.
         cmp         r2, #32768
         bhi         .L_memcpy_neon_unalign_src_pld
         .align      4
 1:
         /* copy 128 bytes in each loop */
         /*
          * Loop for a source that is 16-byte aligned (loads carry :128).
          * Only this subs sets the flags that the bhs at the bottom tests;
          * the pld/vld1/vst1 in between leave them unchanged.
          */
         subs        r2, r2, #128

         /* preload a cache line */
         pld         [r1, #PREFETCH_DISTANCE]
         /* copy a cache line */
         vld1.8      {q0, q1}, [r1, :128]!
         vst1.8      {q0, q1}, [r0, :256]!
         vld1.8      {q0, q1}, [r1, :128]!
         vst1.8      {q0, q1}, [r0, :256]!
         /* preload a cache line */
         pld         [r1, #PREFETCH_DISTANCE]
         /* copy a cache line */
         vld1.8      {q0, q1}, [r1, :128]!
         vst1.8      {q0, q1}, [r0, :256]!
         vld1.8      {q0, q1}, [r1, :128]!
         vst1.8      {q0, q1}, [r0, :256]!

         bhs         1b
         /* r2 went negative; add 128 back to get the 0..127 byte remainder */
         adds        r2, r2, #128
         bne         .L_memcpy_lt_128bytes_align
         pop         {r0, pc}

         .align      4
 .L_memcpy_neon_unalign_src_pld:
 1:
         /* copy 128 bytes in each loop */
         /*
          * Same loop as above except that the loads carry no alignment
          * qualifier; the stores still declare a 32-byte aligned r0.
          */
         subs        r2, r2, #128

         /* preload a cache line */
         pld         [r1, #PREFETCH_DISTANCE]
         /* copy a cache line */
         vld1.8      {q0, q1}, [r1]!
         vst1.8      {q0, q1}, [r0, :256]!
         vld1.8      {q0, q1}, [r1]!
         vst1.8      {q0, q1}, [r0, :256]!
         /* preload a cache line */
         pld         [r1, #PREFETCH_DISTANCE]
         /* copy a cache line */
         vld1.8      {q0, q1}, [r1]!
         vst1.8      {q0, q1}, [r0, :256]!
         vld1.8      {q0, q1}, [r1]!
         vst1.8      {q0, q1}, [r0, :256]!

         bhs         1b
         /* r2 went negative; add 128 back to get the 0..127 byte remainder */
         adds        r2, r2, #128
         bne         .L_memcpy_lt_128bytes_align
         pop         {r0, pc}

 .L_memcpy_lt_128presub:
         /* undo the pre-decrement done at .L_memcpy_dispatch */
         add         r2, r2, #128
 .L_memcpy_lt_128bytes_align:
         /*
          * Tail copy of fewer than 128 bytes to a destination that was
          * 64-byte aligned by the code above; stores keep their alignment
          * qualifiers.  Each chunk size is selected by one bit of r2.
          */
         /* copy 64 bytes */
         /* lsl #26 leaves bit 6 of r2 in C and bit 5 in N */
         movs        ip, r2, lsl #26
         bcc         1f
         vld1.8      {q0, q1}, [r1]!
         vst1.8      {q0, q1}, [r0, :256]!
         vld1.8      {q0, q1}, [r1]!
         vst1.8      {q0, q1}, [r0, :256]!
 1:      /* copy 32 bytes */
         /* N from the shift above is still valid after the NEON copies */
         bpl         1f
         vld1.8      {q0, q1}, [r1]!
         vst1.8      {q0, q1}, [r0, :256]!
 1:      /* copy 16 bytes */
         /* lsl #28 leaves bit 4 of r2 in C and bit 3 in N */
         movs        ip, r2, lsl #28
         bcc         1f
         vld1.8      {q0}, [r1]!
         vst1.8      {q0}, [r0, :128]!
 1:      /* copy 8 bytes */
         bpl         1f
         vld1.8      {d0}, [r1]!
         vst1.8      {d0}, [r0, :64]!
 1:      /* copy 4 bytes */
         tst         r2, #4
         itt         ne
         ldrne       ip, [r1], #4
         strne       ip, [r0], #4
         /* copy 2 bytes */
         /* lsl #31 leaves bit 1 of r2 in C and bit 0 in N */
         movs        ip, r2, lsl #31
         itt         cs
         ldrhcs      ip, [r1], #2
         strhcs      ip, [r0], #2
         /* copy 1 byte */
         /* last transfer, so the pointers are not advanced */
         itt         mi
         ldrbmi      ip, [r1]
         strbmi      ip, [r0]

         pop         {r0, pc}

 .L_memcpy_lt_128bytes:
         /*
          * Copy of 32..127 bytes taken straight from the entry checks: the
          * destination was never aligned, so the stores carry no alignment
          * qualifier.
          */
         /* copy 64 bytes */
         /* lsl #26 leaves bit 6 of r2 in C and bit 5 in N */
         movs        ip, r2, lsl #26
         bcc         1f
         vld1.8      {q0, q1}, [r1]!
         vst1.8      {q0, q1}, [r0]!
         vld1.8      {q0, q1}, [r1]!
         vst1.8      {q0, q1}, [r0]!
 1:      /* copy 32 bytes */
         bpl	    .L_memcpy_lt_32bytes
         vld1.8      {q0, q1}, [r1]!
         vst1.8      {q0, q1}, [r0]!
 .L_memcpy_lt_32bytes:
         /* also the direct target of the entry check for r2 < 32 */
         /* copy 16 bytes */
         /* lsl #28 leaves bit 4 of r2 in C and bit 3 in N */
         movs        ip, r2, lsl #28
         bcc         1f
         vld1.8      {q0}, [r1]!
         vst1.8      {q0}, [r0]!
 1:      /* copy 8 bytes */
         bpl         1f
         vld1.8      {d0}, [r1]!
         vst1.8      {d0}, [r0]!
 1:      /* copy 4 bytes */
         tst         r2, #4
         itt         ne
         ldrne       ip, [r1], #4
         strne       ip, [r0], #4
         /* copy 2 bytes */
         /* lsl #31 leaves bit 1 of r2 in C and bit 0 in N */
         movs        ip, r2, lsl #31
         itt         cs
         ldrhcs      ip, [r1], #2
         strhcs      ip, [r0], #2
         /* copy 1 byte */
         /* last transfer, so the pointers are not advanced */
         itt         mi
         ldrbmi      ip, [r1]
         strbmi      ip, [r0]

 .L_memcpy_done:
         /* reload r0 from the stack and return through the saved lr */
         pop         {r0, pc}
 END(MEMCPY_BASE)
	/*
	* Copyright (C) 2008 The Android Open Source Project
	* All rights reserved.
	* Copyright (c) 2013-2014, NVIDIA Corporation. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in
	* the documentation and/or other materials provided with the
	* distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
	* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	*/

	/* Cache line size in bytes; the unit used for the pld offsets below. */
	#define CACHE_LINE_SIZE (64)
	/* Byte offset ahead of r1 that the 128-byte copy loops preload (6 lines). */
	#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*6)

	/*
	 * MEMCPY_BASE: copy r2 bytes from [r1] to [r0], walking both pointers
	 * upwards, using integer and NEON loads/stores.
	 *
	 * Register usage as seen in this code:
	 *   r0      destination pointer, post-incremented as data is stored
	 *   r1      source pointer, post-incremented as data is loaded
	 *   r2      number of bytes still to copy
	 *   r3      number of bytes needed to bring r0 to a 64-byte boundary
	 *   ip      scratch for 1/2/4-byte copies and for flag-setting shifts
	 *   q0, q1  NEON scratch for the 8/16/32-byte transfers
	 *
	 * Every exit is `pop {r0, pc}` and the CFI directives describe r0 and lr
	 * as already saved on the stack, so both must have been pushed before
	 * control reaches this code.
	 * NOTE(review): presumably the file that includes this one performs that
	 * push - confirm against the caller.
	 * NOTE(review): ENTRY_PRIVATE, END and MEMCPY_BASE are not defined in the
	 * visible source.
	 * NOTE(review): an ENTRY_PRIVATE(MEMCPY_BASE) block with the same labels
	 * also appears earlier in this file - confirm whether both are intended.
	 */
	ENTRY_PRIVATE(MEMCPY_BASE)
	/*
	 * Unwind info: the CFA is 8 bytes above the CFA register, with r0
	 * saved at offset 0 and lr at offset 4 from that register.
	 */
	.cfi_def_cfa_offset 8
	.cfi_rel_offset r0, 0
	.cfi_rel_offset lr, 4

	/* nothing to copy when the length is zero or dest == src */
	cmp r2, #0
	beq .L_memcpy_done
	cmp r0, r1
	beq .L_memcpy_done

	/* preload next cache line */
	pld [r1, #CACHE_LINE_SIZE*1]

	/* Deal with very small blocks (< 32bytes) asap */
	cmp r2, #32
	blo .L_memcpy_lt_32bytes
	/* no need to align if len < 128 bytes */
	cmp r2, #128
	blo .L_memcpy_lt_128bytes

	/* large copy, align dest to 64 byte boundary */
	/*
	 * r3 = (-r0) & 0x3F, the byte count up to the next 64-byte boundary.
	 * The pld between ands and beq does not change the flags, so beq
	 * tests the ands result: zero means r0 is already aligned.
	 */
	pld [r1, #CACHE_LINE_SIZE*2]
	rsb r3, r0, #0
	ands r3, r3, #0x3F
	pld [r1, #CACHE_LINE_SIZE*3]
	beq .L_memcpy_dispatch
	/* the r3 alignment bytes are copied below; take them off the count */
	sub r2, r2, r3
	/* copy 1 byte */
	/*
	 * lsl #31 leaves bit 0 of r3 in N and bit 1 in C.  The conditional
	 * loads into ip do not alter the flags, so the `itt cs` block below
	 * still acts on the C flag produced by this shift.
	 */
	movs ip, r3, lsl #31
	itt mi
	ldrbmi ip, [r1], #1
	strbmi ip, [r0], #1
	/* copy 2 bytes */
	itt cs
	ldrhcs ip, [r1], #2
	strhcs ip, [r0], #2
	/* copy 4 bytes */
	/* lsl #29 leaves bit 2 of r3 in N and bit 3 in C */
	movs ip, r3, lsl #29
	itt mi
	ldrmi ip, [r1], #4
	strmi ip, [r0], #4
	/* copy 8 bytes */
	/* C (bit 3 of r3) is still valid here; r0 is 8-byte aligned by now */
	bcc 1f
	vld1.8 {d0}, [r1]!
	vst1.8 {d0}, [r0, :64]!
	1: /* copy 16 bytes */
	/* lsl #27 leaves bit 4 of r3 in N and bit 5 in C */
	movs ip, r3, lsl #27
	bpl 1f
	vld1.8 {q0}, [r1]!
	vst1.8 {q0}, [r0, :128]!
	1: /* copy 32 bytes */
	/* the NEON copy above leaves C untouched, so bcc tests bit 5 */
	bcc .L_memcpy_dispatch
	vld1.8 {q0, q1}, [r1]!
	vst1.8 {q0, q1}, [r0, :256]!

	.L_memcpy_dispatch:
	/* r0 is 64-byte aligned from here on */
	// pre-decrement by 128 to detect nearly-done condition easily, but
	// also need to check if we have less than 128 bytes left at this
	// point due to alignment code above
	subs r2, r2, #128
	blo .L_memcpy_lt_128presub

	// Denver does better if both source and dest are aligned so
	// we'll special-case that even though the code is virtually identical
	tst r1, #0xF
	bne .L_memcpy_neon_unalign_src_pld

	// DRAM memcpy should be throttled slightly to get full bandwidth
	//
	// r2 already has 128 subtracted at this point; when it is above
	// 32768 the loop without the source alignment qualifier is used.
	cmp r2, #32768
	bhi .L_memcpy_neon_unalign_src_pld
	.align 4
	1:
	/* copy 128 bytes in each loop */
	/*
	 * Loop for a source that is 16-byte aligned (loads carry :128).
	 * Only this subs sets the flags that the bhs at the bottom tests;
	 * the pld/vld1/vst1 in between leave them unchanged.
	 */
	subs r2, r2, #128

	/* preload a cache line */
	pld [r1, #PREFETCH_DISTANCE]
	/* copy a cache line */
	vld1.8 {q0, q1}, [r1, :128]!
	vst1.8 {q0, q1}, [r0, :256]!
	vld1.8 {q0, q1}, [r1, :128]!
	vst1.8 {q0, q1}, [r0, :256]!
	/* preload a cache line */
	pld [r1, #PREFETCH_DISTANCE]
	/* copy a cache line */
	vld1.8 {q0, q1}, [r1, :128]!
	vst1.8 {q0, q1}, [r0, :256]!
	vld1.8 {q0, q1}, [r1, :128]!
	vst1.8 {q0, q1}, [r0, :256]!

	bhs 1b
	/* r2 went negative; add 128 back to get the 0..127 byte remainder */
	adds r2, r2, #128
	bne .L_memcpy_lt_128bytes_align
	pop {r0, pc}

	.align 4
	.L_memcpy_neon_unalign_src_pld:
	1:
	/* copy 128 bytes in each loop */
	/*
	 * Same loop as above except that the loads carry no alignment
	 * qualifier; the stores still declare a 32-byte aligned r0.
	 */
	subs r2, r2, #128

	/* preload a cache line */
	pld [r1, #PREFETCH_DISTANCE]
	/* copy a cache line */
	vld1.8 {q0, q1}, [r1]!
	vst1.8 {q0, q1}, [r0, :256]!
	vld1.8 {q0, q1}, [r1]!
	vst1.8 {q0, q1}, [r0, :256]!
	/* preload a cache line */
	pld [r1, #PREFETCH_DISTANCE]
	/* copy a cache line */
	vld1.8 {q0, q1}, [r1]!
	vst1.8 {q0, q1}, [r0, :256]!
	vld1.8 {q0, q1}, [r1]!
	vst1.8 {q0, q1}, [r0, :256]!

	bhs 1b
	/* r2 went negative; add 128 back to get the 0..127 byte remainder */
	adds r2, r2, #128
	bne .L_memcpy_lt_128bytes_align
	pop {r0, pc}

	.L_memcpy_lt_128presub:
	/* undo the pre-decrement done at .L_memcpy_dispatch */
	add r2, r2, #128
	.L_memcpy_lt_128bytes_align:
	/*
	 * Tail copy of fewer than 128 bytes to a destination that was
	 * 64-byte aligned by the code above; stores keep their alignment
	 * qualifiers.  Each chunk size is selected by one bit of r2.
	 */
	/* copy 64 bytes */
	/* lsl #26 leaves bit 6 of r2 in C and bit 5 in N */
	movs ip, r2, lsl #26
	bcc 1f
	vld1.8 {q0, q1}, [r1]!
	vst1.8 {q0, q1}, [r0, :256]!
	vld1.8 {q0, q1}, [r1]!
	vst1.8 {q0, q1}, [r0, :256]!
	1: /* copy 32 bytes */
	/* N from the shift above is still valid after the NEON copies */
	bpl 1f
	vld1.8 {q0, q1}, [r1]!
	vst1.8 {q0, q1}, [r0, :256]!
	1: /* copy 16 bytes */
	/* lsl #28 leaves bit 4 of r2 in C and bit 3 in N */
	movs ip, r2, lsl #28
	bcc 1f
	vld1.8 {q0}, [r1]!
	vst1.8 {q0}, [r0, :128]!
	1: /* copy 8 bytes */
	bpl 1f
	vld1.8 {d0}, [r1]!
	vst1.8 {d0}, [r0, :64]!
	1: /* copy 4 bytes */
	tst r2, #4
	itt ne
	ldrne ip, [r1], #4
	strne ip, [r0], #4
	/* copy 2 bytes */
	/* lsl #31 leaves bit 1 of r2 in C and bit 0 in N */
	movs ip, r2, lsl #31
	itt cs
	ldrhcs ip, [r1], #2
	strhcs ip, [r0], #2
	/* copy 1 byte */
	/* last transfer, so the pointers are not advanced */
	itt mi
	ldrbmi ip, [r1]
	strbmi ip, [r0]

	pop {r0, pc}

	.L_memcpy_lt_128bytes:
	/*
	 * Copy of 32..127 bytes taken straight from the entry checks: the
	 * destination was never aligned, so the stores carry no alignment
	 * qualifier.
	 */
	/* copy 64 bytes */
	/* lsl #26 leaves bit 6 of r2 in C and bit 5 in N */
	movs ip, r2, lsl #26
	bcc 1f
	vld1.8 {q0, q1}, [r1]!
	vst1.8 {q0, q1}, [r0]!
	vld1.8 {q0, q1}, [r1]!
	vst1.8 {q0, q1}, [r0]!
	1: /* copy 32 bytes */
	bpl .L_memcpy_lt_32bytes
	vld1.8 {q0, q1}, [r1]!
	vst1.8 {q0, q1}, [r0]!
	.L_memcpy_lt_32bytes:
	/* also the direct target of the entry check for r2 < 32 */
	/* copy 16 bytes */
	/* lsl #28 leaves bit 4 of r2 in C and bit 3 in N */
	movs ip, r2, lsl #28
	bcc 1f
	vld1.8 {q0}, [r1]!
	vst1.8 {q0}, [r0]!
	1: /* copy 8 bytes */
	bpl 1f
	vld1.8 {d0}, [r1]!
	vst1.8 {d0}, [r0]!
	1: /* copy 4 bytes */
	tst r2, #4
	itt ne
	ldrne ip, [r1], #4
	strne ip, [r0], #4
	/* copy 2 bytes */
	/* lsl #31 leaves bit 1 of r2 in C and bit 0 in N */
	movs ip, r2, lsl #31
	itt cs
	ldrhcs ip, [r1], #2
	strhcs ip, [r0], #2
	/* copy 1 byte */
	/* last transfer, so the pointers are not advanced */
	itt mi
	ldrbmi ip, [r1]
	strbmi ip, [r0]

	.L_memcpy_done:
	/* reload r0 from the stack and return through the saved lr */
	pop {r0, pc}
	END(MEMCPY_BASE)