| /* |
| Copyright (c) 2014, Intel Corporation |
| All rights reserved. |
| |
| Redistribution and use in source and binary forms, with or without |
| modification, are permitted provided that the following conditions are met: |
| |
| * Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| |
| * Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| |
| * Neither the name of Intel Corporation nor the names of its contributors |
| * may be used to endorse or promote products derived from this software |
| * without specific prior written permission. |
| |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON |
| ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "cache.h" |
| |
| #ifndef MEMMOVE |
| # define MEMMOVE memmove |
| #endif |
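/* MEMMOVE defaults to the public symbol "memmove"; a build may
   predefine it to emit this implementation under another name
   (e.g. -DMEMMOVE=memmove_sse2, a hypothetical alias).  Either way
   the C-visible contract is the standard one:
       void *memmove(void *dst, const void *src, size_t n);
   and the destination pointer is returned. */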
| |
| #ifndef L |
| # define L(label) .L##label |
| #endif |
| |
| #ifndef cfi_startproc |
| # define cfi_startproc .cfi_startproc |
| #endif |
| |
| #ifndef cfi_endproc |
| # define cfi_endproc .cfi_endproc |
| #endif |
| |
| #ifndef cfi_rel_offset |
| # define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off |
| #endif |
| |
| #ifndef cfi_restore |
| # define cfi_restore(reg) .cfi_restore reg |
| #endif |
| |
| #ifndef cfi_adjust_cfa_offset |
| # define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off |
| #endif |
| |
| #ifndef ENTRY |
| # define ENTRY(name) \ |
| .type name, @function; \ |
| .globl name; \ |
| .p2align 4; \ |
| name: \ |
| cfi_startproc |
| #endif |
| |
| #ifndef END |
| # define END(name) \ |
| cfi_endproc; \ |
| .size name, .-name |
| #endif |
| |
| #define DEST PARMS |
| #define SRC DEST+4 |
| #define LEN SRC+4 |
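/* Stack layout once ENTRANCE has pushed %ebx, assuming the standard
   i386 cdecl convention:
        (%esp)   saved %ebx
       4(%esp)   return address
       8(%esp)   dst  (DEST)
      12(%esp)   src  (SRC)
      16(%esp)   len  (LEN)  */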
| |
| #define CFI_PUSH(REG) \ |
| cfi_adjust_cfa_offset (4); \ |
| cfi_rel_offset (REG, 0) |
| |
| #define CFI_POP(REG) \ |
| cfi_adjust_cfa_offset (-4); \ |
| cfi_restore (REG) |
| |
| #define PUSH(REG) pushl REG; CFI_PUSH (REG) |
| #define POP(REG) popl REG; CFI_POP (REG) |
| |
#define PARMS 8 /* 4 bytes for the saved %ebx + 4 for the return address. */
| #define ENTRANCE PUSH (%ebx); |
| #define RETURN_END POP (%ebx); ret |
| #define RETURN RETURN_END; CFI_PUSH (%ebx) |
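/* RETURN is used from paths where code that follows textually still
   has %ebx live on the stack; the trailing CFI_PUSH re-arms the
   unwind annotations that RETURN_END cancelled, so fall-through
   labels keep a correct CFA. */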
| |
| .section .text.sse2,"ax",@progbits |
| ENTRY (MEMMOVE) |
| ENTRANCE |
| movl LEN(%esp), %ecx |
| movl SRC(%esp), %eax |
| movl DEST(%esp), %edx |
| |
| /* Check whether we should copy backward or forward. */ |
| cmp %eax, %edx |
| je L(mm_return) |
| jg L(mm_len_0_or_more_backward) |
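/* If dst and src are equal there is nothing to move.  If dst is above
   src the buffers may overlap such that a forward copy would read
   bytes it has already overwritten, so the copy runs backward;
   otherwise a forward copy is safe. */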
| |
/* Dispatch on length.  The ranges [0..16], (16..32], (32..64],
   (64..128] and lengths above 128 are handled separately. */
| cmp $16, %ecx |
| jbe L(mm_len_0_16_bytes_forward) |
| |
| cmpl $32, %ecx |
| ja L(mm_len_32_or_more_forward) |
| |
/* Copy (16..32] bytes and return. */
| movdqu (%eax), %xmm0 |
| movdqu -16(%eax, %ecx), %xmm1 |
| movdqu %xmm0, (%edx) |
| movdqu %xmm1, -16(%edx, %ecx) |
| jmp L(mm_return) |
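/* Both 16-byte loads above are issued before either store, so the
   block is overlap-safe even when the head and tail of the copy
   share bytes. */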
| |
| L(mm_len_32_or_more_forward): |
| cmpl $64, %ecx |
| ja L(mm_len_64_or_more_forward) |
| |
/* Copy (32..64] bytes and return. */
| movdqu (%eax), %xmm0 |
| movdqu 16(%eax), %xmm1 |
| movdqu -16(%eax, %ecx), %xmm2 |
| movdqu -32(%eax, %ecx), %xmm3 |
| movdqu %xmm0, (%edx) |
| movdqu %xmm1, 16(%edx) |
| movdqu %xmm2, -16(%edx, %ecx) |
| movdqu %xmm3, -32(%edx, %ecx) |
| jmp L(mm_return) |
| |
| L(mm_len_64_or_more_forward): |
| cmpl $128, %ecx |
| ja L(mm_len_128_or_more_forward) |
| |
/* Copy (64..128] bytes and return. */
| movdqu (%eax), %xmm0 |
| movdqu 16(%eax), %xmm1 |
| movdqu 32(%eax), %xmm2 |
| movdqu 48(%eax), %xmm3 |
| movdqu -64(%eax, %ecx), %xmm4 |
| movdqu -48(%eax, %ecx), %xmm5 |
| movdqu -32(%eax, %ecx), %xmm6 |
| movdqu -16(%eax, %ecx), %xmm7 |
| movdqu %xmm0, (%edx) |
| movdqu %xmm1, 16(%edx) |
| movdqu %xmm2, 32(%edx) |
| movdqu %xmm3, 48(%edx) |
| movdqu %xmm4, -64(%edx, %ecx) |
| movdqu %xmm5, -48(%edx, %ecx) |
| movdqu %xmm6, -32(%edx, %ecx) |
| movdqu %xmm7, -16(%edx, %ecx) |
| jmp L(mm_return) |
| |
| L(mm_len_128_or_more_forward): |
| PUSH (%esi) |
| PUSH (%edi) |
| |
/* Copy the first 64 bytes unaligned, then continue with stores
   aligned to a 64-byte destination boundary. */
| movdqu (%eax), %xmm0 |
| movdqu 16(%eax), %xmm1 |
| movdqu 32(%eax), %xmm2 |
| movdqu 48(%eax), %xmm3 |
| |
| leal 64(%edx), %edi |
| andl $-64, %edi |
| subl %edx, %eax |
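/* %edi is now the first 64-byte-aligned destination address strictly
   above dst, and %eax holds (src - dst), so (%eax, %edi) addresses
   the source byte that belongs at destination address %edi. */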
| |
| movdqu (%eax, %edi), %xmm4 |
| movdqu 16(%eax, %edi), %xmm5 |
| movdqu 32(%eax, %edi), %xmm6 |
| movdqu 48(%eax, %edi), %xmm7 |
| |
| movdqu %xmm0, (%edx) |
| movdqu %xmm1, 16(%edx) |
| movdqu %xmm2, 32(%edx) |
| movdqu %xmm3, 48(%edx) |
| movdqa %xmm4, (%edi) |
| movaps %xmm5, 16(%edi) |
| movaps %xmm6, 32(%edi) |
| movaps %xmm7, 48(%edi) |
| addl $64, %edi |
| |
| leal (%edx, %ecx), %ebx |
| andl $-64, %ebx |
| cmp %edi, %ebx |
| jbe L(mm_copy_remaining_forward) |
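/* %ebx is the last 64-byte boundary at or below dst + len; the
   aligned 64-byte loop runs while %edi is below it, and anything
   past %ebx is handled as a tail copy. */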
| |
| cmp $SHARED_CACHE_SIZE_HALF, %ecx |
| jae L(mm_large_page_loop_forward) |
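/* Copies of at least half the shared cache size take the
   non-temporal path below so a huge copy does not flush the cache;
   SHARED_CACHE_SIZE_HALF comes from cache.h. */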
| |
| .p2align 4 |
| L(mm_main_loop_forward): |
| |
| prefetcht0 128(%eax, %edi) |
| |
| movdqu (%eax, %edi), %xmm0 |
| movdqu 16(%eax, %edi), %xmm1 |
| movdqu 32(%eax, %edi), %xmm2 |
| movdqu 48(%eax, %edi), %xmm3 |
| movdqa %xmm0, (%edi) |
| movaps %xmm1, 16(%edi) |
| movaps %xmm2, 32(%edi) |
| movaps %xmm3, 48(%edi) |
| leal 64(%edi), %edi |
| cmp %edi, %ebx |
| ja L(mm_main_loop_forward) |
| |
| L(mm_copy_remaining_forward): |
| addl %edx, %ecx |
| subl %edi, %ecx |
/* Everything below %edi in the destination has been copied.
   %ecx now holds the number of bytes left to copy; point %esi
   at the matching source position. */
| leal (%edi, %eax), %esi |
| |
| L(mm_remaining_0_64_bytes_forward): |
| cmp $32, %ecx |
| ja L(mm_remaining_33_64_bytes_forward) |
| cmp $16, %ecx |
| ja L(mm_remaining_17_32_bytes_forward) |
| testl %ecx, %ecx |
| .p2align 4,,2 |
| je L(mm_return_pop_all) |
| |
| cmpb $8, %cl |
| ja L(mm_remaining_9_16_bytes_forward) |
| cmpb $4, %cl |
| .p2align 4,,5 |
| ja L(mm_remaining_5_8_bytes_forward) |
| cmpb $2, %cl |
| .p2align 4,,1 |
| ja L(mm_remaining_3_4_bytes_forward) |
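/* Length 1..2: store the last byte then the first; when the length
   is 1 both are the same byte. */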
| movzbl -1(%esi,%ecx), %eax |
| movzbl (%esi), %ebx |
| movb %al, -1(%edi,%ecx) |
| movb %bl, (%edi) |
| jmp L(mm_return_pop_all) |
| |
| L(mm_remaining_33_64_bytes_forward): |
| movdqu (%esi), %xmm0 |
| movdqu 16(%esi), %xmm1 |
| movdqu -32(%esi, %ecx), %xmm2 |
| movdqu -16(%esi, %ecx), %xmm3 |
| movdqu %xmm0, (%edi) |
| movdqu %xmm1, 16(%edi) |
| movdqu %xmm2, -32(%edi, %ecx) |
| movdqu %xmm3, -16(%edi, %ecx) |
| jmp L(mm_return_pop_all) |
| |
| L(mm_remaining_17_32_bytes_forward): |
| movdqu (%esi), %xmm0 |
| movdqu -16(%esi, %ecx), %xmm1 |
| movdqu %xmm0, (%edi) |
| movdqu %xmm1, -16(%edi, %ecx) |
| jmp L(mm_return_pop_all) |
| |
| L(mm_remaining_9_16_bytes_forward): |
| movq (%esi), %xmm0 |
| movq -8(%esi, %ecx), %xmm1 |
| movq %xmm0, (%edi) |
| movq %xmm1, -8(%edi, %ecx) |
| jmp L(mm_return_pop_all) |
| |
| L(mm_remaining_5_8_bytes_forward): |
| movl (%esi), %eax |
| movl -4(%esi,%ecx), %ebx |
| movl %eax, (%edi) |
| movl %ebx, -4(%edi,%ecx) |
| jmp L(mm_return_pop_all) |
| |
| L(mm_remaining_3_4_bytes_forward): |
| movzwl -2(%esi,%ecx), %eax |
| movzwl (%esi), %ebx |
| movw %ax, -2(%edi,%ecx) |
| movw %bx, (%edi) |
| jmp L(mm_return_pop_all) |
| |
| L(mm_len_0_16_bytes_forward): |
| testb $24, %cl |
| jne L(mm_len_9_16_bytes_forward) |
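/* The testb $24 above checks bits 3 and 4 of the length, i.e. any
   value in [8..16]; a length of exactly 8 is also correct on that
   path because its two movq accesses then coincide. */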
| testb $4, %cl |
| .p2align 4,,5 |
| jne L(mm_len_5_8_bytes_forward) |
| testl %ecx, %ecx |
| .p2align 4,,2 |
| je L(mm_return) |
| testb $2, %cl |
| .p2align 4,,1 |
| jne L(mm_len_2_4_bytes_forward) |
| movzbl -1(%eax,%ecx), %ebx |
| movzbl (%eax), %eax |
| movb %bl, -1(%edx,%ecx) |
| movb %al, (%edx) |
| jmp L(mm_return) |
| |
| L(mm_len_2_4_bytes_forward): |
| movzwl -2(%eax,%ecx), %ebx |
| movzwl (%eax), %eax |
| movw %bx, -2(%edx,%ecx) |
| movw %ax, (%edx) |
| jmp L(mm_return) |
| |
| L(mm_len_5_8_bytes_forward): |
| movl (%eax), %ebx |
| movl -4(%eax,%ecx), %eax |
| movl %ebx, (%edx) |
| movl %eax, -4(%edx,%ecx) |
| jmp L(mm_return) |
| |
| L(mm_len_9_16_bytes_forward): |
| movq (%eax), %xmm0 |
| movq -8(%eax, %ecx), %xmm1 |
| movq %xmm0, (%edx) |
| movq %xmm1, -8(%edx, %ecx) |
| jmp L(mm_return) |
| |
| CFI_POP (%edi) |
| CFI_POP (%esi) |
| |
| L(mm_recalc_len): |
| /* Compute in %ecx how many bytes are left to copy after |
| the main loop stops. */ |
| movl %ebx, %ecx |
| subl %edx, %ecx |
/* The backward-copy path. */
| L(mm_len_0_or_more_backward): |
| |
/* Dispatch on length, mirroring the forward path: [0..16], (16..32],
   (32..64], (64..128] and lengths above 128 are handled separately. */
| cmp $16, %ecx |
| jbe L(mm_len_0_16_bytes_backward) |
| |
| cmpl $32, %ecx |
| jg L(mm_len_32_or_more_backward) |
| |
/* Copy (16..32] bytes and return. */
| movdqu (%eax), %xmm0 |
| movdqu -16(%eax, %ecx), %xmm1 |
| movdqu %xmm0, (%edx) |
| movdqu %xmm1, -16(%edx, %ecx) |
| jmp L(mm_return) |
| |
| L(mm_len_32_or_more_backward): |
| cmpl $64, %ecx |
| jg L(mm_len_64_or_more_backward) |
| |
/* Copy (32..64] bytes and return. */
| movdqu (%eax), %xmm0 |
| movdqu 16(%eax), %xmm1 |
| movdqu -16(%eax, %ecx), %xmm2 |
| movdqu -32(%eax, %ecx), %xmm3 |
| movdqu %xmm0, (%edx) |
| movdqu %xmm1, 16(%edx) |
| movdqu %xmm2, -16(%edx, %ecx) |
| movdqu %xmm3, -32(%edx, %ecx) |
| jmp L(mm_return) |
| |
| L(mm_len_64_or_more_backward): |
| cmpl $128, %ecx |
| jg L(mm_len_128_or_more_backward) |
| |
/* Copy (64..128] bytes and return. */
| movdqu (%eax), %xmm0 |
| movdqu 16(%eax), %xmm1 |
| movdqu 32(%eax), %xmm2 |
| movdqu 48(%eax), %xmm3 |
| movdqu -64(%eax, %ecx), %xmm4 |
| movdqu -48(%eax, %ecx), %xmm5 |
| movdqu -32(%eax, %ecx), %xmm6 |
| movdqu -16(%eax, %ecx), %xmm7 |
| movdqu %xmm0, (%edx) |
| movdqu %xmm1, 16(%edx) |
| movdqu %xmm2, 32(%edx) |
| movdqu %xmm3, 48(%edx) |
| movdqu %xmm4, -64(%edx, %ecx) |
| movdqu %xmm5, -48(%edx, %ecx) |
| movdqu %xmm6, -32(%edx, %ecx) |
| movdqu %xmm7, -16(%edx, %ecx) |
| jmp L(mm_return) |
| |
| L(mm_len_128_or_more_backward): |
| PUSH (%esi) |
| PUSH (%edi) |
| |
/* Align the destination address.  Save the last 64 bytes of the
   source first so that the aligned stores below cannot overwrite
   them before they are read. */
| movdqu -16(%eax, %ecx), %xmm0 |
| movdqu -32(%eax, %ecx), %xmm1 |
| movdqu -48(%eax, %ecx), %xmm2 |
| movdqu -64(%eax, %ecx), %xmm3 |
| |
| leal (%edx, %ecx), %edi |
| andl $-64, %edi |
| |
| movl %eax, %esi |
| subl %edx, %esi |
| |
| movdqu -16(%edi, %esi), %xmm4 |
| movdqu -32(%edi, %esi), %xmm5 |
| movdqu -48(%edi, %esi), %xmm6 |
| movdqu -64(%edi, %esi), %xmm7 |
| |
| movdqu %xmm0, -16(%edx, %ecx) |
| movdqu %xmm1, -32(%edx, %ecx) |
| movdqu %xmm2, -48(%edx, %ecx) |
| movdqu %xmm3, -64(%edx, %ecx) |
| movdqa %xmm4, -16(%edi) |
| movdqa %xmm5, -32(%edi) |
| movdqa %xmm6, -48(%edi) |
| movdqa %xmm7, -64(%edi) |
| leal -64(%edi), %edi |
| |
| leal 64(%edx), %ebx |
| andl $-64, %ebx |
| |
| cmp %edi, %ebx |
| jae L(mm_main_loop_backward_end) |
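/* Mirror of the forward setup: %ebx is the first 64-byte boundary
   strictly above dst, and the aligned loop below walks %edi down
   toward it, 64 bytes per iteration. */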
| |
| cmp $SHARED_CACHE_SIZE_HALF, %ecx |
| jae L(mm_large_page_loop_backward) |
| |
| .p2align 4 |
| L(mm_main_loop_backward): |
| |
| prefetcht0 -128(%edi, %esi) |
| |
| movdqu -64(%edi, %esi), %xmm0 |
| movdqu -48(%edi, %esi), %xmm1 |
| movdqu -32(%edi, %esi), %xmm2 |
| movdqu -16(%edi, %esi), %xmm3 |
| movdqa %xmm0, -64(%edi) |
| movdqa %xmm1, -48(%edi) |
| movdqa %xmm2, -32(%edi) |
| movdqa %xmm3, -16(%edi) |
| leal -64(%edi), %edi |
| cmp %edi, %ebx |
| jb L(mm_main_loop_backward) |
| L(mm_main_loop_backward_end): |
| POP (%edi) |
| POP (%esi) |
| jmp L(mm_recalc_len) |
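/* The low tail below the aligned region still needs copying;
   mm_recalc_len rederives its length from %ebx and re-enters the
   small backward cases. */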
| |
/* Copy [0..16] bytes and return. */
| L(mm_len_0_16_bytes_backward): |
| testb $24, %cl |
| jnz L(mm_len_9_16_bytes_backward) |
| testb $4, %cl |
| .p2align 4,,5 |
| jnz L(mm_len_5_8_bytes_backward) |
| testl %ecx, %ecx |
| .p2align 4,,2 |
| je L(mm_return) |
| testb $2, %cl |
| .p2align 4,,1 |
| jne L(mm_len_3_4_bytes_backward) |
| movzbl -1(%eax,%ecx), %ebx |
| movzbl (%eax), %eax |
| movb %bl, -1(%edx,%ecx) |
| movb %al, (%edx) |
| jmp L(mm_return) |
| |
| L(mm_len_3_4_bytes_backward): |
| movzwl -2(%eax,%ecx), %ebx |
| movzwl (%eax), %eax |
| movw %bx, -2(%edx,%ecx) |
| movw %ax, (%edx) |
| jmp L(mm_return) |
| |
| L(mm_len_9_16_bytes_backward): |
| PUSH (%esi) |
| movl -4(%eax,%ecx), %ebx |
| movl -8(%eax,%ecx), %esi |
| movl %ebx, -4(%edx,%ecx) |
| movl %esi, -8(%edx,%ecx) |
| subl $8, %ecx |
| POP (%esi) |
| jmp L(mm_len_0_16_bytes_backward) |
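/* After the top 8 bytes are copied the remaining length is at most 8,
   so fall back to the small-size dispatcher. */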
| |
| L(mm_len_5_8_bytes_backward): |
| movl (%eax), %ebx |
| movl -4(%eax,%ecx), %eax |
| movl %ebx, (%edx) |
| movl %eax, -4(%edx,%ecx) |
| |
| L(mm_return): |
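/* memmove returns the destination pointer. */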
| movl %edx, %eax |
| RETURN |
| |
| L(mm_return_pop_all): |
| movl %edx, %eax |
| POP (%edi) |
| POP (%esi) |
| RETURN |
| |
/* Large-length forward copy: stream 64 bytes per iteration with
   non-temporal stores. */
| |
| .p2align 4 |
| L(mm_large_page_loop_forward): |
| movdqu (%eax, %edi), %xmm0 |
| movdqu 16(%eax, %edi), %xmm1 |
| movdqu 32(%eax, %edi), %xmm2 |
| movdqu 48(%eax, %edi), %xmm3 |
| movntdq %xmm0, (%edi) |
| movntdq %xmm1, 16(%edi) |
| movntdq %xmm2, 32(%edi) |
| movntdq %xmm3, 48(%edi) |
| leal 64(%edi), %edi |
| cmp %edi, %ebx |
| ja L(mm_large_page_loop_forward) |
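/* Non-temporal stores are weakly ordered; the sfence ensures they
   become globally visible before the ordinary stores of the tail
   copy. */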
| sfence |
| jmp L(mm_copy_remaining_forward) |
| |
/* Large-length backward copy: stream 64 bytes per iteration with
   non-temporal stores. */
| .p2align 4 |
| L(mm_large_page_loop_backward): |
| movdqu -64(%edi, %esi), %xmm0 |
| movdqu -48(%edi, %esi), %xmm1 |
| movdqu -32(%edi, %esi), %xmm2 |
| movdqu -16(%edi, %esi), %xmm3 |
| movntdq %xmm0, -64(%edi) |
| movntdq %xmm1, -48(%edi) |
| movntdq %xmm2, -32(%edi) |
| movntdq %xmm3, -16(%edi) |
| leal -64(%edi), %edi |
| cmp %edi, %ebx |
| jb L(mm_large_page_loop_backward) |
| sfence |
| POP (%edi) |
| POP (%esi) |
| jmp L(mm_recalc_len) |
| |
| END (MEMMOVE) |