| /* |
| Copyright (c) 2014, Intel Corporation |
| All rights reserved. |
| |
| Redistribution and use in source and binary forms, with or without |
| modification, are permitted provided that the following conditions are met: |
| |
| * Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| |
| * Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| |
| * Neither the name of Intel Corporation nor the names of its contributors |
| * may be used to endorse or promote products derived from this software |
| * without specific prior written permission. |
| |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON |
| ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #ifdef USE_AS_STRNCMP |
| /* |
| * UPDATE_STRNCMP_COUNTER: charge the first, partial 16-byte chunk against |
| * the byte limit. At the use sites %rcx is the offset of the string start |
| * inside that chunk and %r11 is the remaining count, so (16 - %rcx) bytes |
| * have just been compared and the new count is %r11 + %rcx - 16. |
| * |
| * Since the counter, %r11, is unsigned, we branch to strcmp_exitz |
| * if the new counter > the old one (the subtraction wrapped around) or |
| * if it is 0. Otherwise the new count is stored back in %r11. |
| * Writes %r9 and the flags. |
| */ |
| #define UPDATE_STRNCMP_COUNTER \ |
| /* calculate left number to compare */ \ |
| lea -16(%rcx, %r11), %r9; \ |
| cmp %r9, %r11; \ |
| jb L(strcmp_exitz); \ |
| test %r9, %r9; \ |
| je L(strcmp_exitz); \ |
| mov %r9, %r11 |
| |
| #else |
| /* Without USE_AS_STRNCMP there is no byte limit: the update expands to nothing. */ |
| #define UPDATE_STRNCMP_COUNTER |
| /* |
| * Default symbol name for the strcmp build. |
| * NOTE(review): no default is provided on the USE_AS_STRNCMP side; presumably |
| * the including file defines STRCMP there -- confirm. |
| */ |
| #ifndef STRCMP |
| #define STRCMP strcmp |
| #endif |
| #endif |
| |
| /* L(label): spell a file-local label; the ".L" prefix keeps it a local, non-exported symbol in GNU as. */ |
| #ifndef L |
| # define L(label) .L##label |
| #endif |
| |
| /* Map the cfi_* names used by ENTRY/END onto the assembler's .cfi_* directives unless already defined. */ |
| #ifndef cfi_startproc |
| # define cfi_startproc .cfi_startproc |
| #endif |
| |
| #ifndef cfi_endproc |
| # define cfi_endproc .cfi_endproc |
| #endif |
| |
| /* |
| * ENTRY(name): open a function -- mark name as a function symbol, make it |
| * global, align it to 16 bytes (.p2align 4), emit the label and start its |
| * CFI region. |
| */ |
| #ifndef ENTRY |
| # define ENTRY(name) \ |
| .type name, @function; \ |
| .globl name; \ |
| .p2align 4; \ |
| name: \ |
| cfi_startproc |
| #endif |
| |
| /* END(name): close the CFI region and record the symbol size as (here - name). */ |
| #ifndef END |
| # define END(name) \ |
| cfi_endproc; \ |
| .size name, .-name |
| #endif |
| /* RETURN expands to a plain ret instruction. */ |
| #define RETURN ret |
| .section .text.ssse3,"ax",@progbits |
| ENTRY (STRCMP) |
| /* |
| * This implementation uses SSE to compare up to 16 bytes at a time. |
| * |
| * Registers on this entry path, as used by the code below: |
| * %rdi, %rsi the two string pointers |
| * %rdx byte limit (USE_AS_STRNCMP only), copied to %r11 |
| * %eax, %ecx offsets of %rdi / %rsi inside their 64-byte cache line |
| * %xmm0-%xmm2 scratch for the 16-byte compare |
| */ |
| #ifdef USE_AS_STRNCMP |
| test %rdx, %rdx /* limit of 0: nothing to compare */ |
| je L(strcmp_exitz) |
| cmp $1, %rdx /* limit of 1: take the L(Byte0) path */ |
| je L(Byte0) |
| mov %rdx, %r11 /* %r11 = remaining byte count */ |
| #endif |
| mov %esi, %ecx |
| mov %edi, %eax |
| /* Use 64bit AND here to avoid long NOP padding. */ |
| and $0x3f, %rcx /* rsi alignment in cache line */ |
| and $0x3f, %rax /* rdi alignment in cache line */ |
| cmp $0x30, %ecx |
| ja L(crosscache) /* rsi: 16-byte load will cross cache line */ |
| cmp $0x30, %eax |
| ja L(crosscache) /* rdi: 16-byte load will cross cache line */ |
| /* Neither load crosses a cache line: fetch 16 bytes of each string as two 8-byte halves. */ |
| movlpd (%rdi), %xmm1 |
| movlpd (%rsi), %xmm2 |
| movhpd 8(%rdi), %xmm1 |
| movhpd 8(%rsi), %xmm2 |
| pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ |
| pcmpeqb %xmm1, %xmm0 /* Any null chars? */ |
| pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ |
| /* |
| * equal-mask minus null-mask: a byte keeps its top bit only where the two |
| * strings match and the byte is not a null char, so the bit mask below is |
| * 0xffff exactly when all 16 bytes are equal and non-null. |
| */ |
| psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ |
| jnz L(less16bytes) /* If not, find different value or null char */ |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* finish comparison: limit was 16 bytes or fewer */ |
| #endif |
| add $16, %rsi /* prepare to search next 16 bytes */ |
| add $16, %rdi /* prepare to search next 16 bytes */ |
| /* Falls through into L(crosscache); %ecx/%eax still hold the cache-line offsets. */ |
| |
| /* |
| * Determine source and destination string offsets from 16-byte alignment. |
| * Use relative offset difference between the two to determine which case |
| * below to use. |
| * |
| * On exit %rsi/%rdi are rounded down to 16 bytes, %ecx/%eax hold their |
| * offsets within the 16-byte chunk, and the pair is ordered so that the |
| * %rsi offset is the larger one. %r8d is 0xffff if the two strings were |
| * swapped to achieve that and 0 otherwise. %edx = 0xffff on exit. |
| */ |
| .p2align 4 |
| L(crosscache): |
| and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ |
| and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ |
| mov $0xffff, %edx /* for equivalent offset */ |
| xor %r8d, %r8d /* %r8d = 0: strings not swapped */ |
| and $0xf, %ecx /* offset of rsi */ |
| and $0xf, %eax /* offset of rdi */ |
| cmp %eax, %ecx |
| je L(ashr_0) /* rsi and rdi relative offset same */ |
| ja L(bigger) /* rsi offset is already the larger one (unsigned compare) */ |
| mov %edx, %r8d /* r8d is offset flag for exit tail */ |
| xchg %ecx, %eax /* swap offsets ... */ |
| xchg %rsi, %rdi /* ... and pointers so %rsi has the larger offset */ |
| L(bigger): |
| lea 15(%rax), %r9 |
| sub %rcx, %r9 /* %r9 = 15 + rdi offset - rsi offset: table index, 0..14 */ |
| lea L(unaligned_table)(%rip), %r10 |
| movslq (%r10, %r9,4), %r9 /* entries are 32-bit signed offsets from the table base */ |
| lea (%r10, %r9), %r10 |
| jmp *%r10 /* jump to corresponding case */ |
| |
| /* |
| * The following cases will be handled by ashr_0 |
| * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
| * n(0~15) n(0~15) 15(15+ n-n) ashr_0 |
| * |
| * Both strings start at the same offset inside their 16-byte chunk, so |
| * aligned chunks can be compared directly with no shifting. |
| */ |
| .p2align 4 |
| L(ashr_0): |
| |
| movdqa (%rsi), %xmm1 |
| pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ |
| pcmpeqb %xmm1, %xmm0 /* Any null chars? */ |
| pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ |
| psubb %xmm0, %xmm1 /* packed sub of comparison results*/ |
| pmovmskb %xmm1, %r9d |
| shr %cl, %edx /* adjust 0xffff for offset */ |
| shr %cl, %r9d /* adjust for 16-byte offset */ |
| sub %r9d, %edx |
| /* |
| * %edx (0xffff >> offset) equals %r9d only if every byte from the string |
| * start to the end of the chunk (16 - %rcx bytes) is equal in both strings |
| * and no null char was seen. |
| */ |
| jne L(less32bytes) /* mismatch or null char */ |
| UPDATE_STRNCMP_COUNTER |
| mov $16, %rcx /* index of the next chunk to load */ |
| mov $16, %r9 |
| pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ |
| |
| /* |
| * Now both strings are aligned at 16-byte boundary. Loop over strings |
| * checking 32-bytes per iteration (two unrolled 16-byte steps). |
| * %xmm0 is all zero at the top of every step: a step only continues when |
| * no null byte was found, which leaves the null mask in %xmm0 empty. |
| */ |
| .p2align 4 |
| L(loop_ashr_0): |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pcmpeqb %xmm2, %xmm1 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm1 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx /* zero iff all 16 bytes are equal and non-null */ |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| add $16, %rcx |
| /* Second unrolled step, identical to the first. */ |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| |
| pcmpeqb %xmm1, %xmm0 |
| pcmpeqb %xmm2, %xmm1 |
| psubb %xmm0, %xmm1 |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx |
| jnz L(exit) /* mismatch or null char seen */ |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| add $16, %rcx |
| jmp L(loop_ashr_0) |
| |
| /* |
| * The following cases will be handled by ashr_1 |
| * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
| * n(15) n -15 0(15 +(n-15) - n) ashr_1 |
| * |
| * The %rsi offset exceeds the %rdi offset by 15, so every aligned %rsi |
| * chunk is matched against the %rdi byte stream shifted by 1 byte |
| * (palignr $1 over two consecutive %rdi chunks). |
| */ |
| .p2align 4 |
| L(ashr_1): |
| /* First, partial chunk: only bytes at or above the %rsi offset (%cl) take part. */ |
| pxor %xmm0, %xmm0 |
| movdqa (%rdi), %xmm2 |
| movdqa (%rsi), %xmm1 |
| pcmpeqb %xmm1, %xmm0 /* Any null chars? */ |
| pslldq $15, %xmm2 /* shift first string to align with second */ |
| pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ |
| psubb %xmm0, %xmm2 /* packed sub of comparison results*/ |
| pmovmskb %xmm2, %r9d |
| shr %cl, %edx /* adjust 0xffff for offset */ |
| shr %cl, %r9d /* adjust for 16-byte offset */ |
| sub %r9d, %edx |
| jnz L(less32bytes) /* mismatch or null char seen */ |
| movdqa (%rdi), %xmm3 /* previous %rdi chunk: low operand of palignr below */ |
| UPDATE_STRNCMP_COUNTER |
| |
| pxor %xmm0, %xmm0 |
| mov $16, %rcx /* index for loads*/ |
| mov $1, %r9d /* byte position left over from less32bytes case */ |
| /* |
| * Setup %r10 value allows us to detect crossing a page boundary. |
| * When %r10 goes positive we have crossed a page boundary and |
| * need to do a nibble. |
| */ |
| lea 1(%rdi), %r10 |
| and $0xfff, %r10 /* offset into 4K page */ |
| sub $0x1000, %r10 /* subtract 4K pagesize */ |
| |
| /* Main loop, two unrolled 16-byte steps; %xmm0 is all zero at the top of each step. */ |
| .p2align 4 |
| L(loop_ashr_1): |
| add $16, %r10 |
| jg L(nibble_ashr_1) /* cross page boundary */ |
| |
| L(gobble_ashr_1): |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 /* store for next cycle */ |
| |
| /* %xmm2 = bytes 1..15 of the previous %rdi chunk (%xmm3) followed by byte 0 of the new one. */ |
| palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pcmpeqb %xmm2, %xmm1 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm1 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx /* zero iff all 16 bytes are equal and non-null */ |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 /* new chunk becomes the previous one */ |
| |
| add $16, %r10 |
| jg L(nibble_ashr_1) /* cross page boundary */ |
| |
| /* Second unrolled step, identical to the first. */ |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 /* store for next cycle */ |
| |
| palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 |
| pcmpeqb %xmm2, %xmm1 |
| psubb %xmm0, %xmm1 |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 |
| jmp L(loop_ashr_1) |
| |
| /* |
| * Nibble avoids loads across page boundary. This is to avoid a potential |
| * access into unmapped memory. |
| * |
| * Reached when the next %rdi load would start a new 4K page. Bytes 1..15 |
| * of %xmm3 (15 bytes) are still uncompared. If they contain the null |
| * char, or the strncmp limit ends within them, the next %rdi chunk is not |
| * needed and must not be loaded; finish through the exit tail instead. |
| * %xmm0 is all zero on entry. |
| */ |
| .p2align 4 |
| L(nibble_ashr_1): |
| pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ |
| pmovmskb %xmm0, %edx |
| test $0xfffe, %edx /* only bytes 1..15 are still pending */ |
| jnz L(ashr_1_exittail) /* found null char */ |
| |
| #ifdef USE_AS_STRNCMP |
| /* All 15 pending bytes are in %xmm3: a remaining count of up to 15 needs no further %rdi load. */ |
| cmp $15, %r11 |
| jbe L(ashr_1_exittail) |
| #endif |
| |
| pxor %xmm0, %xmm0 |
| sub $0x1000, %r10 /* subtract 4K from %r10: re-arm for the next page */ |
| jmp L(gobble_ashr_1) |
| |
| /* |
| * Once find null char, determine if there is a string mismatch |
| * before the null char. |
| * |
| * Loads the current %rsi chunk and shifts the pending %rdi bytes (%xmm3) |
| * and their null-byte mask (%xmm0) down by 1 byte so they line up with it, |
| * then continues at L(aftertail). The next %rdi chunk is not touched. |
| */ |
| .p2align 4 |
| L(ashr_1_exittail): |
| movdqa (%rsi, %rcx), %xmm1 |
| psrldq $1, %xmm0 |
| psrldq $1, %xmm3 |
| jmp L(aftertail) |
| |
| /* |
| * The following cases will be handled by ashr_2 |
| * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
| * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 |
| * |
| * The %rsi offset exceeds the %rdi offset by 14, so every aligned %rsi |
| * chunk is matched against the %rdi byte stream shifted by 2 bytes |
| * (palignr $2 over two consecutive %rdi chunks). |
| */ |
| .p2align 4 |
| L(ashr_2): |
| /* First, partial chunk: only bytes at or above the %rsi offset (%cl) take part. */ |
| pxor %xmm0, %xmm0 |
| movdqa (%rdi), %xmm2 |
| movdqa (%rsi), %xmm1 |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pslldq $14, %xmm2 /* shift %rdi chunk up to line up with the %rsi chunk */ |
| pcmpeqb %xmm1, %xmm2 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm2, %r9d |
| shr %cl, %edx /* 0xffff >> offset: expected mask */ |
| shr %cl, %r9d /* drop bits for bytes before the string start */ |
| sub %r9d, %edx |
| jnz L(less32bytes) /* mismatch or null char seen */ |
| movdqa (%rdi), %xmm3 /* previous %rdi chunk: low operand of palignr below */ |
| UPDATE_STRNCMP_COUNTER |
| |
| pxor %xmm0, %xmm0 |
| mov $16, %rcx /* index for loads */ |
| mov $2, %r9d /* byte position left over from less32bytes case */ |
| /* |
| * Setup %r10 value allows us to detect crossing a page boundary. |
| * When %r10 goes positive we have crossed a page boundary and |
| * need to do a nibble. |
| */ |
| lea 2(%rdi), %r10 |
| and $0xfff, %r10 /* offset into 4K page */ |
| sub $0x1000, %r10 /* subtract 4K pagesize */ |
| |
| /* Main loop, two unrolled 16-byte steps; %xmm0 is all zero at the top of each step. */ |
| .p2align 4 |
| L(loop_ashr_2): |
| add $16, %r10 |
| jg L(nibble_ashr_2) /* cross page boundary */ |
| |
| L(gobble_ashr_2): |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 /* store for next cycle */ |
| |
| /* %xmm2 = bytes 2..15 of the previous %rdi chunk (%xmm3) followed by bytes 0..1 of the new one. */ |
| palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pcmpeqb %xmm2, %xmm1 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm1 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx /* zero iff all 16 bytes are equal and non-null */ |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 /* new chunk becomes the previous one */ |
| |
| add $16, %r10 |
| jg L(nibble_ashr_2) /* cross page boundary */ |
| |
| /* Second unrolled step, identical to the first. */ |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 |
| |
| palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 |
| pcmpeqb %xmm2, %xmm1 |
| psubb %xmm0, %xmm1 |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 |
| jmp L(loop_ashr_2) |
| |
| /* |
| * Reached when the next %rdi load would start a new 4K page. Bytes 2..15 |
| * of %xmm3 (14 bytes) are still uncompared. If they contain the null |
| * char, or the strncmp limit ends within them, the next %rdi chunk is not |
| * needed and must not be loaded (the page may be unmapped); finish through |
| * the exit tail instead. %xmm0 is all zero on entry. |
| */ |
| .p2align 4 |
| L(nibble_ashr_2): |
| pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ |
| pmovmskb %xmm0, %edx |
| test $0xfffc, %edx /* only bytes 2..15 are still pending */ |
| jnz L(ashr_2_exittail) |
| |
| #ifdef USE_AS_STRNCMP |
| /* All 14 pending bytes are in %xmm3: a remaining count of up to 14 needs no further %rdi load. */ |
| cmp $14, %r11 |
| jbe L(ashr_2_exittail) |
| #endif |
| |
| pxor %xmm0, %xmm0 |
| sub $0x1000, %r10 /* re-arm the page-crossing counter for the next page */ |
| jmp L(gobble_ashr_2) |
| |
| /* |
| * Exit tail: load the current %rsi chunk and shift the pending %rdi bytes |
| * (%xmm3) and their null-byte mask (%xmm0) down by 2 bytes so they line up |
| * with it, then continue at L(aftertail). The next %rdi chunk is not touched. |
| */ |
| .p2align 4 |
| L(ashr_2_exittail): |
| movdqa (%rsi, %rcx), %xmm1 |
| psrldq $2, %xmm0 |
| psrldq $2, %xmm3 |
| jmp L(aftertail) |
| |
| /* |
| * The following cases will be handled by ashr_3 |
| * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
| * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 |
| * |
| * The %rsi offset exceeds the %rdi offset by 13, so every aligned %rsi |
| * chunk is matched against the %rdi byte stream shifted by 3 bytes |
| * (palignr $3 over two consecutive %rdi chunks). |
| */ |
| .p2align 4 |
| L(ashr_3): |
| /* First, partial chunk: only bytes at or above the %rsi offset (%cl) take part. */ |
| pxor %xmm0, %xmm0 |
| movdqa (%rdi), %xmm2 |
| movdqa (%rsi), %xmm1 |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pslldq $13, %xmm2 /* shift %rdi chunk up to line up with the %rsi chunk */ |
| pcmpeqb %xmm1, %xmm2 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm2, %r9d |
| shr %cl, %edx /* 0xffff >> offset: expected mask */ |
| shr %cl, %r9d /* drop bits for bytes before the string start */ |
| sub %r9d, %edx |
| jnz L(less32bytes) /* mismatch or null char seen */ |
| movdqa (%rdi), %xmm3 /* previous %rdi chunk: low operand of palignr below */ |
| |
| UPDATE_STRNCMP_COUNTER |
| |
| pxor %xmm0, %xmm0 |
| mov $16, %rcx /* index for loads */ |
| mov $3, %r9d /* byte position left over from less32bytes case */ |
| /* |
| * Setup %r10 value allows us to detect crossing a page boundary. |
| * When %r10 goes positive we have crossed a page boundary and |
| * need to do a nibble. |
| */ |
| lea 3(%rdi), %r10 |
| and $0xfff, %r10 /* offset into 4K page */ |
| sub $0x1000, %r10 /* subtract 4K pagesize */ |
| |
| /* Main loop, two unrolled 16-byte steps; %xmm0 is all zero at the top of each step. */ |
| .p2align 4 |
| L(loop_ashr_3): |
| add $16, %r10 |
| jg L(nibble_ashr_3) /* cross page boundary */ |
| |
| L(gobble_ashr_3): |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 /* store for next cycle */ |
| |
| /* %xmm2 = bytes 3..15 of the previous %rdi chunk (%xmm3) followed by bytes 0..2 of the new one. */ |
| palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pcmpeqb %xmm2, %xmm1 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm1 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx /* zero iff all 16 bytes are equal and non-null */ |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 /* new chunk becomes the previous one */ |
| |
| add $16, %r10 |
| jg L(nibble_ashr_3) /* cross page boundary */ |
| |
| /* Second unrolled step, identical to the first. */ |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 |
| |
| palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 |
| pcmpeqb %xmm2, %xmm1 |
| psubb %xmm0, %xmm1 |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 |
| jmp L(loop_ashr_3) |
| |
| /* |
| * Reached when the next %rdi load would start a new 4K page. Bytes 3..15 |
| * of %xmm3 (13 bytes) are still uncompared. If they contain the null |
| * char, or the strncmp limit ends within them, the next %rdi chunk is not |
| * needed and must not be loaded (the page may be unmapped); finish through |
| * the exit tail instead. %xmm0 is all zero on entry. |
| */ |
| .p2align 4 |
| L(nibble_ashr_3): |
| pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ |
| pmovmskb %xmm0, %edx |
| test $0xfff8, %edx /* only bytes 3..15 are still pending */ |
| jnz L(ashr_3_exittail) |
| |
| #ifdef USE_AS_STRNCMP |
| /* All 13 pending bytes are in %xmm3: a remaining count of up to 13 needs no further %rdi load. */ |
| cmp $13, %r11 |
| jbe L(ashr_3_exittail) |
| #endif |
| |
| pxor %xmm0, %xmm0 |
| sub $0x1000, %r10 /* re-arm the page-crossing counter for the next page */ |
| jmp L(gobble_ashr_3) |
| |
| /* |
| * Exit tail: load the current %rsi chunk and shift the pending %rdi bytes |
| * (%xmm3) and their null-byte mask (%xmm0) down by 3 bytes so they line up |
| * with it, then continue at L(aftertail). The next %rdi chunk is not touched. |
| */ |
| .p2align 4 |
| L(ashr_3_exittail): |
| movdqa (%rsi, %rcx), %xmm1 |
| psrldq $3, %xmm0 |
| psrldq $3, %xmm3 |
| jmp L(aftertail) |
| |
| /* |
| * The following cases will be handled by ashr_4 |
| * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
| * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 |
| * |
| * The %rsi offset exceeds the %rdi offset by 12, so every aligned %rsi |
| * chunk is matched against the %rdi byte stream shifted by 4 bytes |
| * (palignr $4 over two consecutive %rdi chunks). |
| */ |
| .p2align 4 |
| L(ashr_4): |
| /* First, partial chunk: only bytes at or above the %rsi offset (%cl) take part. */ |
| pxor %xmm0, %xmm0 |
| movdqa (%rdi), %xmm2 |
| movdqa (%rsi), %xmm1 |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pslldq $12, %xmm2 /* shift %rdi chunk up to line up with the %rsi chunk */ |
| pcmpeqb %xmm1, %xmm2 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm2, %r9d |
| shr %cl, %edx /* 0xffff >> offset: expected mask */ |
| shr %cl, %r9d /* drop bits for bytes before the string start */ |
| sub %r9d, %edx |
| jnz L(less32bytes) /* mismatch or null char seen */ |
| movdqa (%rdi), %xmm3 /* previous %rdi chunk: low operand of palignr below */ |
| |
| UPDATE_STRNCMP_COUNTER |
| |
| pxor %xmm0, %xmm0 |
| mov $16, %rcx /* index for loads */ |
| mov $4, %r9d /* byte position left over from less32bytes case */ |
| /* |
| * Setup %r10 value allows us to detect crossing a page boundary. |
| * When %r10 goes positive we have crossed a page boundary and |
| * need to do a nibble. |
| */ |
| lea 4(%rdi), %r10 |
| and $0xfff, %r10 /* offset into 4K page */ |
| sub $0x1000, %r10 /* subtract 4K pagesize */ |
| |
| /* Main loop, two unrolled 16-byte steps; %xmm0 is all zero at the top of each step. */ |
| .p2align 4 |
| L(loop_ashr_4): |
| add $16, %r10 |
| jg L(nibble_ashr_4) /* cross page boundary */ |
| |
| L(gobble_ashr_4): |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 /* store for next cycle */ |
| |
| /* %xmm2 = bytes 4..15 of the previous %rdi chunk (%xmm3) followed by bytes 0..3 of the new one. */ |
| palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pcmpeqb %xmm2, %xmm1 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm1 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx /* zero iff all 16 bytes are equal and non-null */ |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 /* new chunk becomes the previous one */ |
| |
| add $16, %r10 |
| jg L(nibble_ashr_4) /* cross page boundary */ |
| |
| /* Second unrolled step, identical to the first. */ |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 |
| |
| palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 |
| pcmpeqb %xmm2, %xmm1 |
| psubb %xmm0, %xmm1 |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 |
| jmp L(loop_ashr_4) |
| |
| /* |
| * Reached when the next %rdi load would start a new 4K page. Bytes 4..15 |
| * of %xmm3 (12 bytes) are still uncompared. If they contain the null |
| * char, or the strncmp limit ends within them, the next %rdi chunk is not |
| * needed and must not be loaded (the page may be unmapped); finish through |
| * the exit tail instead. %xmm0 is all zero on entry. |
| */ |
| .p2align 4 |
| L(nibble_ashr_4): |
| pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ |
| pmovmskb %xmm0, %edx |
| test $0xfff0, %edx /* only bytes 4..15 are still pending */ |
| jnz L(ashr_4_exittail) |
| |
| #ifdef USE_AS_STRNCMP |
| /* All 12 pending bytes are in %xmm3: a remaining count of up to 12 needs no further %rdi load. */ |
| cmp $12, %r11 |
| jbe L(ashr_4_exittail) |
| #endif |
| |
| pxor %xmm0, %xmm0 |
| sub $0x1000, %r10 /* re-arm the page-crossing counter for the next page */ |
| jmp L(gobble_ashr_4) |
| |
| /* |
| * Exit tail: load the current %rsi chunk and shift the pending %rdi bytes |
| * (%xmm3) and their null-byte mask (%xmm0) down by 4 bytes so they line up |
| * with it, then continue at L(aftertail). The next %rdi chunk is not touched. |
| */ |
| .p2align 4 |
| L(ashr_4_exittail): |
| movdqa (%rsi, %rcx), %xmm1 |
| psrldq $4, %xmm0 |
| psrldq $4, %xmm3 |
| jmp L(aftertail) |
| |
| /* |
| * The following cases will be handled by ashr_5 |
| * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
| * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 |
| * |
| * The %rsi offset exceeds the %rdi offset by 11, so every aligned %rsi |
| * chunk is matched against the %rdi byte stream shifted by 5 bytes |
| * (palignr $5 over two consecutive %rdi chunks). |
| */ |
| .p2align 4 |
| L(ashr_5): |
| /* First, partial chunk: only bytes at or above the %rsi offset (%cl) take part. */ |
| pxor %xmm0, %xmm0 |
| movdqa (%rdi), %xmm2 |
| movdqa (%rsi), %xmm1 |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pslldq $11, %xmm2 /* shift %rdi chunk up to line up with the %rsi chunk */ |
| pcmpeqb %xmm1, %xmm2 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm2, %r9d |
| shr %cl, %edx /* 0xffff >> offset: expected mask */ |
| shr %cl, %r9d /* drop bits for bytes before the string start */ |
| sub %r9d, %edx |
| jnz L(less32bytes) /* mismatch or null char seen */ |
| movdqa (%rdi), %xmm3 /* previous %rdi chunk: low operand of palignr below */ |
| |
| UPDATE_STRNCMP_COUNTER |
| |
| pxor %xmm0, %xmm0 |
| mov $16, %rcx /* index for loads */ |
| mov $5, %r9d /* byte position left over from less32bytes case */ |
| /* |
| * Setup %r10 value allows us to detect crossing a page boundary. |
| * When %r10 goes positive we have crossed a page boundary and |
| * need to do a nibble. |
| */ |
| lea 5(%rdi), %r10 |
| and $0xfff, %r10 /* offset into 4K page */ |
| sub $0x1000, %r10 /* subtract 4K pagesize */ |
| |
| /* Main loop, two unrolled 16-byte steps; %xmm0 is all zero at the top of each step. */ |
| .p2align 4 |
| L(loop_ashr_5): |
| add $16, %r10 |
| jg L(nibble_ashr_5) /* cross page boundary */ |
| |
| L(gobble_ashr_5): |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 /* store for next cycle */ |
| |
| /* %xmm2 = bytes 5..15 of the previous %rdi chunk (%xmm3) followed by bytes 0..4 of the new one. */ |
| palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pcmpeqb %xmm2, %xmm1 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm1 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx /* zero iff all 16 bytes are equal and non-null */ |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 /* new chunk becomes the previous one */ |
| |
| add $16, %r10 |
| jg L(nibble_ashr_5) /* cross page boundary */ |
| |
| /* Second unrolled step, identical to the first. */ |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 |
| |
| palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 |
| pcmpeqb %xmm2, %xmm1 |
| psubb %xmm0, %xmm1 |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 |
| jmp L(loop_ashr_5) |
| |
| /* |
| * Reached when the next %rdi load would start a new 4K page. Bytes 5..15 |
| * of %xmm3 (11 bytes) are still uncompared. If they contain the null |
| * char, or the strncmp limit ends within them, the next %rdi chunk is not |
| * needed and must not be loaded (the page may be unmapped); finish through |
| * the exit tail instead. %xmm0 is all zero on entry. |
| */ |
| .p2align 4 |
| L(nibble_ashr_5): |
| pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ |
| pmovmskb %xmm0, %edx |
| test $0xffe0, %edx /* only bytes 5..15 are still pending */ |
| jnz L(ashr_5_exittail) |
| |
| #ifdef USE_AS_STRNCMP |
| /* All 11 pending bytes are in %xmm3: a remaining count of up to 11 needs no further %rdi load. */ |
| cmp $11, %r11 |
| jbe L(ashr_5_exittail) |
| #endif |
| |
| pxor %xmm0, %xmm0 |
| sub $0x1000, %r10 /* re-arm the page-crossing counter for the next page */ |
| jmp L(gobble_ashr_5) |
| |
| /* |
| * Exit tail: load the current %rsi chunk and shift the pending %rdi bytes |
| * (%xmm3) and their null-byte mask (%xmm0) down by 5 bytes so they line up |
| * with it, then continue at L(aftertail). The next %rdi chunk is not touched. |
| */ |
| .p2align 4 |
| L(ashr_5_exittail): |
| movdqa (%rsi, %rcx), %xmm1 |
| psrldq $5, %xmm0 |
| psrldq $5, %xmm3 |
| jmp L(aftertail) |
| |
| /* |
| * The following cases will be handled by ashr_6 |
| * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
| * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 |
| * |
| * The %rsi offset exceeds the %rdi offset by 10, so every aligned %rsi |
| * chunk is matched against the %rdi byte stream shifted by 6 bytes |
| * (palignr $6 over two consecutive %rdi chunks). |
| */ |
| .p2align 4 |
| L(ashr_6): |
| /* First, partial chunk: only bytes at or above the %rsi offset (%cl) take part. */ |
| pxor %xmm0, %xmm0 |
| movdqa (%rdi), %xmm2 |
| movdqa (%rsi), %xmm1 |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pslldq $10, %xmm2 /* shift %rdi chunk up to line up with the %rsi chunk */ |
| pcmpeqb %xmm1, %xmm2 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm2, %r9d |
| shr %cl, %edx /* 0xffff >> offset: expected mask */ |
| shr %cl, %r9d /* drop bits for bytes before the string start */ |
| sub %r9d, %edx |
| jnz L(less32bytes) /* mismatch or null char seen */ |
| movdqa (%rdi), %xmm3 /* previous %rdi chunk: low operand of palignr below */ |
| |
| UPDATE_STRNCMP_COUNTER |
| |
| pxor %xmm0, %xmm0 |
| mov $16, %rcx /* index for loads */ |
| mov $6, %r9d /* byte position left over from less32bytes case */ |
| /* |
| * Setup %r10 value allows us to detect crossing a page boundary. |
| * When %r10 goes positive we have crossed a page boundary and |
| * need to do a nibble. |
| */ |
| lea 6(%rdi), %r10 |
| and $0xfff, %r10 /* offset into 4K page */ |
| sub $0x1000, %r10 /* subtract 4K pagesize */ |
| |
| /* Main loop, two unrolled 16-byte steps; %xmm0 is all zero at the top of each step. */ |
| .p2align 4 |
| L(loop_ashr_6): |
| add $16, %r10 |
| jg L(nibble_ashr_6) /* cross page boundary */ |
| |
| L(gobble_ashr_6): |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 /* store for next cycle */ |
| |
| /* %xmm2 = bytes 6..15 of the previous %rdi chunk (%xmm3) followed by bytes 0..5 of the new one. */ |
| palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pcmpeqb %xmm2, %xmm1 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm1 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx /* zero iff all 16 bytes are equal and non-null */ |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 /* new chunk becomes the previous one */ |
| |
| add $16, %r10 |
| jg L(nibble_ashr_6) /* cross page boundary */ |
| |
| /* Second unrolled step, identical to the first. */ |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 |
| |
| palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 |
| pcmpeqb %xmm2, %xmm1 |
| psubb %xmm0, %xmm1 |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 |
| jmp L(loop_ashr_6) |
| |
| /* |
| * Reached when the next %rdi load would start a new 4K page. Bytes 6..15 |
| * of %xmm3 (10 bytes) are still uncompared. If they contain the null |
| * char, or the strncmp limit ends within them, the next %rdi chunk is not |
| * needed and must not be loaded (the page may be unmapped); finish through |
| * the exit tail instead. %xmm0 is all zero on entry. |
| */ |
| .p2align 4 |
| L(nibble_ashr_6): |
| pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ |
| pmovmskb %xmm0, %edx |
| test $0xffc0, %edx /* only bytes 6..15 are still pending */ |
| jnz L(ashr_6_exittail) |
| |
| #ifdef USE_AS_STRNCMP |
| /* All 10 pending bytes are in %xmm3: a remaining count of up to 10 needs no further %rdi load. */ |
| cmp $10, %r11 |
| jbe L(ashr_6_exittail) |
| #endif |
| |
| pxor %xmm0, %xmm0 |
| sub $0x1000, %r10 /* re-arm the page-crossing counter for the next page */ |
| jmp L(gobble_ashr_6) |
| |
| /* |
| * Exit tail: load the current %rsi chunk and shift the pending %rdi bytes |
| * (%xmm3) and their null-byte mask (%xmm0) down by 6 bytes so they line up |
| * with it, then continue at L(aftertail). The next %rdi chunk is not touched. |
| */ |
| .p2align 4 |
| L(ashr_6_exittail): |
| movdqa (%rsi, %rcx), %xmm1 |
| psrldq $6, %xmm0 |
| psrldq $6, %xmm3 |
| jmp L(aftertail) |
| |
| /* |
| * The following cases will be handled by ashr_7 |
| * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
| * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 |
| * |
| * The %rsi offset exceeds the %rdi offset by 9, so every aligned %rsi |
| * chunk is matched against the %rdi byte stream shifted by 7 bytes |
| * (palignr $7 over two consecutive %rdi chunks). |
| */ |
| .p2align 4 |
| L(ashr_7): |
| /* First, partial chunk: only bytes at or above the %rsi offset (%cl) take part. */ |
| pxor %xmm0, %xmm0 |
| movdqa (%rdi), %xmm2 |
| movdqa (%rsi), %xmm1 |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pslldq $9, %xmm2 /* shift %rdi chunk up to line up with the %rsi chunk */ |
| pcmpeqb %xmm1, %xmm2 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm2, %r9d |
| shr %cl, %edx /* 0xffff >> offset: expected mask */ |
| shr %cl, %r9d /* drop bits for bytes before the string start */ |
| sub %r9d, %edx |
| jnz L(less32bytes) /* mismatch or null char seen */ |
| movdqa (%rdi), %xmm3 /* previous %rdi chunk: low operand of palignr below */ |
| |
| UPDATE_STRNCMP_COUNTER |
| |
| pxor %xmm0, %xmm0 |
| mov $16, %rcx /* index for loads */ |
| mov $7, %r9d /* byte position left over from less32bytes case */ |
| /* |
| * Setup %r10 value allows us to detect crossing a page boundary. |
| * When %r10 goes positive we have crossed a page boundary and |
| * need to do a nibble. |
| */ |
| lea 7(%rdi), %r10 |
| and $0xfff, %r10 /* offset into 4K page */ |
| sub $0x1000, %r10 /* subtract 4K pagesize */ |
| |
| /* Main loop, two unrolled 16-byte steps; %xmm0 is all zero at the top of each step. */ |
| .p2align 4 |
| L(loop_ashr_7): |
| add $16, %r10 |
| jg L(nibble_ashr_7) /* cross page boundary */ |
| |
| L(gobble_ashr_7): |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 /* store for next cycle */ |
| |
| /* %xmm2 = bytes 7..15 of the previous %rdi chunk (%xmm3) followed by bytes 0..6 of the new one. */ |
| palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pcmpeqb %xmm2, %xmm1 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm1 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx /* zero iff all 16 bytes are equal and non-null */ |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 /* new chunk becomes the previous one */ |
| |
| add $16, %r10 |
| jg L(nibble_ashr_7) /* cross page boundary */ |
| |
| /* Second unrolled step, identical to the first. */ |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 |
| |
| palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 |
| pcmpeqb %xmm2, %xmm1 |
| psubb %xmm0, %xmm1 |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 |
| jmp L(loop_ashr_7) |
| |
| /* |
| * Reached when the next %rdi load would start a new 4K page. Bytes 7..15 |
| * of %xmm3 (9 bytes) are still uncompared. If they contain the null |
| * char, or the strncmp limit ends within them, the next %rdi chunk is not |
| * needed and must not be loaded (the page may be unmapped); finish through |
| * the exit tail instead. %xmm0 is all zero on entry. |
| */ |
| .p2align 4 |
| L(nibble_ashr_7): |
| pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ |
| pmovmskb %xmm0, %edx |
| test $0xff80, %edx /* only bytes 7..15 are still pending */ |
| jnz L(ashr_7_exittail) |
| |
| #ifdef USE_AS_STRNCMP |
| /* All 9 pending bytes are in %xmm3: a remaining count of up to 9 needs no further %rdi load. */ |
| cmp $9, %r11 |
| jbe L(ashr_7_exittail) |
| #endif |
| |
| pxor %xmm0, %xmm0 |
| sub $0x1000, %r10 /* re-arm the page-crossing counter for the next page */ |
| jmp L(gobble_ashr_7) |
| |
| /* |
| * Exit tail: load the current %rsi chunk and shift the pending %rdi bytes |
| * (%xmm3) and their null-byte mask (%xmm0) down by 7 bytes so they line up |
| * with it, then continue at L(aftertail). The next %rdi chunk is not touched. |
| */ |
| .p2align 4 |
| L(ashr_7_exittail): |
| movdqa (%rsi, %rcx), %xmm1 |
| psrldq $7, %xmm0 |
| psrldq $7, %xmm3 |
| jmp L(aftertail) |
| |
| /* |
| * The following cases will be handled by ashr_8 |
| * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
| * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 |
| * |
| * The %rsi offset exceeds the %rdi offset by 8, so every aligned %rsi |
| * chunk is matched against the %rdi byte stream shifted by 8 bytes |
| * (palignr $8 over two consecutive %rdi chunks). |
| */ |
| .p2align 4 |
| L(ashr_8): |
| /* First, partial chunk: only bytes at or above the %rsi offset (%cl) take part. */ |
| pxor %xmm0, %xmm0 |
| movdqa (%rdi), %xmm2 |
| movdqa (%rsi), %xmm1 |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pslldq $8, %xmm2 /* shift %rdi chunk up to line up with the %rsi chunk */ |
| pcmpeqb %xmm1, %xmm2 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm2, %r9d |
| shr %cl, %edx /* 0xffff >> offset: expected mask */ |
| shr %cl, %r9d /* drop bits for bytes before the string start */ |
| sub %r9d, %edx |
| jnz L(less32bytes) /* mismatch or null char seen */ |
| movdqa (%rdi), %xmm3 /* previous %rdi chunk: low operand of palignr below */ |
| |
| UPDATE_STRNCMP_COUNTER |
| |
| pxor %xmm0, %xmm0 |
| mov $16, %rcx /* index for loads */ |
| mov $8, %r9d /* byte position left over from less32bytes case */ |
| /* |
| * Setup %r10 value allows us to detect crossing a page boundary. |
| * When %r10 goes positive we have crossed a page boundary and |
| * need to do a nibble. |
| */ |
| lea 8(%rdi), %r10 |
| and $0xfff, %r10 /* offset into 4K page */ |
| sub $0x1000, %r10 /* subtract 4K pagesize */ |
| |
| /* Main loop, two unrolled 16-byte steps; %xmm0 is all zero at the top of each step. */ |
| .p2align 4 |
| L(loop_ashr_8): |
| add $16, %r10 |
| jg L(nibble_ashr_8) /* cross page boundary */ |
| |
| L(gobble_ashr_8): |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 /* store for next cycle */ |
| |
| /* %xmm2 = bytes 8..15 of the previous %rdi chunk (%xmm3) followed by bytes 0..7 of the new one. */ |
| palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pcmpeqb %xmm2, %xmm1 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm1 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx /* zero iff all 16 bytes are equal and non-null */ |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 /* new chunk becomes the previous one */ |
| |
| add $16, %r10 |
| jg L(nibble_ashr_8) /* cross page boundary */ |
| |
| /* Second unrolled step, identical to the first. */ |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 |
| |
| palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 |
| pcmpeqb %xmm2, %xmm1 |
| psubb %xmm0, %xmm1 |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 |
| jmp L(loop_ashr_8) |
| |
| /* |
| * Reached when the next %rdi load would start a new 4K page. Bytes 8..15 |
| * of %xmm3 (8 bytes) are still uncompared. If they contain the null |
| * char, or the strncmp limit ends within them, the next %rdi chunk is not |
| * needed and must not be loaded (the page may be unmapped); finish through |
| * the exit tail instead. %xmm0 is all zero on entry. |
| */ |
| .p2align 4 |
| L(nibble_ashr_8): |
| pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ |
| pmovmskb %xmm0, %edx |
| test $0xff00, %edx /* only bytes 8..15 are still pending */ |
| jnz L(ashr_8_exittail) |
| |
| #ifdef USE_AS_STRNCMP |
| /* All 8 pending bytes are in %xmm3: a remaining count of up to 8 needs no further %rdi load. */ |
| cmp $8, %r11 |
| jbe L(ashr_8_exittail) |
| #endif |
| |
| pxor %xmm0, %xmm0 |
| sub $0x1000, %r10 /* re-arm the page-crossing counter for the next page */ |
| jmp L(gobble_ashr_8) |
| |
| /* |
| * Exit tail: load the current %rsi chunk and shift the pending %rdi bytes |
| * (%xmm3) and their null-byte mask (%xmm0) down by 8 bytes so they line up |
| * with it, then continue at L(aftertail). The next %rdi chunk is not touched. |
| */ |
| .p2align 4 |
| L(ashr_8_exittail): |
| movdqa (%rsi, %rcx), %xmm1 |
| psrldq $8, %xmm0 |
| psrldq $8, %xmm3 |
| jmp L(aftertail) |
| |
| /* |
| * The following cases will be handled by ashr_9 |
| * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
| * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 |
| * |
| * The %rsi offset exceeds the %rdi offset by 7, so every aligned %rsi |
| * chunk is matched against the %rdi byte stream shifted by 9 bytes |
| * (palignr $9 over two consecutive %rdi chunks). |
| */ |
| .p2align 4 |
| L(ashr_9): |
| /* First, partial chunk: only bytes at or above the %rsi offset (%cl) take part. */ |
| pxor %xmm0, %xmm0 |
| movdqa (%rdi), %xmm2 |
| movdqa (%rsi), %xmm1 |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pslldq $7, %xmm2 /* shift %rdi chunk up to line up with the %rsi chunk */ |
| pcmpeqb %xmm1, %xmm2 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm2, %r9d |
| shr %cl, %edx /* 0xffff >> offset: expected mask */ |
| shr %cl, %r9d /* drop bits for bytes before the string start */ |
| sub %r9d, %edx |
| jnz L(less32bytes) /* mismatch or null char seen */ |
| movdqa (%rdi), %xmm3 /* previous %rdi chunk: low operand of palignr below */ |
| |
| UPDATE_STRNCMP_COUNTER |
| |
| pxor %xmm0, %xmm0 |
| mov $16, %rcx /* index for loads */ |
| mov $9, %r9d /* byte position left over from less32bytes case */ |
| /* |
| * Setup %r10 value allows us to detect crossing a page boundary. |
| * When %r10 goes positive we have crossed a page boundary and |
| * need to do a nibble. |
| */ |
| lea 9(%rdi), %r10 |
| and $0xfff, %r10 /* offset into 4K page */ |
| sub $0x1000, %r10 /* subtract 4K pagesize */ |
| |
| /* Main loop, two unrolled 16-byte steps; %xmm0 is all zero at the top of each step. */ |
| .p2align 4 |
| L(loop_ashr_9): |
| add $16, %r10 |
| jg L(nibble_ashr_9) /* cross page boundary */ |
| |
| L(gobble_ashr_9): |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 /* store for next cycle */ |
| |
| /* %xmm2 = bytes 9..15 of the previous %rdi chunk (%xmm3) followed by bytes 0..8 of the new one. */ |
| palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pcmpeqb %xmm2, %xmm1 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm1 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx /* zero iff all 16 bytes are equal and non-null */ |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 /* new chunk becomes the previous one */ |
| |
| add $16, %r10 |
| jg L(nibble_ashr_9) /* cross page boundary */ |
| |
| /* Second unrolled step, identical to the first. */ |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 |
| |
| palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 |
| pcmpeqb %xmm2, %xmm1 |
| psubb %xmm0, %xmm1 |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 /* store for next cycle */ |
| jmp L(loop_ashr_9) |
| |
| /* |
| * Reached when the next %rdi load would start a new 4K page. Bytes 9..15 |
| * of %xmm3 (7 bytes) are still uncompared. If they contain the null |
| * char, or the strncmp limit ends within them, the next %rdi chunk is not |
| * needed and must not be loaded (the page may be unmapped); finish through |
| * the exit tail instead. %xmm0 is all zero on entry. |
| */ |
| .p2align 4 |
| L(nibble_ashr_9): |
| pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ |
| pmovmskb %xmm0, %edx |
| test $0xfe00, %edx /* only bytes 9..15 are still pending */ |
| jnz L(ashr_9_exittail) |
| |
| #ifdef USE_AS_STRNCMP |
| /* All 7 pending bytes are in %xmm3: a remaining count of up to 7 needs no further %rdi load. */ |
| cmp $7, %r11 |
| jbe L(ashr_9_exittail) |
| #endif |
| |
| pxor %xmm0, %xmm0 |
| sub $0x1000, %r10 /* re-arm the page-crossing counter for the next page */ |
| jmp L(gobble_ashr_9) |
| |
| /* |
| * Exit tail: load the current %rsi chunk and shift the pending %rdi bytes |
| * (%xmm3) and their null-byte mask (%xmm0) down by 9 bytes so they line up |
| * with it, then continue at L(aftertail). The next %rdi chunk is not touched. |
| */ |
| .p2align 4 |
| L(ashr_9_exittail): |
| movdqa (%rsi, %rcx), %xmm1 |
| psrldq $9, %xmm0 |
| psrldq $9, %xmm3 |
| jmp L(aftertail) |
| |
| /* |
| * The following cases will be handled by ashr_10 |
| * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
| * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 |
| * |
| * The %rsi offset exceeds the %rdi offset by 6, so every aligned %rsi |
| * chunk is matched against the %rdi byte stream shifted by 10 bytes |
| * (palignr $10 over two consecutive %rdi chunks). |
| */ |
| .p2align 4 |
| L(ashr_10): |
| /* First, partial chunk: only bytes at or above the %rsi offset (%cl) take part. */ |
| pxor %xmm0, %xmm0 |
| movdqa (%rdi), %xmm2 |
| movdqa (%rsi), %xmm1 |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pslldq $6, %xmm2 /* shift %rdi chunk up to line up with the %rsi chunk */ |
| pcmpeqb %xmm1, %xmm2 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm2, %r9d |
| shr %cl, %edx /* 0xffff >> offset: expected mask */ |
| shr %cl, %r9d /* drop bits for bytes before the string start */ |
| sub %r9d, %edx |
| jnz L(less32bytes) /* mismatch or null char seen */ |
| movdqa (%rdi), %xmm3 /* previous %rdi chunk: low operand of palignr below */ |
| |
| UPDATE_STRNCMP_COUNTER |
| |
| pxor %xmm0, %xmm0 |
| mov $16, %rcx /* index for loads */ |
| mov $10, %r9d /* byte position left over from less32bytes case */ |
| /* |
| * Setup %r10 value allows us to detect crossing a page boundary. |
| * When %r10 goes positive we have crossed a page boundary and |
| * need to do a nibble. |
| */ |
| lea 10(%rdi), %r10 |
| and $0xfff, %r10 /* offset into 4K page */ |
| sub $0x1000, %r10 /* subtract 4K pagesize */ |
| |
| /* Main loop, two unrolled 16-byte steps; %xmm0 is all zero at the top of each step. */ |
| .p2align 4 |
| L(loop_ashr_10): |
| add $16, %r10 |
| jg L(nibble_ashr_10) /* cross page boundary */ |
| |
| L(gobble_ashr_10): |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 /* store for next cycle */ |
| |
| /* %xmm2 = bytes 10..15 of the previous %rdi chunk (%xmm3) followed by bytes 0..9 of the new one. */ |
| palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 /* null-byte mask of the %rsi chunk */ |
| pcmpeqb %xmm2, %xmm1 /* byte-wise equality mask */ |
| psubb %xmm0, %xmm1 /* top bit stays set only for equal, non-null bytes */ |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx /* zero iff all 16 bytes are equal and non-null */ |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 /* new chunk becomes the previous one */ |
| |
| add $16, %r10 |
| jg L(nibble_ashr_10) /* cross page boundary */ |
| |
| /* Second unrolled step, identical to the first. */ |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 |
| |
| palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 |
| pcmpeqb %xmm2, %xmm1 |
| psubb %xmm0, %xmm1 |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx |
| jnz L(exit) /* mismatch or null char seen */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) /* byte limit used up */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 |
| jmp L(loop_ashr_10) |
| |
| /* |
| * Reached when the next %rdi load would start a new 4K page. Bytes 10..15 |
| * of %xmm3 (6 bytes) are still uncompared. If they contain the null |
| * char, or the strncmp limit ends within them, the next %rdi chunk is not |
| * needed and must not be loaded (the page may be unmapped); finish through |
| * the exit tail instead. %xmm0 is all zero on entry. |
| */ |
| .p2align 4 |
| L(nibble_ashr_10): |
| pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ |
| pmovmskb %xmm0, %edx |
| test $0xfc00, %edx /* only bytes 10..15 are still pending */ |
| jnz L(ashr_10_exittail) |
| |
| #ifdef USE_AS_STRNCMP |
| /* All 6 pending bytes are in %xmm3: a remaining count of up to 6 needs no further %rdi load. */ |
| cmp $6, %r11 |
| jbe L(ashr_10_exittail) |
| #endif |
| |
| pxor %xmm0, %xmm0 |
| sub $0x1000, %r10 /* re-arm the page-crossing counter for the next page */ |
| jmp L(gobble_ashr_10) |
| |
| .p2align 4 |
| L(ashr_10_exittail): |
| movdqa (%rsi, %rcx), %xmm1 |
| psrldq $10, %xmm0 |
| psrldq $10, %xmm3 |
| jmp L(aftertail) |
| |
| /* |
| * The following cases will be handled by ashr_11 |
| * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
| * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 |
| * |
| * Overview of the code below: |
| * 1. First chunk: the 16 bytes at (%rdi) are shifted left by 5 bytes and |
| * compared with the 16 bytes at (%rsi). The resulting mask and %edx |
| * are both shifted right by %cl and then compared. %edx is prepared |
| * by the code that jumps here, which is not part of this block. |
| * 2. Main loop (unrolled twice): each step loads 16 bytes from both |
| * sides at index %rcx, rebuilds the %rdi-side data with palignr $11 |
| * from the previous chunk (%xmm3) and the new one, and compares it |
| * with the %rsi-side chunk. |
| * 3. %r10 = ((%rdi + 11) & 0xfff) - 0x1000 is advanced by 16 per step. |
| * Once it becomes positive, L(nibble_ashr_11) inspects the 5 pending |
| * bytes of %xmm3 before the next chunk is loaded. |
| * All loads use movdqa, so (%rdi) and (%rsi) must be 16-byte aligned here. |
| * With USE_AS_STRNCMP, %r11 is the remaining byte count. |
| */ |
| .p2align 4 |
| L(ashr_11): |
| pxor %xmm0, %xmm0 /* xmm0 = 0, reference for the null-byte test */ |
| movdqa (%rdi), %xmm2 |
| movdqa (%rsi), %xmm1 |
| pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff for every null byte in xmm1 */ |
| pslldq $5, %xmm2 /* shift the (%rdi) chunk left by 5 bytes */ |
| pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff for every matching byte */ |
| psubb %xmm0, %xmm2 /* drop the match flag where xmm1 has a null byte */ |
| pmovmskb %xmm2, %r9d /* r9d: one bit per byte, set if matching and not null */ |
| shr %cl, %edx /* discard the low %cl bits of both masks */ |
| shr %cl, %r9d |
| sub %r9d, %edx |
| jnz L(less32bytes) /* masks differ: finish in the common exit code */ |
| movdqa (%rdi), %xmm3 /* xmm3 = previous %rdi-side chunk for palignr */ |
| |
| UPDATE_STRNCMP_COUNTER |
| |
| pxor %xmm0, %xmm0 |
| mov $16, %rcx /* index for loads */ |
| mov $11, %r9d /* byte position left over from less32bytes case */ |
| /* |
| * Set up %r10 so that we can detect crossing a page boundary. |
| * When %r10 goes positive we have crossed a page boundary and |
| * need to do a nibble. |
| */ |
| lea 11(%rdi), %r10 |
| and $0xfff, %r10 /* offset into 4K page */ |
| sub $0x1000, %r10 /* subtract 4K pagesize */ |
| |
| .p2align 4 |
| L(loop_ashr_11): |
| add $16, %r10 |
| jg L(nibble_ashr_11) /* %r10 > 0: check the pending bytes first */ |
| |
| L(gobble_ashr_11): |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 /* keep the unshifted chunk for the next step */ |
| |
| palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 /* xmm0 is all-zero on entry: null-byte mask of xmm1 */ |
| pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff for every matching byte */ |
| psubb %xmm0, %xmm1 /* drop the match flag where a null byte was found */ |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx /* zero only if all 16 bytes match and none is null */ |
| jnz L(exit) |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 /* 16 more bytes done */ |
| jbe L(strcmp_exitz) /* count exhausted: return 0 */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 /* current chunk becomes the previous chunk */ |
| |
| add $16, %r10 |
| jg L(nibble_ashr_11) /* cross page boundary */ |
| |
| movdqa (%rsi, %rcx), %xmm1 /* second copy of the unrolled step */ |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 |
| |
| palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 |
| pcmpeqb %xmm2, %xmm1 |
| psubb %xmm0, %xmm1 |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx |
| jnz L(exit) |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 |
| jmp L(loop_ashr_11) |
| |
| .p2align 4 |
| L(nibble_ashr_11): |
| pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ |
| pmovmskb %xmm0, %edx |
| test $0xf800, %edx /* bits 11-15: the 5 bytes of xmm3 not compared yet */ |
| jnz L(ashr_11_exittail) |
| |
| #ifdef USE_AS_STRNCMP |
| cmp $4, %r11 |
| jbe L(ashr_11_exittail) /* at most 4 bytes left: finish with the tail */ |
| #endif |
| |
| pxor %xmm0, %xmm0 /* restore the all-zero reference */ |
| sub $0x1000, %r10 /* re-arm the detector for the next 4K page */ |
| jmp L(gobble_ashr_11) |
| |
| .p2align 4 |
| L(ashr_11_exittail): |
| movdqa (%rsi, %rcx), %xmm1 |
| psrldq $11, %xmm0 /* move bytes 11-15 down to bytes 0-4 */ |
| psrldq $11, %xmm3 |
| jmp L(aftertail) |
| |
| /* |
| * The following cases will be handled by ashr_12 |
| * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
| * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 |
| * |
| * Overview of the code below: |
| * 1. First chunk: the 16 bytes at (%rdi) are shifted left by 4 bytes and |
| * compared with the 16 bytes at (%rsi). The resulting mask and %edx |
| * are both shifted right by %cl and then compared. %edx is prepared |
| * by the code that jumps here, which is not part of this block. |
| * 2. Main loop (unrolled twice): each step loads 16 bytes from both |
| * sides at index %rcx, rebuilds the %rdi-side data with palignr $12 |
| * from the previous chunk (%xmm3) and the new one, and compares it |
| * with the %rsi-side chunk. |
| * 3. %r10 = ((%rdi + 12) & 0xfff) - 0x1000 is advanced by 16 per step. |
| * Once it becomes positive, L(nibble_ashr_12) inspects the 4 pending |
| * bytes of %xmm3 before the next chunk is loaded. |
| * All loads use movdqa, so (%rdi) and (%rsi) must be 16-byte aligned here. |
| * With USE_AS_STRNCMP, %r11 is the remaining byte count. |
| */ |
| .p2align 4 |
| L(ashr_12): |
| pxor %xmm0, %xmm0 /* xmm0 = 0, reference for the null-byte test */ |
| movdqa (%rdi), %xmm2 |
| movdqa (%rsi), %xmm1 |
| pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff for every null byte in xmm1 */ |
| pslldq $4, %xmm2 /* shift the (%rdi) chunk left by 4 bytes */ |
| pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff for every matching byte */ |
| psubb %xmm0, %xmm2 /* drop the match flag where xmm1 has a null byte */ |
| pmovmskb %xmm2, %r9d /* r9d: one bit per byte, set if matching and not null */ |
| shr %cl, %edx /* discard the low %cl bits of both masks */ |
| shr %cl, %r9d |
| sub %r9d, %edx |
| jnz L(less32bytes) /* masks differ: finish in the common exit code */ |
| movdqa (%rdi), %xmm3 /* xmm3 = previous %rdi-side chunk for palignr */ |
| |
| UPDATE_STRNCMP_COUNTER |
| |
| pxor %xmm0, %xmm0 |
| mov $16, %rcx /* index for loads */ |
| mov $12, %r9d /* byte position left over from less32bytes case */ |
| /* |
| * Set up %r10 so that we can detect crossing a page boundary. |
| * When %r10 goes positive we have crossed a page boundary and |
| * need to do a nibble. |
| */ |
| lea 12(%rdi), %r10 |
| and $0xfff, %r10 /* offset into 4K page */ |
| sub $0x1000, %r10 /* subtract 4K pagesize */ |
| |
| .p2align 4 |
| L(loop_ashr_12): |
| add $16, %r10 |
| jg L(nibble_ashr_12) /* %r10 > 0: check the pending bytes first */ |
| |
| L(gobble_ashr_12): |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 /* keep the unshifted chunk for the next step */ |
| |
| palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 /* xmm0 is all-zero on entry: null-byte mask of xmm1 */ |
| pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff for every matching byte */ |
| psubb %xmm0, %xmm1 /* drop the match flag where a null byte was found */ |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx /* zero only if all 16 bytes match and none is null */ |
| jnz L(exit) |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 /* 16 more bytes done */ |
| jbe L(strcmp_exitz) /* count exhausted: return 0 */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 /* current chunk becomes the previous chunk */ |
| |
| add $16, %r10 |
| jg L(nibble_ashr_12) /* cross page boundary */ |
| |
| movdqa (%rsi, %rcx), %xmm1 /* second copy of the unrolled step */ |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 |
| |
| palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 |
| pcmpeqb %xmm2, %xmm1 |
| psubb %xmm0, %xmm1 |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx |
| jnz L(exit) |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 |
| jmp L(loop_ashr_12) |
| |
| .p2align 4 |
| L(nibble_ashr_12): |
| pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ |
| pmovmskb %xmm0, %edx |
| test $0xf000, %edx /* bits 12-15: the 4 bytes of xmm3 not compared yet */ |
| jnz L(ashr_12_exittail) |
| |
| #ifdef USE_AS_STRNCMP |
| cmp $3, %r11 |
| jbe L(ashr_12_exittail) /* at most 3 bytes left: finish with the tail */ |
| #endif |
| |
| pxor %xmm0, %xmm0 /* restore the all-zero reference */ |
| sub $0x1000, %r10 /* re-arm the detector for the next 4K page */ |
| jmp L(gobble_ashr_12) |
| |
| .p2align 4 |
| L(ashr_12_exittail): |
| movdqa (%rsi, %rcx), %xmm1 |
| psrldq $12, %xmm0 /* move bytes 12-15 down to bytes 0-3 */ |
| psrldq $12, %xmm3 |
| jmp L(aftertail) |
| |
| /* |
| * The following cases will be handled by ashr_13 |
| * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
| * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 |
| * |
| * Overview of the code below: |
| * 1. First chunk: the 16 bytes at (%rdi) are shifted left by 3 bytes and |
| * compared with the 16 bytes at (%rsi). The resulting mask and %edx |
| * are both shifted right by %cl and then compared. %edx is prepared |
| * by the code that jumps here, which is not part of this block. |
| * 2. Main loop (unrolled twice): each step loads 16 bytes from both |
| * sides at index %rcx, rebuilds the %rdi-side data with palignr $13 |
| * from the previous chunk (%xmm3) and the new one, and compares it |
| * with the %rsi-side chunk. |
| * 3. %r10 = ((%rdi + 13) & 0xfff) - 0x1000 is advanced by 16 per step. |
| * Once it becomes positive, L(nibble_ashr_13) inspects the 3 pending |
| * bytes of %xmm3 before the next chunk is loaded. |
| * All loads use movdqa, so (%rdi) and (%rsi) must be 16-byte aligned here. |
| * With USE_AS_STRNCMP, %r11 is the remaining byte count. |
| */ |
| .p2align 4 |
| L(ashr_13): |
| pxor %xmm0, %xmm0 /* xmm0 = 0, reference for the null-byte test */ |
| movdqa (%rdi), %xmm2 |
| movdqa (%rsi), %xmm1 |
| pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff for every null byte in xmm1 */ |
| pslldq $3, %xmm2 /* shift the (%rdi) chunk left by 3 bytes */ |
| pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff for every matching byte */ |
| psubb %xmm0, %xmm2 /* drop the match flag where xmm1 has a null byte */ |
| pmovmskb %xmm2, %r9d /* r9d: one bit per byte, set if matching and not null */ |
| shr %cl, %edx /* discard the low %cl bits of both masks */ |
| shr %cl, %r9d |
| sub %r9d, %edx |
| jnz L(less32bytes) /* masks differ: finish in the common exit code */ |
| movdqa (%rdi), %xmm3 /* xmm3 = previous %rdi-side chunk for palignr */ |
| |
| UPDATE_STRNCMP_COUNTER |
| |
| pxor %xmm0, %xmm0 |
| mov $16, %rcx /* index for loads */ |
| mov $13, %r9d /* byte position left over from less32bytes case */ |
| /* |
| * Set up %r10 so that we can detect crossing a page boundary. |
| * When %r10 goes positive we have crossed a page boundary and |
| * need to do a nibble. |
| */ |
| lea 13(%rdi), %r10 |
| and $0xfff, %r10 /* offset into 4K page */ |
| sub $0x1000, %r10 /* subtract 4K pagesize */ |
| |
| .p2align 4 |
| L(loop_ashr_13): |
| add $16, %r10 |
| jg L(nibble_ashr_13) /* %r10 > 0: check the pending bytes first */ |
| |
| L(gobble_ashr_13): |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 /* keep the unshifted chunk for the next step */ |
| |
| palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 /* xmm0 is all-zero on entry: null-byte mask of xmm1 */ |
| pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff for every matching byte */ |
| psubb %xmm0, %xmm1 /* drop the match flag where a null byte was found */ |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx /* zero only if all 16 bytes match and none is null */ |
| jnz L(exit) |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 /* 16 more bytes done */ |
| jbe L(strcmp_exitz) /* count exhausted: return 0 */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 /* current chunk becomes the previous chunk */ |
| |
| add $16, %r10 |
| jg L(nibble_ashr_13) /* cross page boundary */ |
| |
| movdqa (%rsi, %rcx), %xmm1 /* second copy of the unrolled step */ |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 |
| |
| palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 |
| pcmpeqb %xmm2, %xmm1 |
| psubb %xmm0, %xmm1 |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx |
| jnz L(exit) |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 |
| jmp L(loop_ashr_13) |
| |
| .p2align 4 |
| L(nibble_ashr_13): |
| pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ |
| pmovmskb %xmm0, %edx |
| test $0xe000, %edx /* bits 13-15: the 3 bytes of xmm3 not compared yet */ |
| jnz L(ashr_13_exittail) |
| |
| #ifdef USE_AS_STRNCMP |
| cmp $2, %r11 |
| jbe L(ashr_13_exittail) /* at most 2 bytes left: finish with the tail */ |
| #endif |
| |
| pxor %xmm0, %xmm0 /* restore the all-zero reference */ |
| sub $0x1000, %r10 /* re-arm the detector for the next 4K page */ |
| jmp L(gobble_ashr_13) |
| |
| .p2align 4 |
| L(ashr_13_exittail): |
| movdqa (%rsi, %rcx), %xmm1 |
| psrldq $13, %xmm0 /* move bytes 13-15 down to bytes 0-2 */ |
| psrldq $13, %xmm3 |
| jmp L(aftertail) |
| |
| /* |
| * The following cases will be handled by ashr_14 |
| * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
| * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 |
| * |
| * Overview of the code below: |
| * 1. First chunk: the 16 bytes at (%rdi) are shifted left by 2 bytes and |
| * compared with the 16 bytes at (%rsi). The resulting mask and %edx |
| * are both shifted right by %cl and then compared. %edx is prepared |
| * by the code that jumps here, which is not part of this block. |
| * 2. Main loop (unrolled twice): each step loads 16 bytes from both |
| * sides at index %rcx, rebuilds the %rdi-side data with palignr $14 |
| * from the previous chunk (%xmm3) and the new one, and compares it |
| * with the %rsi-side chunk. |
| * 3. %r10 = ((%rdi + 14) & 0xfff) - 0x1000 is advanced by 16 per step. |
| * Once it becomes positive, L(nibble_ashr_14) inspects the 2 pending |
| * bytes of %xmm3 before the next chunk is loaded. |
| * All loads use movdqa, so (%rdi) and (%rsi) must be 16-byte aligned here. |
| * With USE_AS_STRNCMP, %r11 is the remaining byte count. |
| */ |
| .p2align 4 |
| L(ashr_14): |
| pxor %xmm0, %xmm0 /* xmm0 = 0, reference for the null-byte test */ |
| movdqa (%rdi), %xmm2 |
| movdqa (%rsi), %xmm1 |
| pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff for every null byte in xmm1 */ |
| pslldq $2, %xmm2 /* shift the (%rdi) chunk left by 2 bytes */ |
| pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff for every matching byte */ |
| psubb %xmm0, %xmm2 /* drop the match flag where xmm1 has a null byte */ |
| pmovmskb %xmm2, %r9d /* r9d: one bit per byte, set if matching and not null */ |
| shr %cl, %edx /* discard the low %cl bits of both masks */ |
| shr %cl, %r9d |
| sub %r9d, %edx |
| jnz L(less32bytes) /* masks differ: finish in the common exit code */ |
| movdqa (%rdi), %xmm3 /* xmm3 = previous %rdi-side chunk for palignr */ |
| |
| UPDATE_STRNCMP_COUNTER |
| |
| pxor %xmm0, %xmm0 |
| mov $16, %rcx /* index for loads */ |
| mov $14, %r9d /* byte position left over from less32bytes case */ |
| /* |
| * Set up %r10 so that we can detect crossing a page boundary. |
| * When %r10 goes positive we have crossed a page boundary and |
| * need to do a nibble. |
| */ |
| lea 14(%rdi), %r10 |
| and $0xfff, %r10 /* offset into 4K page */ |
| sub $0x1000, %r10 /* subtract 4K pagesize */ |
| |
| .p2align 4 |
| L(loop_ashr_14): |
| add $16, %r10 |
| jg L(nibble_ashr_14) /* %r10 > 0: check the pending bytes first */ |
| |
| L(gobble_ashr_14): |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 /* keep the unshifted chunk for the next step */ |
| |
| palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 /* xmm0 is all-zero on entry: null-byte mask of xmm1 */ |
| pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff for every matching byte */ |
| psubb %xmm0, %xmm1 /* drop the match flag where a null byte was found */ |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx /* zero only if all 16 bytes match and none is null */ |
| jnz L(exit) |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 /* 16 more bytes done */ |
| jbe L(strcmp_exitz) /* count exhausted: return 0 */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 /* current chunk becomes the previous chunk */ |
| |
| add $16, %r10 |
| jg L(nibble_ashr_14) /* cross page boundary */ |
| |
| movdqa (%rsi, %rcx), %xmm1 /* second copy of the unrolled step */ |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 |
| |
| palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 |
| pcmpeqb %xmm2, %xmm1 |
| psubb %xmm0, %xmm1 |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx |
| jnz L(exit) |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 |
| jmp L(loop_ashr_14) |
| |
| .p2align 4 |
| L(nibble_ashr_14): |
| pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ |
| pmovmskb %xmm0, %edx |
| test $0xc000, %edx /* bits 14-15: the 2 bytes of xmm3 not compared yet */ |
| jnz L(ashr_14_exittail) |
| |
| #ifdef USE_AS_STRNCMP |
| cmp $1, %r11 |
| jbe L(ashr_14_exittail) /* at most 1 byte left: finish with the tail */ |
| #endif |
| |
| pxor %xmm0, %xmm0 /* restore the all-zero reference */ |
| sub $0x1000, %r10 /* re-arm the detector for the next 4K page */ |
| jmp L(gobble_ashr_14) |
| |
| .p2align 4 |
| L(ashr_14_exittail): |
| movdqa (%rsi, %rcx), %xmm1 |
| psrldq $14, %xmm0 /* move bytes 14-15 down to bytes 0-1 */ |
| psrldq $14, %xmm3 |
| jmp L(aftertail) |
| |
| /* |
| * The following cases will be handled by ashr_15 |
| * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case |
| * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 |
| * |
| * Overview of the code below: |
| * 1. First chunk: the 16 bytes at (%rdi) are shifted left by 1 byte and |
| * compared with the 16 bytes at (%rsi). The resulting mask and %edx |
| * are both shifted right by %cl and then compared. %edx is prepared |
| * by the code that jumps here, which is not part of this block. |
| * 2. Main loop (unrolled twice): each step loads 16 bytes from both |
| * sides at index %rcx, rebuilds the %rdi-side data with palignr $15 |
| * from the previous chunk (%xmm3) and the new one, and compares it |
| * with the %rsi-side chunk. |
| * 3. %r10 = ((%rdi + 15) & 0xfff) - 0x1000 is advanced by 16 per step. |
| * Once it becomes positive, L(nibble_ashr_15) inspects the single |
| * pending byte of %xmm3 before the next chunk is loaded. |
| * All loads use movdqa, so (%rdi) and (%rsi) must be 16-byte aligned here. |
| * With USE_AS_STRNCMP, %r11 is the remaining byte count. |
| * L(ashr_15_exittail) has no jump at its end: execution continues with |
| * the code that follows this block. |
| */ |
| .p2align 4 |
| L(ashr_15): |
| pxor %xmm0, %xmm0 /* xmm0 = 0, reference for the null-byte test */ |
| movdqa (%rdi), %xmm2 |
| movdqa (%rsi), %xmm1 |
| pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff for every null byte in xmm1 */ |
| pslldq $1, %xmm2 /* shift the (%rdi) chunk left by 1 byte */ |
| pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff for every matching byte */ |
| psubb %xmm0, %xmm2 /* drop the match flag where xmm1 has a null byte */ |
| pmovmskb %xmm2, %r9d /* r9d: one bit per byte, set if matching and not null */ |
| shr %cl, %edx /* discard the low %cl bits of both masks */ |
| shr %cl, %r9d |
| sub %r9d, %edx |
| jnz L(less32bytes) /* masks differ: finish in the common exit code */ |
| |
| movdqa (%rdi), %xmm3 /* xmm3 = previous %rdi-side chunk for palignr */ |
| |
| UPDATE_STRNCMP_COUNTER |
| |
| pxor %xmm0, %xmm0 |
| mov $16, %rcx /* index for loads */ |
| mov $15, %r9d /* byte position left over from less32bytes case */ |
| /* |
| * Set up %r10 so that we can detect crossing a page boundary. |
| * When %r10 goes positive we have crossed a page boundary and |
| * need to do a nibble. |
| */ |
| lea 15(%rdi), %r10 |
| and $0xfff, %r10 /* offset into 4K page */ |
| |
| sub $0x1000, %r10 /* subtract 4K pagesize */ |
| |
| .p2align 4 |
| L(loop_ashr_15): |
| add $16, %r10 |
| jg L(nibble_ashr_15) /* %r10 > 0: check the pending byte first */ |
| |
| L(gobble_ashr_15): |
| movdqa (%rsi, %rcx), %xmm1 |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 /* keep the unshifted chunk for the next step */ |
| |
| palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 /* xmm0 is all-zero on entry: null-byte mask of xmm1 */ |
| pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff for every matching byte */ |
| psubb %xmm0, %xmm1 /* drop the match flag where a null byte was found */ |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx /* zero only if all 16 bytes match and none is null */ |
| jnz L(exit) |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 /* 16 more bytes done */ |
| jbe L(strcmp_exitz) /* count exhausted: return 0 */ |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 /* current chunk becomes the previous chunk */ |
| |
| add $16, %r10 |
| jg L(nibble_ashr_15) /* cross page boundary */ |
| |
| movdqa (%rsi, %rcx), %xmm1 /* second copy of the unrolled step */ |
| movdqa (%rdi, %rcx), %xmm2 |
| movdqa %xmm2, %xmm4 |
| |
| palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ |
| |
| pcmpeqb %xmm1, %xmm0 |
| pcmpeqb %xmm2, %xmm1 |
| psubb %xmm0, %xmm1 |
| pmovmskb %xmm1, %edx |
| sub $0xffff, %edx |
| jnz L(exit) |
| |
| #ifdef USE_AS_STRNCMP |
| sub $16, %r11 |
| jbe L(strcmp_exitz) |
| #endif |
| |
| add $16, %rcx |
| movdqa %xmm4, %xmm3 |
| jmp L(loop_ashr_15) |
| |
| .p2align 4 |
| L(nibble_ashr_15): |
| pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ |
| pmovmskb %xmm0, %edx |
| test $0x8000, %edx /* bit 15: the one byte of xmm3 not compared yet */ |
| jnz L(ashr_15_exittail) |
| |
| #ifdef USE_AS_STRNCMP |
| test %r11, %r11 |
| je L(ashr_15_exittail) /* nothing left to compare: finish with the tail */ |
| #endif |
| |
| pxor %xmm0, %xmm0 /* restore the all-zero reference */ |
| sub $0x1000, %r10 /* re-arm the detector for the next 4K page */ |
| jmp L(gobble_ashr_15) |
| |
| .p2align 4 |
| L(ashr_15_exittail): |
| movdqa (%rsi, %rcx), %xmm1 |
| psrldq $15, %xmm3 /* move byte 15 down to byte 0 */ |
| psrldq $15, %xmm0 |
| |
| .p2align 4 /* |
| * Common exit code. The labels below fall through into one another, so |
| * their order matters. |
| * L(aftertail): compare the shifted tail in %xmm3 with %xmm1, drop the |
| * match flag where %xmm0 marks a null byte, and invert the mask in %edx. |
| * L(exit): %rax = %r9 + %rcx - 16. |
| * L(less32bytes): add %rax to %rdi and %rcx to %rsi, then swap the two |
| * pointers when %r8d is non-zero. |
| * L(ret) / L(less16bytes): take the index of the lowest set bit of %rdx |
| * and return the byte at (%rdi + index) minus the byte at (%rsi + index). |
| * With USE_AS_STRNCMP, return 0 instead when that index is not below |
| * the remaining count in %r11. |
| * L(strcmp_exitz): return 0. |
| * L(Byte0): return the byte at (%rdi) minus the byte at (%rsi). |
| */ |
| L(aftertail): |
| pcmpeqb %xmm3, %xmm1 /* xmm1 = 0xff for every matching byte */ |
| psubb %xmm0, %xmm1 /* drop the match flag where xmm0 marks a null byte */ |
| pmovmskb %xmm1, %edx |
| not %edx /* set bits now mark a mismatch or a null byte */ |
| |
| .p2align 4 |
| L(exit): |
| lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */ |
| L(less32bytes): |
| lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ |
| lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ |
| test %r8d, %r8d |
| jz L(ret) |
| xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ |
| |
| .p2align 4 |
| L(ret): |
| L(less16bytes): |
| bsf %rdx, %rdx /* find and store bit index in %rdx */ |
| |
| #ifdef USE_AS_STRNCMP |
| sub %rdx, %r11 |
| jbe L(strcmp_exitz) /* index is at or beyond the remaining count: return 0 */ |
| #endif |
| movzbl (%rsi, %rdx), %ecx /* zero-extended bytes at the found index */ |
| movzbl (%rdi, %rdx), %eax |
| |
| sub %ecx, %eax /* return value: %rdi byte minus %rsi byte */ |
| ret |
| |
| L(strcmp_exitz): |
| xor %eax, %eax /* return 0 */ |
| ret |
| |
| .p2align 4 |
| L(Byte0): |
| movzbl (%rsi), %ecx |
| movzbl (%rdi), %eax |
| |
| sub %ecx, %eax /* return value: first %rdi byte minus first %rsi byte */ |
| ret |
| END (STRCMP) |
| |
| .section .rodata,"a",@progbits |
| .p2align 3 /* |
| * Table of sixteen 32-bit entries. Each entry is the distance from |
| * L(unaligned_table) to one of the L(ashr_N) labels, so adding an entry |
| * to the address of the table gives the address of that label. |
| * Entries 0-14 refer to L(ashr_1) .. L(ashr_15); entry 15 refers to |
| * L(ashr_0). |
| * NOTE(review): the code that indexes this table is not visible here; |
| * presumably the index is derived from the relative alignment of the |
| * two strings - confirm against the dispatch code. |
| */ |
| L(unaligned_table): |
| .int L(ashr_1) - L(unaligned_table) |
| .int L(ashr_2) - L(unaligned_table) |
| .int L(ashr_3) - L(unaligned_table) |
| .int L(ashr_4) - L(unaligned_table) |
| .int L(ashr_5) - L(unaligned_table) |
| .int L(ashr_6) - L(unaligned_table) |
| .int L(ashr_7) - L(unaligned_table) |
| .int L(ashr_8) - L(unaligned_table) |
| .int L(ashr_9) - L(unaligned_table) |
| .int L(ashr_10) - L(unaligned_table) |
| .int L(ashr_11) - L(unaligned_table) |
| .int L(ashr_12) - L(unaligned_table) |
| .int L(ashr_13) - L(unaligned_table) |
| .int L(ashr_14) - L(unaligned_table) |
| .int L(ashr_15) - L(unaligned_table) |
| .int L(ashr_0) - L(unaligned_table) /* entry 15: L(ashr_0) comes last */ |