/* salsa20-amd64.S - AMD64 implementation of Salsa20
 *
 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * Based on the public domain implementation by D. J. Bernstein at
 * http://cr.yp.to/snuffle.html
 */
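
/*
 * Note on the C interface: the exported functions follow the SysV AMD64
 * calling convention.  Judging from the register usage below, the C-level
 * prototypes are presumably along these lines (the authoritative
 * declarations live in salsa20.c; the types here are only illustrative):
 *
 *   void _gcry_salsa20_amd64_keysetup (u32 *ctx, const void *key,
 *                                      int keybits);
 *   void _gcry_salsa20_amd64_ivsetup (u32 *ctx, const void *iv);
 *   unsigned int _gcry_salsa20_amd64_encrypt_blocks (u32 *ctx,
 *                                                    const void *src,
 *                                                    void *dst,
 *                                                    size_t nblks,
 *                                                    unsigned int rounds);
 *
 * i.e. %rdi = context, %rsi = key/IV/source and, for encrypt_blocks,
 * %rdx = destination, %rcx = number of 64-byte blocks and %r8 = round
 * count.  encrypt_blocks appears to return, in %rax, the number of stack
 * bytes it used, presumably so that the caller can wipe that area.
 */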

#ifdef __x86_64
#include <config.h>
#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && defined(USE_SALSA20)

.text

.align 8
.globl _gcry_salsa20_amd64_keysetup
.type _gcry_salsa20_amd64_keysetup,@function;
_gcry_salsa20_amd64_keysetup:
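/*
 * Key setup: copy the key words and the "expand XX-byte k" constants into
 * the 64-byte context at %rdi.  The words are stored at implementation-
 * specific offsets rather than in the canonical Salsa20 matrix order;
 * encrypt_blocks below expects exactly this layout.
 * In: %rdi = context, %rsi = key, %rdx = key size in bits (128 or 256).
 */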
movl 0(%rsi),%r8d
movl 4(%rsi),%r9d
movl 8(%rsi),%eax
movl 12(%rsi),%r10d
movl %r8d,20(%rdi)
movl %r9d,40(%rdi)
movl %eax,60(%rdi)
movl %r10d,48(%rdi)
cmp $256,%rdx
jb .L_kbits128
.L_kbits256:
movl 16(%rsi),%edx
movl 20(%rsi),%ecx
movl 24(%rsi),%r8d
movl 28(%rsi),%esi
movl %edx,28(%rdi)
movl %ecx,16(%rdi)
movl %r8d,36(%rdi)
movl %esi,56(%rdi)
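/* Constants for 256-bit keys: these four words spell "expand 32-byte k". */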
mov $1634760805,%rsi
mov $857760878,%rdx
mov $2036477234,%rcx
mov $1797285236,%r8
movl %esi,0(%rdi)
movl %edx,4(%rdi)
movl %ecx,8(%rdi)
movl %r8d,12(%rdi)
jmp .L_keysetupdone
.L_kbits128:
movl 0(%rsi),%edx
movl 4(%rsi),%ecx
movl 8(%rsi),%r8d
movl 12(%rsi),%esi
movl %edx,28(%rdi)
movl %ecx,16(%rdi)
movl %r8d,36(%rdi)
movl %esi,56(%rdi)
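/* Constants for 128-bit keys: these four words spell "expand 16-byte k". */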
mov $1634760805,%rsi
mov $824206446,%rdx
mov $2036477238,%rcx
mov $1797285236,%r8
movl %esi,0(%rdi)
movl %edx,4(%rdi)
movl %ecx,8(%rdi)
movl %r8d,12(%rdi)
.L_keysetupdone:
ret

.align 8
.globl _gcry_salsa20_amd64_ivsetup
.type _gcry_salsa20_amd64_ivsetup,@function;
_gcry_salsa20_amd64_ivsetup:
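/*
 * IV setup: store the 8-byte IV into the context and reset the 8-byte
 * block counter to zero.
 * In: %rdi = context, %rsi = IV.
 */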
movl 0(%rsi),%r8d
movl 4(%rsi),%esi
mov $0,%r9
mov $0,%rax
movl %r8d,24(%rdi)
movl %esi,44(%rdi)
movl %r9d,32(%rdi)
movl %eax,52(%rdi)
ret

.align 8
.globl _gcry_salsa20_amd64_encrypt_blocks
.type _gcry_salsa20_amd64_encrypt_blocks,@function;
_gcry_salsa20_amd64_encrypt_blocks:
/*
 * Modifications to the original implementation:
 *  - The number of rounds is passed in register %r8 (to support Salsa20/12).
 *  - The length is passed as a number of 64-byte blocks, so tail bytes are
 *    not handled here (that is done in salsa20.c).
 */
push %rbx
shlq $6, %rcx /* blocks to bytes */
mov %r8, %rbx
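/*
 * Reserve a 32-byte aligned scratch area of at least 384 bytes on the
 * stack.  %r11 keeps the adjustment so that it can be added back to %rsp
 * at .L_done and returned in %rax.  The arguments are then moved into the
 * registers the original code expects: %r8 = context, %rsi = source,
 * %rdi = destination, %rdx = byte count.
 */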
mov %rsp,%r11
and $31,%r11
add $384,%r11
sub %r11,%rsp
mov %rdi,%r8
mov %rsi,%rsi
mov %rdx,%rdi
mov %rcx,%rdx
cmp $0,%rdx
jbe .L_done
.L_start:
cmp $256,%rdx
jb .L_bytes_are_64_128_or_192
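/*
 * At least 256 bytes (four blocks) remain: use the four-way SIMD path.
 * Each of the 16 state words is broadcast into its own 16-byte stack
 * slot, i.e. every slot holds four copies of one word, so that four
 * blocks can be computed in parallel.
 */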
movdqa 0(%r8),%xmm0
pshufd $0x55,%xmm0,%xmm1
pshufd $0xaa,%xmm0,%xmm2
pshufd $0xff,%xmm0,%xmm3
pshufd $0x00,%xmm0,%xmm0
movdqa %xmm1,0(%rsp)
movdqa %xmm2,16(%rsp)
movdqa %xmm3,32(%rsp)
movdqa %xmm0,48(%rsp)
movdqa 16(%r8),%xmm0
pshufd $0xaa,%xmm0,%xmm1
pshufd $0xff,%xmm0,%xmm2
pshufd $0x00,%xmm0,%xmm3
pshufd $0x55,%xmm0,%xmm0
movdqa %xmm1,64(%rsp)
movdqa %xmm2,80(%rsp)
movdqa %xmm3,96(%rsp)
movdqa %xmm0,112(%rsp)
movdqa 32(%r8),%xmm0
pshufd $0xff,%xmm0,%xmm1
pshufd $0x55,%xmm0,%xmm2
pshufd $0xaa,%xmm0,%xmm0
movdqa %xmm1,128(%rsp)
movdqa %xmm2,144(%rsp)
movdqa %xmm0,160(%rsp)
movdqa 48(%r8),%xmm0
pshufd $0x00,%xmm0,%xmm1
pshufd $0xaa,%xmm0,%xmm2
pshufd $0xff,%xmm0,%xmm0
movdqa %xmm1,176(%rsp)
movdqa %xmm2,192(%rsp)
movdqa %xmm0,208(%rsp)
.L_bytesatleast256:
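/*
 * Build the per-block counters: the four lanes of the slots at 224(%rsp)
 * (low word) and 240(%rsp) (high word) receive the 64-bit block counters
 * of the four blocks, and the counter in the context is advanced by four.
 */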
movl 32(%r8),%ecx
movl 52(%r8),%r9d
movl %ecx,224(%rsp)
movl %r9d,240(%rsp)
add $1,%ecx
adc $0,%r9d
movl %ecx,4+224(%rsp)
movl %r9d,4+240(%rsp)
add $1,%ecx
adc $0,%r9d
movl %ecx,8+224(%rsp)
movl %r9d,8+240(%rsp)
add $1,%ecx
adc $0,%r9d
movl %ecx,12+224(%rsp)
movl %r9d,12+240(%rsp)
add $1,%ecx
adc $0,%r9d
movl %ecx,32(%r8)
movl %r9d,52(%r8)
movq %rdx,288(%rsp)
mov %rbx,%rdx
movdqa 0(%rsp),%xmm0
movdqa 16(%rsp),%xmm1
movdqa 32(%rsp),%xmm2
movdqa 192(%rsp),%xmm3
movdqa 208(%rsp),%xmm4
movdqa 64(%rsp),%xmm5
movdqa 80(%rsp),%xmm6
movdqa 112(%rsp),%xmm7
movdqa 128(%rsp),%xmm8
movdqa 144(%rsp),%xmm9
movdqa 160(%rsp),%xmm10
movdqa 240(%rsp),%xmm11
movdqa 48(%rsp),%xmm12
movdqa 96(%rsp),%xmm13
movdqa 176(%rsp),%xmm14
movdqa 224(%rsp),%xmm15
.L_mainloop1:
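/*
 * Four-block core loop: each iteration performs two Salsa20 rounds (one
 * double round) on all four blocks at once, spilling registers through
 * the slots at 256(%rsp) and 272(%rsp).  %rdx holds the remaining round
 * count and is decremented by two per pass.
 */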
movdqa %xmm1,256(%rsp)
movdqa %xmm2,272(%rsp)
movdqa %xmm13,%xmm1
paddd %xmm12,%xmm1
movdqa %xmm1,%xmm2
pslld $7,%xmm1
pxor %xmm1,%xmm14
psrld $25,%xmm2
pxor %xmm2,%xmm14
movdqa %xmm7,%xmm1
paddd %xmm0,%xmm1
movdqa %xmm1,%xmm2
pslld $7,%xmm1
pxor %xmm1,%xmm11
psrld $25,%xmm2
pxor %xmm2,%xmm11
movdqa %xmm12,%xmm1
paddd %xmm14,%xmm1
movdqa %xmm1,%xmm2
pslld $9,%xmm1
pxor %xmm1,%xmm15
psrld $23,%xmm2
pxor %xmm2,%xmm15
movdqa %xmm0,%xmm1
paddd %xmm11,%xmm1
movdqa %xmm1,%xmm2
pslld $9,%xmm1
pxor %xmm1,%xmm9
psrld $23,%xmm2
pxor %xmm2,%xmm9
movdqa %xmm14,%xmm1
paddd %xmm15,%xmm1
movdqa %xmm1,%xmm2
pslld $13,%xmm1
pxor %xmm1,%xmm13
psrld $19,%xmm2
pxor %xmm2,%xmm13
movdqa %xmm11,%xmm1
paddd %xmm9,%xmm1
movdqa %xmm1,%xmm2
pslld $13,%xmm1
pxor %xmm1,%xmm7
psrld $19,%xmm2
pxor %xmm2,%xmm7
movdqa %xmm15,%xmm1
paddd %xmm13,%xmm1
movdqa %xmm1,%xmm2
pslld $18,%xmm1
pxor %xmm1,%xmm12
psrld $14,%xmm2
pxor %xmm2,%xmm12
movdqa 256(%rsp),%xmm1
movdqa %xmm12,256(%rsp)
movdqa %xmm9,%xmm2
paddd %xmm7,%xmm2
movdqa %xmm2,%xmm12
pslld $18,%xmm2
pxor %xmm2,%xmm0
psrld $14,%xmm12
pxor %xmm12,%xmm0
movdqa %xmm5,%xmm2
paddd %xmm1,%xmm2
movdqa %xmm2,%xmm12
pslld $7,%xmm2
pxor %xmm2,%xmm3
psrld $25,%xmm12
pxor %xmm12,%xmm3
movdqa 272(%rsp),%xmm2
movdqa %xmm0,272(%rsp)
movdqa %xmm6,%xmm0
paddd %xmm2,%xmm0
movdqa %xmm0,%xmm12
pslld $7,%xmm0
pxor %xmm0,%xmm4
psrld $25,%xmm12
pxor %xmm12,%xmm4
movdqa %xmm1,%xmm0
paddd %xmm3,%xmm0
movdqa %xmm0,%xmm12
pslld $9,%xmm0
pxor %xmm0,%xmm10
psrld $23,%xmm12
pxor %xmm12,%xmm10
movdqa %xmm2,%xmm0
paddd %xmm4,%xmm0
movdqa %xmm0,%xmm12
pslld $9,%xmm0
pxor %xmm0,%xmm8
psrld $23,%xmm12
pxor %xmm12,%xmm8
movdqa %xmm3,%xmm0
paddd %xmm10,%xmm0
movdqa %xmm0,%xmm12
pslld $13,%xmm0
pxor %xmm0,%xmm5
psrld $19,%xmm12
pxor %xmm12,%xmm5
movdqa %xmm4,%xmm0
paddd %xmm8,%xmm0
movdqa %xmm0,%xmm12
pslld $13,%xmm0
pxor %xmm0,%xmm6
psrld $19,%xmm12
pxor %xmm12,%xmm6
movdqa %xmm10,%xmm0
paddd %xmm5,%xmm0
movdqa %xmm0,%xmm12
pslld $18,%xmm0
pxor %xmm0,%xmm1
psrld $14,%xmm12
pxor %xmm12,%xmm1
movdqa 256(%rsp),%xmm0
movdqa %xmm1,256(%rsp)
movdqa %xmm4,%xmm1
paddd %xmm0,%xmm1
movdqa %xmm1,%xmm12
pslld $7,%xmm1
pxor %xmm1,%xmm7
psrld $25,%xmm12
pxor %xmm12,%xmm7
movdqa %xmm8,%xmm1
paddd %xmm6,%xmm1
movdqa %xmm1,%xmm12
pslld $18,%xmm1
pxor %xmm1,%xmm2
psrld $14,%xmm12
pxor %xmm12,%xmm2
movdqa 272(%rsp),%xmm12
movdqa %xmm2,272(%rsp)
movdqa %xmm14,%xmm1
paddd %xmm12,%xmm1
movdqa %xmm1,%xmm2
pslld $7,%xmm1
pxor %xmm1,%xmm5
psrld $25,%xmm2
pxor %xmm2,%xmm5
movdqa %xmm0,%xmm1
paddd %xmm7,%xmm1
movdqa %xmm1,%xmm2
pslld $9,%xmm1
pxor %xmm1,%xmm10
psrld $23,%xmm2
pxor %xmm2,%xmm10
movdqa %xmm12,%xmm1
paddd %xmm5,%xmm1
movdqa %xmm1,%xmm2
pslld $9,%xmm1
pxor %xmm1,%xmm8
psrld $23,%xmm2
pxor %xmm2,%xmm8
movdqa %xmm7,%xmm1
paddd %xmm10,%xmm1
movdqa %xmm1,%xmm2
pslld $13,%xmm1
pxor %xmm1,%xmm4
psrld $19,%xmm2
pxor %xmm2,%xmm4
movdqa %xmm5,%xmm1
paddd %xmm8,%xmm1
movdqa %xmm1,%xmm2
pslld $13,%xmm1
pxor %xmm1,%xmm14
psrld $19,%xmm2
pxor %xmm2,%xmm14
movdqa %xmm10,%xmm1
paddd %xmm4,%xmm1
movdqa %xmm1,%xmm2
pslld $18,%xmm1
pxor %xmm1,%xmm0
psrld $14,%xmm2
pxor %xmm2,%xmm0
movdqa 256(%rsp),%xmm1
movdqa %xmm0,256(%rsp)
movdqa %xmm8,%xmm0
paddd %xmm14,%xmm0
movdqa %xmm0,%xmm2
pslld $18,%xmm0
pxor %xmm0,%xmm12
psrld $14,%xmm2
pxor %xmm2,%xmm12
movdqa %xmm11,%xmm0
paddd %xmm1,%xmm0
movdqa %xmm0,%xmm2
pslld $7,%xmm0
pxor %xmm0,%xmm6
psrld $25,%xmm2
pxor %xmm2,%xmm6
movdqa 272(%rsp),%xmm2
movdqa %xmm12,272(%rsp)
movdqa %xmm3,%xmm0
paddd %xmm2,%xmm0
movdqa %xmm0,%xmm12
pslld $7,%xmm0
pxor %xmm0,%xmm13
psrld $25,%xmm12
pxor %xmm12,%xmm13
movdqa %xmm1,%xmm0
paddd %xmm6,%xmm0
movdqa %xmm0,%xmm12
pslld $9,%xmm0
pxor %xmm0,%xmm15
psrld $23,%xmm12
pxor %xmm12,%xmm15
movdqa %xmm2,%xmm0
paddd %xmm13,%xmm0
movdqa %xmm0,%xmm12
pslld $9,%xmm0
pxor %xmm0,%xmm9
psrld $23,%xmm12
pxor %xmm12,%xmm9
movdqa %xmm6,%xmm0
paddd %xmm15,%xmm0
movdqa %xmm0,%xmm12
pslld $13,%xmm0
pxor %xmm0,%xmm11
psrld $19,%xmm12
pxor %xmm12,%xmm11
movdqa %xmm13,%xmm0
paddd %xmm9,%xmm0
movdqa %xmm0,%xmm12
pslld $13,%xmm0
pxor %xmm0,%xmm3
psrld $19,%xmm12
pxor %xmm12,%xmm3
movdqa %xmm15,%xmm0
paddd %xmm11,%xmm0
movdqa %xmm0,%xmm12
pslld $18,%xmm0
pxor %xmm0,%xmm1
psrld $14,%xmm12
pxor %xmm12,%xmm1
movdqa %xmm9,%xmm0
paddd %xmm3,%xmm0
movdqa %xmm0,%xmm12
pslld $18,%xmm0
pxor %xmm0,%xmm2
psrld $14,%xmm12
pxor %xmm12,%xmm2
movdqa 256(%rsp),%xmm12
movdqa 272(%rsp),%xmm0
sub $2,%rdx
ja .L_mainloop1
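/*
 * End of the four-block rounds: add the original input words back in,
 * then extract the four 32-bit lanes of each word with movd/pshufd, XOR
 * them against the source at the matching offsets of the four consecutive
 * 64-byte blocks and store the result to the destination.
 */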
paddd 48(%rsp),%xmm12
paddd 112(%rsp),%xmm7
paddd 160(%rsp),%xmm10
paddd 208(%rsp),%xmm4
movd %xmm12,%rdx
movd %xmm7,%rcx
movd %xmm10,%r9
movd %xmm4,%rax
pshufd $0x39,%xmm12,%xmm12
pshufd $0x39,%xmm7,%xmm7
pshufd $0x39,%xmm10,%xmm10
pshufd $0x39,%xmm4,%xmm4
xorl 0(%rsi),%edx
xorl 4(%rsi),%ecx
xorl 8(%rsi),%r9d
xorl 12(%rsi),%eax
movl %edx,0(%rdi)
movl %ecx,4(%rdi)
movl %r9d,8(%rdi)
movl %eax,12(%rdi)
movd %xmm12,%rdx
movd %xmm7,%rcx
movd %xmm10,%r9
movd %xmm4,%rax
pshufd $0x39,%xmm12,%xmm12
pshufd $0x39,%xmm7,%xmm7
pshufd $0x39,%xmm10,%xmm10
pshufd $0x39,%xmm4,%xmm4
xorl 64(%rsi),%edx
xorl 68(%rsi),%ecx
xorl 72(%rsi),%r9d
xorl 76(%rsi),%eax
movl %edx,64(%rdi)
movl %ecx,68(%rdi)
movl %r9d,72(%rdi)
movl %eax,76(%rdi)
movd %xmm12,%rdx
movd %xmm7,%rcx
movd %xmm10,%r9
movd %xmm4,%rax
pshufd $0x39,%xmm12,%xmm12
pshufd $0x39,%xmm7,%xmm7
pshufd $0x39,%xmm10,%xmm10
pshufd $0x39,%xmm4,%xmm4
xorl 128(%rsi),%edx
xorl 132(%rsi),%ecx
xorl 136(%rsi),%r9d
xorl 140(%rsi),%eax
movl %edx,128(%rdi)
movl %ecx,132(%rdi)
movl %r9d,136(%rdi)
movl %eax,140(%rdi)
movd %xmm12,%rdx
movd %xmm7,%rcx
movd %xmm10,%r9
movd %xmm4,%rax
xorl 192(%rsi),%edx
xorl 196(%rsi),%ecx
xorl 200(%rsi),%r9d
xorl 204(%rsi),%eax
movl %edx,192(%rdi)
movl %ecx,196(%rdi)
movl %r9d,200(%rdi)
movl %eax,204(%rdi)
paddd 176(%rsp),%xmm14
paddd 0(%rsp),%xmm0
paddd 64(%rsp),%xmm5
paddd 128(%rsp),%xmm8
movd %xmm14,%rdx
movd %xmm0,%rcx
movd %xmm5,%r9
movd %xmm8,%rax
pshufd $0x39,%xmm14,%xmm14
pshufd $0x39,%xmm0,%xmm0
pshufd $0x39,%xmm5,%xmm5
pshufd $0x39,%xmm8,%xmm8
xorl 16(%rsi),%edx
xorl 20(%rsi),%ecx
xorl 24(%rsi),%r9d
xorl 28(%rsi),%eax
movl %edx,16(%rdi)
movl %ecx,20(%rdi)
movl %r9d,24(%rdi)
movl %eax,28(%rdi)
movd %xmm14,%rdx
movd %xmm0,%rcx
movd %xmm5,%r9
movd %xmm8,%rax
pshufd $0x39,%xmm14,%xmm14
pshufd $0x39,%xmm0,%xmm0
pshufd $0x39,%xmm5,%xmm5
pshufd $0x39,%xmm8,%xmm8
xorl 80(%rsi),%edx
xorl 84(%rsi),%ecx
xorl 88(%rsi),%r9d
xorl 92(%rsi),%eax
movl %edx,80(%rdi)
movl %ecx,84(%rdi)
movl %r9d,88(%rdi)
movl %eax,92(%rdi)
movd %xmm14,%rdx
movd %xmm0,%rcx
movd %xmm5,%r9
movd %xmm8,%rax
pshufd $0x39,%xmm14,%xmm14
pshufd $0x39,%xmm0,%xmm0
pshufd $0x39,%xmm5,%xmm5
pshufd $0x39,%xmm8,%xmm8
xorl 144(%rsi),%edx
xorl 148(%rsi),%ecx
xorl 152(%rsi),%r9d
xorl 156(%rsi),%eax
movl %edx,144(%rdi)
movl %ecx,148(%rdi)
movl %r9d,152(%rdi)
movl %eax,156(%rdi)
movd %xmm14,%rdx
movd %xmm0,%rcx
movd %xmm5,%r9
movd %xmm8,%rax
xorl 208(%rsi),%edx
xorl 212(%rsi),%ecx
xorl 216(%rsi),%r9d
xorl 220(%rsi),%eax
movl %edx,208(%rdi)
movl %ecx,212(%rdi)
movl %r9d,216(%rdi)
movl %eax,220(%rdi)
paddd 224(%rsp),%xmm15
paddd 240(%rsp),%xmm11
paddd 16(%rsp),%xmm1
paddd 80(%rsp),%xmm6
movd %xmm15,%rdx
movd %xmm11,%rcx
movd %xmm1,%r9
movd %xmm6,%rax
pshufd $0x39,%xmm15,%xmm15
pshufd $0x39,%xmm11,%xmm11
pshufd $0x39,%xmm1,%xmm1
pshufd $0x39,%xmm6,%xmm6
xorl 32(%rsi),%edx
xorl 36(%rsi),%ecx
xorl 40(%rsi),%r9d
xorl 44(%rsi),%eax
movl %edx,32(%rdi)
movl %ecx,36(%rdi)
movl %r9d,40(%rdi)
movl %eax,44(%rdi)
movd %xmm15,%rdx
movd %xmm11,%rcx
movd %xmm1,%r9
movd %xmm6,%rax
pshufd $0x39,%xmm15,%xmm15
pshufd $0x39,%xmm11,%xmm11
pshufd $0x39,%xmm1,%xmm1
pshufd $0x39,%xmm6,%xmm6
xorl 96(%rsi),%edx
xorl 100(%rsi),%ecx
xorl 104(%rsi),%r9d
xorl 108(%rsi),%eax
movl %edx,96(%rdi)
movl %ecx,100(%rdi)
movl %r9d,104(%rdi)
movl %eax,108(%rdi)
movd %xmm15,%rdx
movd %xmm11,%rcx
movd %xmm1,%r9
movd %xmm6,%rax
pshufd $0x39,%xmm15,%xmm15
pshufd $0x39,%xmm11,%xmm11
pshufd $0x39,%xmm1,%xmm1
pshufd $0x39,%xmm6,%xmm6
xorl 160(%rsi),%edx
xorl 164(%rsi),%ecx
xorl 168(%rsi),%r9d
xorl 172(%rsi),%eax
movl %edx,160(%rdi)
movl %ecx,164(%rdi)
movl %r9d,168(%rdi)
movl %eax,172(%rdi)
movd %xmm15,%rdx
movd %xmm11,%rcx
movd %xmm1,%r9
movd %xmm6,%rax
xorl 224(%rsi),%edx
xorl 228(%rsi),%ecx
xorl 232(%rsi),%r9d
xorl 236(%rsi),%eax
movl %edx,224(%rdi)
movl %ecx,228(%rdi)
movl %r9d,232(%rdi)
movl %eax,236(%rdi)
paddd 96(%rsp),%xmm13
paddd 144(%rsp),%xmm9
paddd 192(%rsp),%xmm3
paddd 32(%rsp),%xmm2
movd %xmm13,%rdx
movd %xmm9,%rcx
movd %xmm3,%r9
movd %xmm2,%rax
pshufd $0x39,%xmm13,%xmm13
pshufd $0x39,%xmm9,%xmm9
pshufd $0x39,%xmm3,%xmm3
pshufd $0x39,%xmm2,%xmm2
xorl 48(%rsi),%edx
xorl 52(%rsi),%ecx
xorl 56(%rsi),%r9d
xorl 60(%rsi),%eax
movl %edx,48(%rdi)
movl %ecx,52(%rdi)
movl %r9d,56(%rdi)
movl %eax,60(%rdi)
movd %xmm13,%rdx
movd %xmm9,%rcx
movd %xmm3,%r9
movd %xmm2,%rax
pshufd $0x39,%xmm13,%xmm13
pshufd $0x39,%xmm9,%xmm9
pshufd $0x39,%xmm3,%xmm3
pshufd $0x39,%xmm2,%xmm2
xorl 112(%rsi),%edx
xorl 116(%rsi),%ecx
xorl 120(%rsi),%r9d
xorl 124(%rsi),%eax
movl %edx,112(%rdi)
movl %ecx,116(%rdi)
movl %r9d,120(%rdi)
movl %eax,124(%rdi)
movd %xmm13,%rdx
movd %xmm9,%rcx
movd %xmm3,%r9
movd %xmm2,%rax
pshufd $0x39,%xmm13,%xmm13
pshufd $0x39,%xmm9,%xmm9
pshufd $0x39,%xmm3,%xmm3
pshufd $0x39,%xmm2,%xmm2
xorl 176(%rsi),%edx
xorl 180(%rsi),%ecx
xorl 184(%rsi),%r9d
xorl 188(%rsi),%eax
movl %edx,176(%rdi)
movl %ecx,180(%rdi)
movl %r9d,184(%rdi)
movl %eax,188(%rdi)
movd %xmm13,%rdx
movd %xmm9,%rcx
movd %xmm3,%r9
movd %xmm2,%rax
xorl 240(%rsi),%edx
xorl 244(%rsi),%ecx
xorl 248(%rsi),%r9d
xorl 252(%rsi),%eax
movl %edx,240(%rdi)
movl %ecx,244(%rdi)
movl %r9d,248(%rdi)
movl %eax,252(%rdi)
movq 288(%rsp),%rdx
sub $256,%rdx
add $256,%rsi
add $256,%rdi
cmp $256,%rdx
jae .L_bytesatleast256
cmp $0,%rdx
jbe .L_done
.L_bytes_are_64_128_or_192:
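/*
 * One-block path (64, 128 or 192 remaining bytes): load the 64-byte state
 * into %xmm0..%xmm3 and compute one block per pass, realigning the rows
 * with pshufd between quarter-round groups.
 */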
movq %rdx,288(%rsp)
movdqa 0(%r8),%xmm0
movdqa 16(%r8),%xmm1
movdqa 32(%r8),%xmm2
movdqa 48(%r8),%xmm3
movdqa %xmm1,%xmm4
mov %rbx,%rdx
.L_mainloop2:
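/*
 * Single-block core loop: each iteration performs four Salsa20 rounds
 * (two double rounds); %rdx holds the remaining round count and is
 * decremented by four towards the end of the body.
 */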
paddd %xmm0,%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm6
pslld $7,%xmm4
psrld $25,%xmm6
pxor %xmm4,%xmm3
pxor %xmm6,%xmm3
paddd %xmm3,%xmm5
movdqa %xmm3,%xmm4
movdqa %xmm5,%xmm6
pslld $9,%xmm5
psrld $23,%xmm6
pxor %xmm5,%xmm2
pshufd $0x93,%xmm3,%xmm3
pxor %xmm6,%xmm2
paddd %xmm2,%xmm4
movdqa %xmm2,%xmm5
movdqa %xmm4,%xmm6
pslld $13,%xmm4
psrld $19,%xmm6
pxor %xmm4,%xmm1
pshufd $0x4e,%xmm2,%xmm2
pxor %xmm6,%xmm1
paddd %xmm1,%xmm5
movdqa %xmm3,%xmm4
movdqa %xmm5,%xmm6
pslld $18,%xmm5
psrld $14,%xmm6
pxor %xmm5,%xmm0
pshufd $0x39,%xmm1,%xmm1
pxor %xmm6,%xmm0
paddd %xmm0,%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm6
pslld $7,%xmm4
psrld $25,%xmm6
pxor %xmm4,%xmm1
pxor %xmm6,%xmm1
paddd %xmm1,%xmm5
movdqa %xmm1,%xmm4
movdqa %xmm5,%xmm6
pslld $9,%xmm5
psrld $23,%xmm6
pxor %xmm5,%xmm2
pshufd $0x93,%xmm1,%xmm1
pxor %xmm6,%xmm2
paddd %xmm2,%xmm4
movdqa %xmm2,%xmm5
movdqa %xmm4,%xmm6
pslld $13,%xmm4
psrld $19,%xmm6
pxor %xmm4,%xmm3
pshufd $0x4e,%xmm2,%xmm2
pxor %xmm6,%xmm3
paddd %xmm3,%xmm5
movdqa %xmm1,%xmm4
movdqa %xmm5,%xmm6
pslld $18,%xmm5
psrld $14,%xmm6
pxor %xmm5,%xmm0
pshufd $0x39,%xmm3,%xmm3
pxor %xmm6,%xmm0
paddd %xmm0,%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm6
pslld $7,%xmm4
psrld $25,%xmm6
pxor %xmm4,%xmm3
pxor %xmm6,%xmm3
paddd %xmm3,%xmm5
movdqa %xmm3,%xmm4
movdqa %xmm5,%xmm6
pslld $9,%xmm5
psrld $23,%xmm6
pxor %xmm5,%xmm2
pshufd $0x93,%xmm3,%xmm3
pxor %xmm6,%xmm2
paddd %xmm2,%xmm4
movdqa %xmm2,%xmm5
movdqa %xmm4,%xmm6
pslld $13,%xmm4
psrld $19,%xmm6
pxor %xmm4,%xmm1
pshufd $0x4e,%xmm2,%xmm2
pxor %xmm6,%xmm1
paddd %xmm1,%xmm5
movdqa %xmm3,%xmm4
movdqa %xmm5,%xmm6
pslld $18,%xmm5
psrld $14,%xmm6
pxor %xmm5,%xmm0
pshufd $0x39,%xmm1,%xmm1
pxor %xmm6,%xmm0
paddd %xmm0,%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm6
pslld $7,%xmm4
psrld $25,%xmm6
pxor %xmm4,%xmm1
pxor %xmm6,%xmm1
paddd %xmm1,%xmm5
movdqa %xmm1,%xmm4
movdqa %xmm5,%xmm6
pslld $9,%xmm5
psrld $23,%xmm6
pxor %xmm5,%xmm2
pshufd $0x93,%xmm1,%xmm1
pxor %xmm6,%xmm2
paddd %xmm2,%xmm4
movdqa %xmm2,%xmm5
movdqa %xmm4,%xmm6
pslld $13,%xmm4
psrld $19,%xmm6
pxor %xmm4,%xmm3
pshufd $0x4e,%xmm2,%xmm2
pxor %xmm6,%xmm3
sub $4,%rdx
paddd %xmm3,%xmm5
movdqa %xmm1,%xmm4
movdqa %xmm5,%xmm6
pslld $18,%xmm5
pxor %xmm7,%xmm7
psrld $14,%xmm6
pxor %xmm5,%xmm0
pshufd $0x39,%xmm3,%xmm3
pxor %xmm6,%xmm0
ja .L_mainloop2
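/*
 * Finalize the block: add the original state back in, XOR the result
 * against the source and store it to the destination (the scattered
 * offsets match this implementation's context word order), then
 * increment the 64-bit block counter kept at 32(%r8)/52(%r8).
 */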
paddd 0(%r8),%xmm0
paddd 16(%r8),%xmm1
paddd 32(%r8),%xmm2
paddd 48(%r8),%xmm3
movd %xmm0,%rdx
movd %xmm1,%rcx
movd %xmm2,%rax
movd %xmm3,%r10
pshufd $0x39,%xmm0,%xmm0
pshufd $0x39,%xmm1,%xmm1
pshufd $0x39,%xmm2,%xmm2
pshufd $0x39,%xmm3,%xmm3
xorl 0(%rsi),%edx
xorl 48(%rsi),%ecx
xorl 32(%rsi),%eax
xorl 16(%rsi),%r10d
movl %edx,0(%rdi)
movl %ecx,48(%rdi)
movl %eax,32(%rdi)
movl %r10d,16(%rdi)
movd %xmm0,%rdx
movd %xmm1,%rcx
movd %xmm2,%rax
movd %xmm3,%r10
pshufd $0x39,%xmm0,%xmm0
pshufd $0x39,%xmm1,%xmm1
pshufd $0x39,%xmm2,%xmm2
pshufd $0x39,%xmm3,%xmm3
xorl 20(%rsi),%edx
xorl 4(%rsi),%ecx
xorl 52(%rsi),%eax
xorl 36(%rsi),%r10d
movl %edx,20(%rdi)
movl %ecx,4(%rdi)
movl %eax,52(%rdi)
movl %r10d,36(%rdi)
movd %xmm0,%rdx
movd %xmm1,%rcx
movd %xmm2,%rax
movd %xmm3,%r10
pshufd $0x39,%xmm0,%xmm0
pshufd $0x39,%xmm1,%xmm1
pshufd $0x39,%xmm2,%xmm2
pshufd $0x39,%xmm3,%xmm3
xorl 40(%rsi),%edx
xorl 24(%rsi),%ecx
xorl 8(%rsi),%eax
xorl 56(%rsi),%r10d
movl %edx,40(%rdi)
movl %ecx,24(%rdi)
movl %eax,8(%rdi)
movl %r10d,56(%rdi)
movd %xmm0,%rdx
movd %xmm1,%rcx
movd %xmm2,%rax
movd %xmm3,%r10
xorl 60(%rsi),%edx
xorl 44(%rsi),%ecx
xorl 28(%rsi),%eax
xorl 12(%rsi),%r10d
movl %edx,60(%rdi)
movl %ecx,44(%rdi)
movl %eax,28(%rdi)
movl %r10d,12(%rdi)
movq 288(%rsp),%rdx
movl 32(%r8),%ecx
movl 52(%r8),%eax
add $1,%ecx
adc $0,%eax
movl %ecx,32(%r8)
movl %eax,52(%r8)
cmp $64,%rdx
ja .L_bytes_are_128_or_192
.L_done:
add %r11,%rsp
mov %r11,%rax
pop %rbx
ret
.L_bytes_are_128_or_192:
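/*
 * More than one block was requested on the single-block path: advance the
 * source and destination pointers by 64 bytes and go round again.
 */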
sub $64,%rdx
add $64,%rdi
add $64,%rsi
jmp .L_bytes_are_64_128_or_192
.size _gcry_salsa20_amd64_encrypt_blocks,.-_gcry_salsa20_amd64_encrypt_blocks;

#endif /*defined(USE_SALSA20)*/
#endif /*__x86_64*/