/* blob: e3374e3ba3c273a49d4c4d01946c4900505c864a (repository viewer header; not part of the assembly source) */
/* Intel Pentium-4 mpn_rshift -- right shift.
*
* Copyright 2001, 2002 Free Software Foundation, Inc.
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
*
* Note: This code is heavily based on the GNU MP Library.
* Actually it's the same code with only minor changes in the
* way the data is stored; this is to support the abstraction
* of an optional secure memory allocation which may be used
* to avoid revealing sensitive data due to paging etc.
*/
#include "sysdep.h"
#include "asm-syntax.h"
/*******************
* mpi_limb_t
* _gcry_mpih_rshift( mpi_ptr_t wp, (sp + 4)
* mpi_ptr_t up, (sp + 8)
* mpi_size_t usize, (sp + 12)
* unsigned cnt) (sp + 16)
*
* P4 Willamette, Northwood: 1.75 cycles/limb
* P4 Prescott: 2.0 cycles/limb
*/
.text
ALIGN (3)
.globl C_SYMBOL_NAME(_gcry_mpih_rshift)
C_SYMBOL_NAME(_gcry_mpih_rshift:)
/*
 * Shift the usize-limb number at up right by cnt bits, store the result
 * at wp, and return in %eax the bits shifted out of the lowest limb,
 * left-justified in a 32-bit limb.  IA-32, AT&T syntax, arguments on the
 * stack; uses MMX (%mm0-%mm3, %mm5-%mm7).
 *
 * Register roles after the argument loads below:
 *   %eax = usize (limb count)      %edx = wp (destination)
 *   %ebx = up    (source)          %ecx = cnt (bit count)
 *   %edi = scratch (saved/restored; %ebx is saved/restored as well)
 *
 * Three paths: usize == 1 (integer registers only), usize 2..4
 * (.Lsimple, one limb per iteration) and usize >= 5 (.Lunroll, four
 * limbs per iteration on 64-bit quantities).
 *
 * NOTE(review): assumes usize >= 1.  With usize == 0 the decl/jnz pair
 * below would enter .Lsimple with %eax = -1 -- verify callers never
 * pass 0.
 * NOTE(review): presumably 1 <= cnt <= 31; the "andl $32, %ecx" tests in
 * the finish code appear to rely on that -- confirm against callers.
 */
pushl %ebx
pushl %edi
/* Two registers were pushed, so the arguments now sit 8 bytes higher
   than the (sp + N) offsets in the prototype comment. */
movl 20(%esp), %eax
movl 12(%esp), %edx
movl 16(%esp), %ebx
movl 24(%esp), %ecx
/* Unsigned compare: five or more limbs take the unrolled path. */
cmp $5, %eax
jae .Lunroll
/* The movl does not change flags, so jnz tests the result of decl:
   taken when usize - 1 != 0. */
decl %eax
movl (%ebx), %edi
jnz .Lsimple
/* usize == 1.  %eax is zero here, so shrdl leaves in %eax exactly the
   bits shifted out of the limb in %edi (the return value). */
shrdl %cl, %edi, %eax
shrl %cl, %edi
movl %edi, (%edx)
popl %edi
popl %ebx
/* No MMX register was touched on this path, hence no emms. */
ret
.align 8, 0x90
.Lsimple:
/*
 * usize 2..4, %eax = usize - 1.
 *   %mm5 = lowest source limb, moved to the high half so that the later
 *          right shift drops the shifted-out bits into the low half
 *   %mm6 = shift count
 *   %ebx = &up[usize-1], %edx = &wp[usize-2]
 *   %eax = -(usize-1), counted up towards zero
 */
movd (%ebx), %mm5
leal (%ebx,%eax,4), %ebx
movd %ecx, %mm6
leal -4(%edx,%eax,4), %edx
psllq $32, %mm5
negl %eax
.Lsimple_top:
/* Load two adjacent limbs as one 64-bit value, shift, and store the low
   32 bits as one result limb.  The store index uses the already
   incremented %eax; jnz tests the incl (the MMX instructions in between
   do not modify flags). */
movq (%ebx,%eax,4), %mm0
incl %eax
psrlq %mm6, %mm0
movd %mm0, (%edx,%eax,4)
jnz .Lsimple_top
/* Top limb: loaded alone (zero-extended), so zeros are shifted in. */
movd (%ebx), %mm0
/* Low half of %mm5 becomes the return value. */
psrlq %mm6, %mm5
psrlq %mm6, %mm0
popl %edi
movd %mm5, %eax
popl %ebx
movd %mm0, 4(%edx)
/* Leave MMX state before returning. */
emms
ret
.align 8, 0x90
.Lunroll:
/*
 * usize >= 5.
 *   %mm5 = lowest source limb in the high half (return value source)
 *   %mm6 = shift count, %edi = 4 (mask for bit 2 of a pointer)
 */
movd (%ebx), %mm5
movl $4, %edi
movd %ecx, %mm6
/* Is bit 2 of the source pointer set?  (Assumes limbs are at least
   4-byte aligned, so this decides 8-byte alignment -- TODO confirm.) */
testl %edi, %ebx
psllq $32, %mm5
jz .Lstart_src_aligned
/* Source has bit 2 set: produce one result limb the same way as the
   simple loop, then advance source, destination and count by one limb. */
movq (%ebx), %mm0
psrlq %mm6, %mm0
addl $4, %ebx
decl %eax
movd %mm0, (%edx)
addl $4, %edx
.Lstart_src_aligned:
/* %mm1 = first source qword.  Also finish the return value in %mm5 and
   test bit 2 of the destination pointer. */
movq (%ebx), %mm1
testl %edi, %edx
psrlq %mm6, %mm5
jz .Lstart_dst_aligned
/* Destination has bit 2 set: store one result limb and advance only the
   destination.  Source pointer and count are left alone; instead the
   shift count becomes cnt + 32 from here on (the psrlq below still uses
   the old count because %mm6 is reloaded after it). */
movq %mm1, %mm0
addl $32, %ecx
psrlq %mm6, %mm0
movd %ecx, %mm6
movd %mm0, (%edx)
addl $4, %edx
.Lstart_dst_aligned:
/*
 * Prime the pipeline.
 *   %mm7 = 64 - shift (left-shift count for the next-higher qword)
 *   %mm3 = (first qword >> shift) | (second qword << (64 - shift)),
 *          a finished result qword not yet stored
 *   %mm2 = unshifted copy of the second qword
 *   %ebx, %edx are biased towards the end of the operands and
 *   %eax = 7 - (remaining limb count), so indexing runs upwards.
 */
movq 8(%ebx), %mm3
negl %ecx
movq %mm3, %mm2
addl $64, %ecx
movd %ecx, %mm7
psrlq %mm6, %mm1
leal -12(%ebx,%eax,4), %ebx
leal -20(%edx,%eax,4), %edx
psllq %mm7, %mm3
subl $7, %eax
por %mm1, %mm3
negl %eax
/* Non-negative index: too few limbs left for a full loop iteration. */
jns .Lfinish
.align 8, 0x90
.Lunroll_loop:
/* Each iteration loads two source qwords and stores two result qwords
   (four limbs): the pending %mm3 and the newly combined %mm0.  On exit
   %mm3 again holds a pending result and %mm2 the latest source qword. */
movq (%ebx,%eax,4), %mm0
psrlq %mm6, %mm2
movq %mm0, %mm1
psllq %mm7, %mm0
movq %mm3, -8(%edx,%eax,4)
por %mm2, %mm0
movq 8(%ebx,%eax,4), %mm3
psrlq %mm6, %mm1
movq %mm0, (%edx,%eax,4)
movq %mm3, %mm2
psllq %mm7, %mm3
addl $4, %eax
por %mm1, %mm3
/* js tests the addl (the por does not modify flags). */
js .Lunroll_loop
.Lfinish:
/* %eax is 0..3 here.  Bit 1 clear: one more source qword remains, so
   do half a loop iteration and keep the new result pending in %mm3. */
testb $2, %al
jnz .Lfinish_no_two
movq (%ebx,%eax,4), %mm0
psrlq %mm6, %mm2
movq %mm0, %mm1
psllq %mm7, %mm0
movq %mm3, -8(%edx,%eax,4)
por %mm2, %mm0
movq %mm1, %mm2
movq %mm0, %mm3
addl $2, %eax
.Lfinish_no_two:
/* Bit 0 decides whether one odd source limb is left.  popl and movd do
   not touch flags, so jnz still sees the testb.  %eax now receives the
   return value prepared in %mm5. */
testb $1, %al
popl %edi
movd %mm5, %eax
jnz .Lfinish_zero
/* One odd limb left: load it alone (32 bits, zero-extended) and combine
   it with the previous qword as in the loop. */
movd 8(%ebx), %mm0
psrlq %mm6, %mm2
movq %mm0, %mm1
psllq %mm7, %mm0
movq %mm3, (%edx)
por %mm2, %mm0
psrlq %mm6, %mm1
/* %ecx = 64 - shift.  Bit 5 is clear only when the shift count was
   raised to cnt + 32 by the destination fix-up (given cnt < 32); popl
   leaves the flags from andl intact for the jz. */
andl $32, %ecx
popl %ebx
jz .Lfinish_one_unaligned
/* Shift count is plain cnt: one extra top limb must be stored. */
movd %mm1, 16(%edx)
.Lfinish_one_unaligned:
movq %mm0, 8(%edx)
emms
ret
.Lfinish_zero:
/* No odd limb left: store the pending qword, then the top of the result
   from the last source qword shifted right with zeros coming in. */
movq %mm3, 4(%edx)
psrlq %mm6, %mm2
movd %mm2, 12(%edx)
/* Same bit-5 test on 64 - shift as in the other finish path. */
andl $32, %ecx
popl %ebx
jz .Lfinish_zero_unaligned
/* Shift count is plain cnt: the top is a full 64 bits, so rewrite it as
   a qword (this overlaps the 32-bit store just above). */
movq %mm2, 12(%edx)
.Lfinish_zero_unaligned:
emms
ret