/* blob: e3374e3ba3c273a49d4c4d01946c4900505c864a (web-viewer residue, not part of the source) */
/* Intel Pentium-4 mpn_rshift -- right shift.
*
* Copyright 2001, 2002 Free Software Foundation, Inc.
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
*
* Note: This code is heavily based on the GNU MP Library.
* Actually it's the same code with only minor changes in the
* way the data is stored; this is to support the abstraction
* of an optional secure memory allocation which may be used
* to avoid revealing of sensitive data due to paging etc.
*/
#include "sysdep.h"
#include "asm-syntax.h"
/*******************
* mpi_limb_t
* _gcry_mpih_rshift( mpi_ptr_t wp, (sp + 4)
* mpi_ptr_t up, (sp + 8)
* mpi_size_t usize, (sp + 12)
* unsigned cnt) (sp + 16)
*
* P4 Willamette, Northwood: 1.75 cycles/limb
* P4 Prescott: 2.0 cycles/limb
*/
.text
ALIGN (3)
.globl C_SYMBOL_NAME(_gcry_mpih_rshift)
C_SYMBOL_NAME(_gcry_mpih_rshift:)
/* Shift the usize-limb number at `up' right by `cnt' bits (0 < cnt < 32
 * assumed, as in GMP), store the result at `wp', and return the bits
 * shifted out of the low limb, positioned in the high end of the limb.
 * cdecl ABI: %ebx and %edi are callee-saved, hence the pushes. */
pushl %ebx
pushl %edi
/* After the two pushes the arguments sit 8 bytes further from %esp than
 * the prototype comment above says:
 * 12(%esp)=wp, 16(%esp)=up, 20(%esp)=usize, 24(%esp)=cnt. */
movl 20(%esp), %eax /* %eax = usize */
movl 12(%esp), %edx /* %edx = wp (destination) */
movl 16(%esp), %ebx /* %ebx = up (source) */
movl 24(%esp), %ecx /* %ecx = cnt (shift count) */
cmp $5, %eax
jae .Lunroll /* 5 or more limbs: unrolled MMX loop */
decl %eax /* %eax = usize-1 */
movl (%ebx), %edi /* %edi = low source limb */
jnz .Lsimple /* 2..4 limbs: simple MMX loop */
/* usize == 1: plain integer code.  %eax is zero here (decl fell
 * through), so shrdl shifts the bits discarded from the low limb into
 * %eax -- exactly the return value. */
shrdl %cl, %edi, %eax
shrl %cl, %edi
movl %edi, (%edx) /* store the single result limb */
popl %edi
popl %ebx
ret
.align 8, 0x90
.Lsimple:
/* 2..4 limbs.  Each iteration loads limbs (i, i+1) as one (possibly
 * unaligned) quadword, shifts it right by cnt, and stores the low half
 * as result limb i.  The top limb is finished separately below. */
movd (%ebx), %mm5 /* %mm5 = low source limb (becomes return value) */
leal (%ebx,%eax,4), %ebx /* %ebx = &up[usize-1] (top limb) */
movd %ecx, %mm6 /* %mm6 = shift count */
leal -4(%edx,%eax,4), %edx /* %edx = &wp[usize-2] */
psllq $32, %mm5 /* low limb to high half, ready for final psrlq */
negl %eax /* %eax = -(usize-1); counts up to 0 */
.Lsimple_top:
movq (%ebx,%eax,4), %mm0 /* limbs (i, i+1) */
incl %eax
psrlq %mm6, %mm0
movd %mm0, (%edx,%eax,4) /* wp[i] = low half of shifted pair */
jnz .Lsimple_top
movd (%ebx), %mm0 /* top source limb */
psrlq %mm6, %mm5 /* return value: bits shifted out of up[0] */
psrlq %mm6, %mm0 /* top result limb, zero-filled from above */
popl %edi
movd %mm5, %eax
popl %ebx
movd %mm0, 4(%edx) /* wp[usize-1] */
emms /* clear MMX state before returning to FPU-using code */
ret
.align 8, 0x90
.Lunroll:
/* usize >= 5.  Peel single limbs until src (then dst) is 8-byte
 * aligned, then shift 4 limbs per loop iteration using aligned movq
 * loads/stores; each quadword is combined with the next one's low bits
 * via the complementary left shift held in %mm7. */
movd (%ebx), %mm5 /* low source limb (for the return value) */
movl $4, %edi /* alignment-test mask */
movd %ecx, %mm6 /* %mm6 = right-shift count */
testl %edi, %ebx /* src on an odd 4-byte boundary? */
psllq $32, %mm5 /* position low limb for the return-value shift */
jz .Lstart_src_aligned
/* Peel one limb so the remaining source reads are 8-byte aligned. */
movq (%ebx), %mm0 /* unaligned quadword: limbs 0,1 */
psrlq %mm6, %mm0
addl $4, %ebx
decl %eax
movd %mm0, (%edx) /* wp[0] */
addl $4, %edx
.Lstart_src_aligned:
movq (%ebx), %mm1 /* first aligned source quadword */
testl %edi, %edx /* dst on an odd 4-byte boundary? */
psrlq %mm6, %mm5 /* return value: bits shifted out of up[0] */
jz .Lstart_dst_aligned
/* Emit one dst limb now and continue with shift count cnt+32, so the
 * loop's stores are 8-byte aligned as well as its loads. */
movq %mm1, %mm0
addl $32, %ecx /* effective shift count becomes cnt+32 */
psrlq %mm6, %mm0
movd %ecx, %mm6
movd %mm0, (%edx) /* one limb to align wp */
addl $4, %edx
.Lstart_dst_aligned:
movq 8(%ebx), %mm3 /* second source quadword */
negl %ecx
movq %mm3, %mm2 /* unshifted copy: supplies next carry bits */
addl $64, %ecx /* %ecx = 64 - shift count */
movd %ecx, %mm7 /* %mm7 = complementary left-shift count */
psrlq %mm6, %mm1
leal -12(%ebx,%eax,4), %ebx /* bases chosen so the negative index in */
leal -20(%edx,%eax,4), %edx /* %eax scans src/dst from low to high */
psllq %mm7, %mm3
subl $7, %eax
por %mm1, %mm3 /* first combined quadword, ready to store */
negl %eax /* loop counter; loop while negative */
jns .Lfinish /* short counts skip straight to the tail */
/* Main loop: 4 limbs (two quadwords) per iteration.  Invariant:
 * %mm3 holds the next quadword to store, %mm2 the unshifted copy of
 * the most recently loaded quadword (carry bits for the next por). */
.align 8, 0x90
.Lunroll_loop:
movq (%ebx,%eax,4), %mm0 /* load next quadword */
psrlq %mm6, %mm2
movq %mm0, %mm1
psllq %mm7, %mm0
movq %mm3, -8(%edx,%eax,4) /* store previously combined quadword */
por %mm2, %mm0
movq 8(%ebx,%eax,4), %mm3 /* load the quadword after that */
psrlq %mm6, %mm1
movq %mm0, (%edx,%eax,4)
movq %mm3, %mm2
psllq %mm7, %mm3
addl $4, %eax
por %mm1, %mm3
js .Lunroll_loop
.Lfinish:
/* Tail: 0..3 limbs remain; the low bits of %eax say how many.  The
 * final stores also depend on whether the dst-alignment fixup bumped
 * the shift count by 32 (tested below via bit 5 of %ecx = 64-shift). */
testb $2, %al
jnz .Lfinish_no_two
/* One more combine step (two limbs). */
movq (%ebx,%eax,4), %mm0
psrlq %mm6, %mm2
movq %mm0, %mm1
psllq %mm7, %mm0
movq %mm3, -8(%edx,%eax,4)
por %mm2, %mm0
movq %mm1, %mm2
movq %mm0, %mm3
addl $2, %eax
.Lfinish_no_two:
testb $1, %al
popl %edi
movd %mm5, %eax /* return value: bits shifted out of up[0] */
jnz .Lfinish_zero
/* One further source limb to fold in before the top stores. */
movd 8(%ebx), %mm0 /* last source limb */
psrlq %mm6, %mm2
movq %mm0, %mm1
psllq %mm7, %mm0
movq %mm3, (%edx)
por %mm2, %mm0
psrlq %mm6, %mm1 /* top result limb, zero-filled from above */
andl $32, %ecx /* bit set iff the dst fixup did NOT run */
popl %ebx
jz .Lfinish_one_unaligned
movd %mm1, 16(%edx) /* extra top limb for the aligned-dst case */
.Lfinish_one_unaligned:
movq %mm0, 8(%edx)
emms /* clear MMX state before returning to FPU-using code */
ret
.Lfinish_zero:
/* No further source limb: store the already-combined quadword and the
 * shifted-out top; store width depends on the alignment case. */
movq %mm3, 4(%edx)
psrlq %mm6, %mm2
movd %mm2, 12(%edx) /* top limb (32 bits) */
andl $32, %ecx /* bit set iff the dst fixup did NOT run */
popl %ebx
jz .Lfinish_zero_unaligned
movq %mm2, 12(%edx) /* full quadword store for the aligned-dst case */
.Lfinish_zero_unaligned:
emms /* clear MMX state before returning to FPU-using code */
ret