 # nss-3.41/nss/lib/freebl/mpi/mpi_x86_os2.s - manifest_repos/nss - Git at Google

 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.

 .data
 .align 4
  #
  # is_sse caches which implementation the entry points in this file use.
  # Each entry point reads it and branches on its sign:
  #   is_sse  < 0 (initial value -1): not probed yet; the entry point calls
  #               _s_mpi_is_sse2 and stores the returned value here.
  #   is_sse == 0: use the plain x86 integer implementation.
  #   is_sse  > 0: use the sse2 implementation.
  #
 .type	is_sse,@object
 .size	is_sse,4
 is_sse: .long	-1

 #
 # GET/PUT load and store a 32-bit global variable.
 #
 # Two variants were written: a position-independent one that addresses
 # the variable relative to the GOT pointer in %ebx (@GOTOFF), and a plain
 # absolute-address one.  The PIC variant and the NO_PIC conditional that
 # selected between them are commented out below, so the absolute-address
 # macros are the ones always in effect in this file.
 #
 #.ifndef NO_PIC
 #.macro GET   var,reg
 #    movl   \var@GOTOFF(%ebx),\reg
 #.endm
 #.macro PUT   reg,var
 #    movl   \reg,\var@GOTOFF(%ebx)
 #.endm
 #.else
 # GET var,reg:  reg = var
 .macro GET   var,reg
     movl   \var,\reg
 .endm
 # PUT reg,var:  var = reg
 .macro PUT   reg,var
     movl   \reg,\var
 .endm
 #.endif

 .text


  #
  #  _s_mpv_mul_d(a, a_len, b, c)
  #
  #  Multiplies the a_len 32-bit words at a by the single word b:
  #      carry = 0
  #      for i in 0 .. a_len-1:
  #          c[i]  = low 32 bits of  (a[i] * b + carry)
  #          carry = high 32 bits of (a[i] * b + carry)
  #      c[a_len] = carry
  #  so a_len + 1 words are stored at c (just the zero carry word when
  #  a_len == 0).  The C-level prototype is not visible in this file; the
  #  argument names are those of the stack-layout notes below.
  #
  #  The entry point dispatches on is_sse (probing once through
  #  _s_mpi_is_sse2 while it is still negative) to one of two
  #  implementations.  Arguments are taken from the stack and the function
  #  returns with a plain ret.
  #
  #  Frame, once either implementation has pushed its registers:
  #  ebp - 12:	caller's ebx	(x86 path only)
  #  ebp - 8:	caller's esi
  #  ebp - 4:	caller's edi
  #  ebp + 0:	caller's ebp
  #  ebp + 4:	return address
  #  ebp + 8:	a	argument
  #  ebp + 12:	a_len	argument
  #  ebp + 16:	b	argument
  #  ebp + 20:	c	argument
  #  registers, x86 path:
  #	eax:	a[i], then low word of the product
  #	ebx:	carry
  #	ecx:	a_len (counts down)
  #	edx:	high word of the product
  #	esi:	a ptr
  #	edi:	c ptr
  #  registers, sse2 path:
  #	mm0:	a[i], then the 64-bit product
  #	mm1:	b
  #	mm2:	running sum / carry
  #	ecx, esi, edi as in the x86 path
 .globl	_s_mpv_mul_d
 .type	_s_mpv_mul_d,@function
 _s_mpv_mul_d:
     GET    is_sse,%eax
     test   %eax,%eax
     jz     _s_mpv_mul_d_x86         # 0: integer implementation
     jg     _s_mpv_mul_d_sse2        # >0: sse2 implementation
     call   _s_mpi_is_sse2           # <0: not probed yet
     PUT    %eax,is_sse              # cache the answer for later calls
     test   %eax,%eax
     jg     _s_mpv_mul_d_sse2
                                     # otherwise fall into the x86 path
 _s_mpv_mul_d_x86:
     push   %ebp
     mov    %esp,%ebp
     push   %edi
     push   %esi
     push   %ebx
     xor    %ebx,%ebx                # carry = 0
     mov    12(%ebp),%ecx            # ecx = a_len
     mov    20(%ebp),%edi            # edi = c
     test   %ecx,%ecx
     jz     2f                       # a_len == 0: only the carry is stored
     mov    8(%ebp),%esi             # esi = a
     cld                             # lodsl/stosl must step upward
 1:
     lodsl                           # eax = *a++
     mull   16(%ebp)                 # edx:eax = a_i * b
     add    %ebx,%eax                # add the incoming carry ...
     adc    $0,%edx                  # ... across the full 64 bits
     mov    %edx,%ebx                # high word is the next carry
     stosl                           # *c++ = eax
     dec    %ecx                     # --a_len
     jnz    1b
 2:
     mov    %ebx,0(%edi)             # c[a_len] = carry
     pop    %ebx
     pop    %esi
     pop    %edi
     leave
     ret
     nop
 _s_mpv_mul_d_sse2:
     push   %ebp
     mov    %esp,%ebp
     push   %edi
     push   %esi
     pxor   %mm2,%mm2                # carry = 0
     mov    12(%ebp),%ecx            # ecx = a_len
     movd   16(%ebp),%mm1            # mm1 = b (zero-extended to 64 bits)
     mov    20(%ebp),%edi            # edi = c
     test   %ecx,%ecx
     jz     6f                       # a_len == 0: only the carry is stored
     mov    8(%ebp),%esi             # esi = a
     cld                             # kept from the original; this loop
                                     # itself uses no string instructions
 5:
     movd   0(%esi),%mm0             # mm0 = a[i]
     add    $4,%esi
     pmuludq %mm1,%mm0               # mm0 = a[i] * b (64-bit product)
     paddq  %mm0,%mm2                # mm2 = carry + product
     movd   %mm2,0(%edi)             # c[i] = low 32 bits
     add    $4,%edi
     psrlq  $32,%mm2                 # keep the high 32 bits as the carry
     dec    %ecx                     # --a_len
     jnz    5b
 6:
     movd   %mm2,0(%edi)             # c[a_len] = carry
     emms                            # leave the x87/MMX state clean
     pop    %esi
     pop    %edi
     leave
     ret
     nop

  #
  #  _s_mpv_mul_d_add(a, a_len, b, c)
  #
  #  Multiply-accumulate of the a_len-word vector a by the single word b
  #  into c:
  #      carry = 0
  #      for i in 0 .. a_len-1:
  #          t     = a[i] * b + carry + c[i]      (fits in 64 bits)
  #          c[i]  = low 32 bits of t
  #          carry = high 32 bits of t
  #      c[a_len] = carry                          (stored, not added)
  #  The C-level prototype is not visible in this file; the argument names
  #  are those of the stack-layout notes below.
  #
  #  The entry point dispatches on is_sse exactly like the other entry
  #  points in this file (probing through _s_mpi_is_sse2 while negative).
  #
  #  Frame, once either implementation has pushed its registers:
  #  ebp - 12:	caller's ebx	(x86 path only)
  #  ebp - 8:	caller's esi
  #  ebp - 4:	caller's edi
  #  ebp + 0:	caller's ebp
  #  ebp + 4:	return address
  #  ebp + 8:	a	argument
  #  ebp + 12:	a_len	argument
  #  ebp + 16:	b	argument
  #  ebp + 20:	c	argument
  #  registers, x86 path:
  #	eax:	a[i], then low word of the running sum
  #	ebx:	carry
  #	ecx:	a_len (counts down)
  #	edx:	high word of the running sum
  #	esi:	a ptr
  #	edi:	c ptr
  #  registers, sse2 path:
  #	mm0:	a[i] / product, then c[i]
  #	mm1:	b
  #	mm2:	running sum / carry
  #	ecx, esi, edi as in the x86 path
 .globl	_s_mpv_mul_d_add
 .type	_s_mpv_mul_d_add,@function
 _s_mpv_mul_d_add:
     GET    is_sse,%eax
     test   %eax,%eax
     jz     _s_mpv_mul_d_add_x86     # 0: integer implementation
     jg     _s_mpv_mul_d_add_sse2    # >0: sse2 implementation
     call   _s_mpi_is_sse2           # <0: not probed yet
     PUT    %eax,is_sse              # cache the answer for later calls
     test   %eax,%eax
     jg     _s_mpv_mul_d_add_sse2
                                     # otherwise fall into the x86 path
 _s_mpv_mul_d_add_x86:
     push   %ebp
     mov    %esp,%ebp
     push   %edi
     push   %esi
     push   %ebx
     xor    %ebx,%ebx                # carry = 0
     mov    12(%ebp),%ecx            # ecx = a_len
     mov    20(%ebp),%edi            # edi = c
     test   %ecx,%ecx
     jz     11f                      # a_len == 0: only the carry is stored
     mov    8(%ebp),%esi             # esi = a
     cld                             # lodsl/stosl must step upward
 10:
     lodsl                           # eax = *a++
     mull   16(%ebp)                 # edx:eax = a_i * b
     add    %ebx,%eax                # add the incoming carry
     adc    $0,%edx
     add    0(%edi),%eax             # add the current word of c
     adc    $0,%edx
     mov    %edx,%ebx                # high word is the next carry
     stosl                           # *c++ = eax
     dec    %ecx                     # --a_len
     jnz    10b
 11:
     mov    %ebx,0(%edi)             # c[a_len] = carry
     pop    %ebx
     pop    %esi
     pop    %edi
     leave
     ret
     nop
 _s_mpv_mul_d_add_sse2:
     push   %ebp
     mov    %esp,%ebp
     push   %edi
     push   %esi
     pxor   %mm2,%mm2                # carry = 0
     mov    12(%ebp),%ecx            # ecx = a_len
     movd   16(%ebp),%mm1            # mm1 = b (zero-extended to 64 bits)
     mov    20(%ebp),%edi            # edi = c
     test   %ecx,%ecx
     jz     16f                      # a_len == 0: only the carry is stored
     mov    8(%ebp),%esi             # esi = a
     cld                             # kept from the original; this loop
                                     # itself uses no string instructions
 15:
     movd   0(%esi),%mm0             # mm0 = a[i]
     add    $4,%esi
     pmuludq %mm1,%mm0               # mm0 = a[i] * b (64-bit product)
     paddq  %mm0,%mm2                # mm2 = carry + product
     movd   0(%edi),%mm0             # mm0 = c[i]
     paddq  %mm0,%mm2                # mm2 += c[i]
     movd   %mm2,0(%edi)             # c[i] = low 32 bits
     add    $4,%edi
     psrlq  $32,%mm2                 # keep the high 32 bits as the carry
     dec    %ecx                     # --a_len
     jnz    15b
 16:
     movd   %mm2,0(%edi)             # c[a_len] = carry
     emms                            # leave the x87/MMX state clean
     pop    %esi
     pop    %edi
     leave
     ret
     nop

  #
  #  _s_mpv_mul_d_add_prop(a, a_len, b, c)
  #
  #  Multiply-accumulate of the a_len-word vector a by the single word b
  #  into c, then propagation of the final carry:
  #      carry = 0
  #      for i in 0 .. a_len-1:
  #          t     = a[i] * b + carry + c[i]      (fits in 64 bits)
  #          c[i]  = low 32 bits of t
  #          carry = high 32 bits of t
  #      if carry != 0: add it to c[a_len], and keep adding 1 to each
  #      following word for as long as the addition carries out.
  #  The propagation loop has no length limit of its own.
  #  NOTE(review): presumably the caller guarantees c has enough words to
  #  absorb the carry -- confirm against the callers.
  #  The C-level prototype is not visible in this file; the argument names
  #  are those of the stack-layout notes below.
  #
  #  Frame, once either implementation has pushed its registers:
  #  ebp - 12:	caller's ebx
  #  ebp - 8:	caller's esi
  #  ebp - 4:	caller's edi
  #  ebp + 0:	caller's ebp
  #  ebp + 4:	return address
  #  ebp + 8:	a	argument
  #  ebp + 12:	a_len	argument
  #  ebp + 16:	b	argument
  #  ebp + 20:	c	argument
  #  registers:
  #	eax:	scratch (a[i], low word, word being propagated into)
  #	ebx:	carry
  #	ecx:	a_len (counts down)
  #	edx:	high word of the running sum (x86 path)
  #	esi:	a ptr
  #	edi:	c ptr
  #	mm0-mm3: a[i]/product, b, running sum, c[i] (sse2 path)
 .globl	_s_mpv_mul_d_add_prop
 .type	_s_mpv_mul_d_add_prop,@function
 _s_mpv_mul_d_add_prop:
     GET    is_sse,%eax
     test   %eax,%eax
     jz     _s_mpv_mul_d_add_prop_x86    # 0: integer implementation
     jg     _s_mpv_mul_d_add_prop_sse2   # >0: sse2 implementation
     call   _s_mpi_is_sse2               # <0: not probed yet
     PUT    %eax,is_sse                  # cache the answer for later calls
     test   %eax,%eax
     jg     _s_mpv_mul_d_add_prop_sse2
                                         # otherwise fall into the x86 path
 _s_mpv_mul_d_add_prop_x86:
     push   %ebp
     mov    %esp,%ebp
     push   %edi
     push   %esi
     push   %ebx
     xor    %ebx,%ebx                # carry = 0
     mov    12(%ebp),%ecx            # ecx = a_len
     mov    20(%ebp),%edi            # edi = c
     test   %ecx,%ecx
     jz     21f                      # a_len == 0: nothing to multiply
     mov    8(%ebp),%esi             # esi = a
     cld                             # lodsl/stosl must step upward
 20:
     lodsl                           # eax = *a++
     mull   16(%ebp)                 # edx:eax = a_i * b
     add    %ebx,%eax                # add the incoming carry
     adc    $0,%edx
     add    0(%edi),%eax             # add the current word of c
     adc    $0,%edx
     mov    %edx,%ebx                # high word is the next carry
     stosl                           # *c++ = eax
     dec    %ecx                     # --a_len
     jnz    20b
 21:
     test   %ebx,%ebx                # any carry left over?
     jz     23f
     mov    0(%edi),%eax
     add    %ebx,%eax                # c[a_len] += carry, sets CF
     stosl                           # store; stosl leaves CF untouched
     jnc    23f
 22:                                 # CF is always set on entry here
     mov    0(%edi),%eax
     adc    $0,%eax                  # ripple the carry one word further
     stosl                           # store; stosl leaves CF untouched
     jc     22b
 23:
     pop    %ebx
     pop    %esi
     pop    %edi
     leave
     ret
     nop
 _s_mpv_mul_d_add_prop_sse2:
     push   %ebp
     mov    %esp,%ebp
     push   %edi
     push   %esi
     push   %ebx
     pxor   %mm2,%mm2                # carry = 0
     mov    12(%ebp),%ecx            # ecx = a_len
     movd   16(%ebp),%mm1            # mm1 = b (zero-extended to 64 bits)
     mov    20(%ebp),%edi            # edi = c
     test   %ecx,%ecx
     jz     26f                      # a_len == 0: nothing to multiply
     mov    8(%ebp),%esi             # esi = a
     cld                             # stosl in the tail must step upward
 25:
     movd   0(%esi),%mm0             # mm0 = a[i]
     movd   0(%edi),%mm3             # mm3 = c[i]
     add    $4,%esi
     pmuludq %mm1,%mm0               # mm0 = a[i] * b (64-bit product)
     paddq  %mm0,%mm2                # mm2 = carry + product
     paddq  %mm3,%mm2                # mm2 += c[i]
     movd   %mm2,0(%edi)             # c[i] = low 32 bits
     add    $4,%edi
     psrlq  $32,%mm2                 # keep the high 32 bits as the carry
     dec    %ecx                     # --a_len
     jnz    25b
 26:
     movd   %mm2,%ebx                # ebx = final carry
     test   %ebx,%ebx                # any carry left over?
     jz     28f
     mov    0(%edi),%eax
     add    %ebx,%eax                # c[a_len] += carry, sets CF
     stosl                           # store; stosl leaves CF untouched
     jnc    28f
 27:                                 # CF is always set on entry here
     mov    0(%edi),%eax
     adc    $0,%eax                  # ripple the carry one word further
     stosl                           # store; stosl leaves CF untouched
     jc     27b
 28:
     emms                            # leave the x87/MMX state clean
     pop    %ebx
     pop    %esi
     pop    %edi
     leave
     ret
     nop


  #
  #  _s_mpv_sqr_add_prop(pa, a_len, ps)
  #
  #  Adds the square of every word of pa into consecutive word pairs of ps
  #  and propagates the final carry:
  #      carry = 0
  #      for i in 0 .. a_len-1:
  #          t          = pa[i] * pa[i] + carry + ps[2i]
  #          ps[2i]     = low 32 bits of t
  #          u          = (t >> 32) + ps[2i+1]
  #          ps[2i+1]   = low 32 bits of u
  #          carry      = u >> 32                  (0 or 1)
  #      if carry != 0: add it to ps[2*a_len], and keep adding 1 to each
  #      following word for as long as the addition carries out.
  #  The propagation loop has no length limit of its own.
  #  NOTE(review): presumably the caller guarantees ps has enough words to
  #  absorb the carry -- confirm against the callers.
  #  The C-level prototype is not visible in this file; the argument names
  #  are those of the stack-layout notes below.
  #
  #  Frame, once either implementation has pushed its registers
  #  (no stack locals are used):
  #  ebp - 12:	caller's ebx
  #  ebp - 8:	caller's esi
  #  ebp - 4:	caller's edi
  #  ebp + 0:	caller's ebp
  #  ebp + 4:	return address
  #  ebp + 8:	pa	argument
  #  ebp + 12:	a_len	argument
  #  ebp + 16:	ps	argument
  #  registers:
  #	eax:	scratch (pa[i], low word, word being stored)
  #	ebx:	carry
  #	ecx:	a_len (counts down)
  #	edx:	high word of the square (x86 path)
  #	esi:	pa ptr
  #	edi:	ps ptr
  #	mm0, mm2, mm3: pa[i]/square, running sum, ps word (sse2 path)

 .globl	_s_mpv_sqr_add_prop
 .type	_s_mpv_sqr_add_prop,@function
 _s_mpv_sqr_add_prop:
     GET    is_sse,%eax
     test   %eax,%eax
     jz     _s_mpv_sqr_add_prop_x86      # 0: integer implementation
     jg     _s_mpv_sqr_add_prop_sse2     # >0: sse2 implementation
     call   _s_mpi_is_sse2               # <0: not probed yet
     PUT    %eax,is_sse                  # cache the answer for later calls
     test   %eax,%eax
     jg     _s_mpv_sqr_add_prop_sse2
                                         # otherwise fall into the x86 path
 _s_mpv_sqr_add_prop_x86:
     push   %ebp
     mov    %esp,%ebp
     push   %edi
     push   %esi
     push   %ebx
     xor    %ebx,%ebx                # carry = 0
     mov    12(%ebp),%ecx            # ecx = a_len
     mov    16(%ebp),%edi            # edi = ps
     test   %ecx,%ecx
     jz     31f                      # a_len == 0: nothing to square
     mov    8(%ebp),%esi             # esi = pa
     cld                             # lodsl/stosl must step upward
 30:
     lodsl                           # eax = *pa++
     mull   %eax                     # edx:eax = pa_i squared
     add    %ebx,%eax                # add the incoming carry
     adc    $0,%edx
     add    0(%edi),%eax             # add the low word already in ps
     stosl                           # store it; CF survives, edi += 4
     adc    0(%edi),%edx             # add the high word of ps plus CF
     movl   $0,%ebx                  # mov, not xor: CF must survive
     mov    %edx,%eax
     adc    $0,%ebx                  # carry = CF from the high-word add
     stosl                           # store the high word; edi += 4
     dec    %ecx                     # --a_len
     jnz    30b
 31:
     test   %ebx,%ebx                # any carry left over?
     jz     34f
     mov    0(%edi),%eax
     add    %ebx,%eax                # ps[2*a_len] += carry, sets CF
     stosl                           # store; stosl leaves CF untouched
     jnc    34f
 32:                                 # CF is always set on entry here
     mov    0(%edi),%eax
     adc    $0,%eax                  # ripple the carry one word further
     stosl                           # store; stosl leaves CF untouched
     jc     32b
 34:
     pop    %ebx
     pop    %esi
     pop    %edi
     leave
     ret
     nop
 _s_mpv_sqr_add_prop_sse2:
     push   %ebp
     mov    %esp,%ebp
     push   %edi
     push   %esi
     push   %ebx
     pxor   %mm2,%mm2                # carry = 0
     mov    12(%ebp),%ecx            # ecx = a_len
     mov    16(%ebp),%edi            # edi = ps
     test   %ecx,%ecx
     jz     36f                      # a_len == 0: nothing to square
     mov    8(%ebp),%esi             # esi = pa
     cld                             # stosl in the tail must step upward
 35:
     movd   0(%esi),%mm0             # mm0 = pa[i]
     movd   0(%edi),%mm3             # mm3 = low word already in ps
     add    $4,%esi
     pmuludq %mm0,%mm0               # mm0 = pa[i] squared (64 bits)
     paddq  %mm0,%mm2                # mm2 = carry + square
     paddq  %mm3,%mm2                # mm2 += low word of ps
     movd   4(%edi),%mm3             # mm3 = high word already in ps
     movd   %mm2,0(%edi)             # store the new low word
     psrlq  $32,%mm2                 # move on to the high half
     paddq  %mm3,%mm2                # mm2 += high word of ps
     movd   %mm2,4(%edi)             # store the new high word
     psrlq  $32,%mm2                 # what remains is the carry
     add    $8,%edi
     dec    %ecx                     # --a_len
     jnz    35b
 36:
     movd   %mm2,%ebx                # ebx = final carry
     test   %ebx,%ebx                # any carry left over?
     jz     38f
     mov    0(%edi),%eax
     add    %ebx,%eax                # ps[2*a_len] += carry, sets CF
     stosl                           # store; stosl leaves CF untouched
     jnc    38f
 37:                                 # CF is always set on entry here
     mov    0(%edi),%eax
     adc    $0,%eax                  # ripple the carry one word further
     stosl                           # store; stosl leaves CF untouched
     jc     37b
 38:
     emms                            # leave the x87/MMX state clean
     pop    %ebx
     pop    %esi
     pop    %edi
     leave
     ret
     nop

  #
  # mp_err _s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
  # 		          mp_digit *qp, mp_digit *rp)
  #
  # Divides the 64-bit value Nhi:Nlo by the 32-bit divisor with a single
  # unsigned div, stores the quotient to *qp and the remainder to *rp and
  # returns 0 in eax.  The original note says the divisor must be
  # normalized so its high bit is 1, and that this code is from NSPR.
  #
  # div raises a divide error when the divisor is 0 or the quotient does
  # not fit in 32 bits (Nhi >= divisor); nothing here guards against that.
  # NOTE(review): presumably callers guarantee Nhi < divisor -- confirm.
  #
  # Stack after the push below:
  #  esp +  0:   caller's ebx
  #  esp +  4:	return address
  #  esp +  8:	Nhi	argument
  #  esp + 12:	Nlo	argument
  #  esp + 16:	divisor	argument
  #  esp + 20:	qp	argument
  #  esp + 24:   rp	argument
  #  registers:
  #	edx:eax: dividend in; remainder (edx) and quotient (eax) out
  #	ebx:	 output pointer (qp, then rp); saved and restored
  #

 .globl	_s_mpv_div_2dx1d
 .type	_s_mpv_div_2dx1d,@function
 _s_mpv_div_2dx1d:
        push   %ebx
        mov    12(%esp),%eax		# eax = Nlo
        mov    8(%esp),%edx		# edx = Nhi
        divl   16(%esp)			# eax = quotient, edx = remainder
        mov    20(%esp),%ebx		# ebx = qp
        mov    %eax,(%ebx)		# *qp = quotient
        mov    24(%esp),%ebx		# ebx = rp
        mov    %edx,(%ebx)		# *rp = remainder
        xor    %eax,%eax		# return zero
        pop    %ebx
        ret
        nop
	#
	# This Source Code Form is subject to the terms of the Mozilla Public
	# License, v. 2.0. If a copy of the MPL was not distributed with this
	# file, You can obtain one at http://mozilla.org/MPL/2.0/.

	.data
	.align 4
	#
	# is_sse caches which implementation the entry points in this file use.
	# Each entry point reads it and branches on its sign:
	#   is_sse  < 0 (initial value -1): not probed yet; the entry point calls
	#               _s_mpi_is_sse2 and stores the returned value here.
	#   is_sse == 0: use the plain x86 integer implementation.
	#   is_sse  > 0: use the sse2 implementation.
	#
	# NOTE(review): is_sse is also defined earlier in this file; the
	# assembler will reject the second definition unless one copy is dropped.
	#
	.type is_sse,@object
	.size is_sse,4
	is_sse: .long -1

	#
	# GET/PUT load and store a 32-bit global variable.
	#
	# Two variants were written: a position-independent one that addresses
	# the variable relative to the GOT pointer in %ebx (@GOTOFF), and a plain
	# absolute-address one.  The PIC variant and the NO_PIC conditional that
	# selected between them are commented out below, so the absolute-address
	# macros are the ones always in effect in this file.
	#
	# NOTE(review): GET and PUT are also defined earlier in this file;
	# redefining a macro is an assembler error unless one copy is dropped.
	#
	#.ifndef NO_PIC
	#.macro GET var,reg
	# movl \var@GOTOFF(%ebx),\reg
	#.endm
	#.macro PUT reg,var
	# movl \reg,\var@GOTOFF(%ebx)
	#.endm
	#.else
	# GET var,reg:  reg = var
	.macro GET var,reg
	movl \var,\reg
	.endm
	# PUT reg,var:  var = reg
	.macro PUT reg,var
	movl \reg,\var
	.endm
	#.endif

	.text


	#
	# _s_mpv_mul_d(a, a_len, b, c)
	#
	# Multiplies the a_len 32-bit words at a by the single word b:
	#     carry = 0
	#     for i in 0 .. a_len-1:
	#         c[i]  = low 32 bits of  (a[i] * b + carry)
	#         carry = high 32 bits of (a[i] * b + carry)
	#     c[a_len] = carry
	# so a_len + 1 words are stored at c (just the zero carry word when
	# a_len == 0).  The C-level prototype is not visible in this file; the
	# argument names are those of the stack-layout notes below.
	#
	# The entry point dispatches on is_sse (probing once through
	# _s_mpi_is_sse2 while it is still negative) to one of two
	# implementations.  Arguments are taken from the stack and the function
	# returns with a plain ret.
	#
	# NOTE(review): _s_mpv_mul_d is also defined earlier in this file; the
	# assembler will reject the duplicate unless one copy is dropped.
	#
	# Frame, once either implementation has pushed its registers:
	# ebp - 12: caller's ebx (x86 path only)
	# ebp - 8: caller's esi
	# ebp - 4: caller's edi
	# ebp + 0: caller's ebp
	# ebp + 4: return address
	# ebp + 8: a argument
	# ebp + 12: a_len argument
	# ebp + 16: b argument
	# ebp + 20: c argument
	# registers, x86 path:
	# eax: a[i], then low word of the product
	# ebx: carry
	# ecx: a_len (counts down)
	# edx: high word of the product
	# esi: a ptr
	# edi: c ptr
	# registers, sse2 path:
	# mm0: a[i], then the 64-bit product
	# mm1: b
	# mm2: running sum / carry
	# ecx, esi, edi as in the x86 path
	.globl _s_mpv_mul_d
	.type _s_mpv_mul_d,@function
	_s_mpv_mul_d:
	    GET    is_sse,%eax
	    test   %eax,%eax
	    jz     _s_mpv_mul_d_x86         # 0: integer implementation
	    jg     _s_mpv_mul_d_sse2        # >0: sse2 implementation
	    call   _s_mpi_is_sse2           # <0: not probed yet
	    PUT    %eax,is_sse              # cache the answer for later calls
	    test   %eax,%eax
	    jg     _s_mpv_mul_d_sse2
	                                    # otherwise fall into the x86 path
	_s_mpv_mul_d_x86:
	    push   %ebp
	    mov    %esp,%ebp
	    push   %edi
	    push   %esi
	    push   %ebx
	    xor    %ebx,%ebx                # carry = 0
	    mov    12(%ebp),%ecx            # ecx = a_len
	    mov    20(%ebp),%edi            # edi = c
	    test   %ecx,%ecx
	    jz     2f                       # a_len == 0: only the carry is stored
	    mov    8(%ebp),%esi             # esi = a
	    cld                             # lodsl/stosl must step upward
	1:
	    lodsl                           # eax = *a++
	    mull   16(%ebp)                 # edx:eax = a_i * b
	    add    %ebx,%eax                # add the incoming carry ...
	    adc    $0,%edx                  # ... across the full 64 bits
	    mov    %edx,%ebx                # high word is the next carry
	    stosl                           # *c++ = eax
	    dec    %ecx                     # --a_len
	    jnz    1b
	2:
	    mov    %ebx,0(%edi)             # c[a_len] = carry
	    pop    %ebx
	    pop    %esi
	    pop    %edi
	    leave
	    ret
	    nop
	_s_mpv_mul_d_sse2:
	    push   %ebp
	    mov    %esp,%ebp
	    push   %edi
	    push   %esi
	    pxor   %mm2,%mm2                # carry = 0
	    mov    12(%ebp),%ecx            # ecx = a_len
	    movd   16(%ebp),%mm1            # mm1 = b (zero-extended to 64 bits)
	    mov    20(%ebp),%edi            # edi = c
	    test   %ecx,%ecx
	    jz     6f                       # a_len == 0: only the carry is stored
	    mov    8(%ebp),%esi             # esi = a
	    cld                             # kept from the original; this loop
	                                    # itself uses no string instructions
	5:
	    movd   0(%esi),%mm0             # mm0 = a[i]
	    add    $4,%esi
	    pmuludq %mm1,%mm0               # mm0 = a[i] * b (64-bit product)
	    paddq  %mm0,%mm2                # mm2 = carry + product
	    movd   %mm2,0(%edi)             # c[i] = low 32 bits
	    add    $4,%edi
	    psrlq  $32,%mm2                 # keep the high 32 bits as the carry
	    dec    %ecx                     # --a_len
	    jnz    5b
	6:
	    movd   %mm2,0(%edi)             # c[a_len] = carry
	    emms                            # leave the x87/MMX state clean
	    pop    %esi
	    pop    %edi
	    leave
	    ret
	    nop

	#
	# _s_mpv_mul_d_add(a, a_len, b, c)
	#
	# Multiply-accumulate of the a_len-word vector a by the single word b
	# into c:
	#     carry = 0
	#     for i in 0 .. a_len-1:
	#         t     = a[i] * b + carry + c[i]      (fits in 64 bits)
	#         c[i]  = low 32 bits of t
	#         carry = high 32 bits of t
	#     c[a_len] = carry                          (stored, not added)
	# The C-level prototype is not visible in this file; the argument names
	# are those of the stack-layout notes below.
	#
	# The entry point dispatches on is_sse exactly like the other entry
	# points in this file (probing through _s_mpi_is_sse2 while negative).
	#
	# NOTE(review): _s_mpv_mul_d_add is also defined earlier in this file;
	# the assembler will reject the duplicate unless one copy is dropped.
	#
	# Frame, once either implementation has pushed its registers:
	# ebp - 12: caller's ebx (x86 path only)
	# ebp - 8: caller's esi
	# ebp - 4: caller's edi
	# ebp + 0: caller's ebp
	# ebp + 4: return address
	# ebp + 8: a argument
	# ebp + 12: a_len argument
	# ebp + 16: b argument
	# ebp + 20: c argument
	# registers, x86 path:
	# eax: a[i], then low word of the running sum
	# ebx: carry
	# ecx: a_len (counts down)
	# edx: high word of the running sum
	# esi: a ptr
	# edi: c ptr
	# registers, sse2 path:
	# mm0: a[i] / product, then c[i]
	# mm1: b
	# mm2: running sum / carry
	# ecx, esi, edi as in the x86 path
	.globl _s_mpv_mul_d_add
	.type _s_mpv_mul_d_add,@function
	_s_mpv_mul_d_add:
	    GET    is_sse,%eax
	    test   %eax,%eax
	    jz     _s_mpv_mul_d_add_x86     # 0: integer implementation
	    jg     _s_mpv_mul_d_add_sse2    # >0: sse2 implementation
	    call   _s_mpi_is_sse2           # <0: not probed yet
	    PUT    %eax,is_sse              # cache the answer for later calls
	    test   %eax,%eax
	    jg     _s_mpv_mul_d_add_sse2
	                                    # otherwise fall into the x86 path
	_s_mpv_mul_d_add_x86:
	    push   %ebp
	    mov    %esp,%ebp
	    push   %edi
	    push   %esi
	    push   %ebx
	    xor    %ebx,%ebx                # carry = 0
	    mov    12(%ebp),%ecx            # ecx = a_len
	    mov    20(%ebp),%edi            # edi = c
	    test   %ecx,%ecx
	    jz     11f                      # a_len == 0: only the carry is stored
	    mov    8(%ebp),%esi             # esi = a
	    cld                             # lodsl/stosl must step upward
	10:
	    lodsl                           # eax = *a++
	    mull   16(%ebp)                 # edx:eax = a_i * b
	    add    %ebx,%eax                # add the incoming carry
	    adc    $0,%edx
	    add    0(%edi),%eax             # add the current word of c
	    adc    $0,%edx
	    mov    %edx,%ebx                # high word is the next carry
	    stosl                           # *c++ = eax
	    dec    %ecx                     # --a_len
	    jnz    10b
	11:
	    mov    %ebx,0(%edi)             # c[a_len] = carry
	    pop    %ebx
	    pop    %esi
	    pop    %edi
	    leave
	    ret
	    nop
	_s_mpv_mul_d_add_sse2:
	    push   %ebp
	    mov    %esp,%ebp
	    push   %edi
	    push   %esi
	    pxor   %mm2,%mm2                # carry = 0
	    mov    12(%ebp),%ecx            # ecx = a_len
	    movd   16(%ebp),%mm1            # mm1 = b (zero-extended to 64 bits)
	    mov    20(%ebp),%edi            # edi = c
	    test   %ecx,%ecx
	    jz     16f                      # a_len == 0: only the carry is stored
	    mov    8(%ebp),%esi             # esi = a
	    cld                             # kept from the original; this loop
	                                    # itself uses no string instructions
	15:
	    movd   0(%esi),%mm0             # mm0 = a[i]
	    add    $4,%esi
	    pmuludq %mm1,%mm0               # mm0 = a[i] * b (64-bit product)
	    paddq  %mm0,%mm2                # mm2 = carry + product
	    movd   0(%edi),%mm0             # mm0 = c[i]
	    paddq  %mm0,%mm2                # mm2 += c[i]
	    movd   %mm2,0(%edi)             # c[i] = low 32 bits
	    add    $4,%edi
	    psrlq  $32,%mm2                 # keep the high 32 bits as the carry
	    dec    %ecx                     # --a_len
	    jnz    15b
	16:
	    movd   %mm2,0(%edi)             # c[a_len] = carry
	    emms                            # leave the x87/MMX state clean
	    pop    %esi
	    pop    %edi
	    leave
	    ret
	    nop

	#
	# _s_mpv_mul_d_add_prop(a, a_len, b, c)
	#
	# Multiply-accumulate of the a_len-word vector a by the single word b
	# into c, then propagation of the final carry:
	#     carry = 0
	#     for i in 0 .. a_len-1:
	#         t     = a[i] * b + carry + c[i]      (fits in 64 bits)
	#         c[i]  = low 32 bits of t
	#         carry = high 32 bits of t
	#     if carry != 0: add it to c[a_len], and keep adding 1 to each
	#     following word for as long as the addition carries out.
	# The propagation loop has no length limit of its own.
	# NOTE(review): presumably the caller guarantees c has enough words to
	# absorb the carry -- confirm against the callers.
	# The C-level prototype is not visible in this file; the argument names
	# are those of the stack-layout notes below.
	#
	# NOTE(review): _s_mpv_mul_d_add_prop is also defined earlier in this
	# file; the assembler will reject the duplicate unless one copy is
	# dropped.
	#
	# Frame, once either implementation has pushed its registers:
	# ebp - 12: caller's ebx
	# ebp - 8: caller's esi
	# ebp - 4: caller's edi
	# ebp + 0: caller's ebp
	# ebp + 4: return address
	# ebp + 8: a argument
	# ebp + 12: a_len argument
	# ebp + 16: b argument
	# ebp + 20: c argument
	# registers:
	# eax: scratch (a[i], low word, word being propagated into)
	# ebx: carry
	# ecx: a_len (counts down)
	# edx: high word of the running sum (x86 path)
	# esi: a ptr
	# edi: c ptr
	# mm0-mm3: a[i]/product, b, running sum, c[i] (sse2 path)
	.globl _s_mpv_mul_d_add_prop
	.type _s_mpv_mul_d_add_prop,@function
	_s_mpv_mul_d_add_prop:
	    GET    is_sse,%eax
	    test   %eax,%eax
	    jz     _s_mpv_mul_d_add_prop_x86    # 0: integer implementation
	    jg     _s_mpv_mul_d_add_prop_sse2   # >0: sse2 implementation
	    call   _s_mpi_is_sse2               # <0: not probed yet
	    PUT    %eax,is_sse                  # cache the answer for later calls
	    test   %eax,%eax
	    jg     _s_mpv_mul_d_add_prop_sse2
	                                        # otherwise fall into the x86 path
	_s_mpv_mul_d_add_prop_x86:
	    push   %ebp
	    mov    %esp,%ebp
	    push   %edi
	    push   %esi
	    push   %ebx
	    xor    %ebx,%ebx                # carry = 0
	    mov    12(%ebp),%ecx            # ecx = a_len
	    mov    20(%ebp),%edi            # edi = c
	    test   %ecx,%ecx
	    jz     21f                      # a_len == 0: nothing to multiply
	    mov    8(%ebp),%esi             # esi = a
	    cld                             # lodsl/stosl must step upward
	20:
	    lodsl                           # eax = *a++
	    mull   16(%ebp)                 # edx:eax = a_i * b
	    add    %ebx,%eax                # add the incoming carry
	    adc    $0,%edx
	    add    0(%edi),%eax             # add the current word of c
	    adc    $0,%edx
	    mov    %edx,%ebx                # high word is the next carry
	    stosl                           # *c++ = eax
	    dec    %ecx                     # --a_len
	    jnz    20b
	21:
	    test   %ebx,%ebx                # any carry left over?
	    jz     23f
	    mov    0(%edi),%eax
	    add    %ebx,%eax                # c[a_len] += carry, sets CF
	    stosl                           # store; stosl leaves CF untouched
	    jnc    23f
	22:                                 # CF is always set on entry here
	    mov    0(%edi),%eax
	    adc    $0,%eax                  # ripple the carry one word further
	    stosl                           # store; stosl leaves CF untouched
	    jc     22b
	23:
	    pop    %ebx
	    pop    %esi
	    pop    %edi
	    leave
	    ret
	    nop
	_s_mpv_mul_d_add_prop_sse2:
	    push   %ebp
	    mov    %esp,%ebp
	    push   %edi
	    push   %esi
	    push   %ebx
	    pxor   %mm2,%mm2                # carry = 0
	    mov    12(%ebp),%ecx            # ecx = a_len
	    movd   16(%ebp),%mm1            # mm1 = b (zero-extended to 64 bits)
	    mov    20(%ebp),%edi            # edi = c
	    test   %ecx,%ecx
	    jz     26f                      # a_len == 0: nothing to multiply
	    mov    8(%ebp),%esi             # esi = a
	    cld                             # stosl in the tail must step upward
	25:
	    movd   0(%esi),%mm0             # mm0 = a[i]
	    movd   0(%edi),%mm3             # mm3 = c[i]
	    add    $4,%esi
	    pmuludq %mm1,%mm0               # mm0 = a[i] * b (64-bit product)
	    paddq  %mm0,%mm2                # mm2 = carry + product
	    paddq  %mm3,%mm2                # mm2 += c[i]
	    movd   %mm2,0(%edi)             # c[i] = low 32 bits
	    add    $4,%edi
	    psrlq  $32,%mm2                 # keep the high 32 bits as the carry
	    dec    %ecx                     # --a_len
	    jnz    25b
	26:
	    movd   %mm2,%ebx                # ebx = final carry
	    test   %ebx,%ebx                # any carry left over?
	    jz     28f
	    mov    0(%edi),%eax
	    add    %ebx,%eax                # c[a_len] += carry, sets CF
	    stosl                           # store; stosl leaves CF untouched
	    jnc    28f
	27:                                 # CF is always set on entry here
	    mov    0(%edi),%eax
	    adc    $0,%eax                  # ripple the carry one word further
	    stosl                           # store; stosl leaves CF untouched
	    jc     27b
	28:
	    emms                            # leave the x87/MMX state clean
	    pop    %ebx
	    pop    %esi
	    pop    %edi
	    leave
	    ret
	    nop


	#
	# _s_mpv_sqr_add_prop(pa, a_len, ps)
	#
	# Adds the square of every word of pa into consecutive word pairs of ps
	# and propagates the final carry:
	#     carry = 0
	#     for i in 0 .. a_len-1:
	#         t          = pa[i] * pa[i] + carry + ps[2i]
	#         ps[2i]     = low 32 bits of t
	#         u          = (t >> 32) + ps[2i+1]
	#         ps[2i+1]   = low 32 bits of u
	#         carry      = u >> 32                  (0 or 1)
	#     if carry != 0: add it to ps[2*a_len], and keep adding 1 to each
	#     following word for as long as the addition carries out.
	# The propagation loop has no length limit of its own.
	# NOTE(review): presumably the caller guarantees ps has enough words to
	# absorb the carry -- confirm against the callers.
	# The C-level prototype is not visible in this file; the argument names
	# are those of the stack-layout notes below.
	#
	# NOTE(review): _s_mpv_sqr_add_prop is also defined earlier in this
	# file; the assembler will reject the duplicate unless one copy is
	# dropped.
	#
	# Frame, once either implementation has pushed its registers
	# (no stack locals are used):
	# ebp - 12: caller's ebx
	# ebp - 8: caller's esi
	# ebp - 4: caller's edi
	# ebp + 0: caller's ebp
	# ebp + 4: return address
	# ebp + 8: pa argument
	# ebp + 12: a_len argument
	# ebp + 16: ps argument
	# registers:
	# eax: scratch (pa[i], low word, word being stored)
	# ebx: carry
	# ecx: a_len (counts down)
	# edx: high word of the square (x86 path)
	# esi: pa ptr
	# edi: ps ptr
	# mm0, mm2, mm3: pa[i]/square, running sum, ps word (sse2 path)

	.globl _s_mpv_sqr_add_prop
	.type _s_mpv_sqr_add_prop,@function
	_s_mpv_sqr_add_prop:
	    GET    is_sse,%eax
	    test   %eax,%eax
	    jz     _s_mpv_sqr_add_prop_x86      # 0: integer implementation
	    jg     _s_mpv_sqr_add_prop_sse2     # >0: sse2 implementation
	    call   _s_mpi_is_sse2               # <0: not probed yet
	    PUT    %eax,is_sse                  # cache the answer for later calls
	    test   %eax,%eax
	    jg     _s_mpv_sqr_add_prop_sse2
	                                        # otherwise fall into the x86 path
	_s_mpv_sqr_add_prop_x86:
	    push   %ebp
	    mov    %esp,%ebp
	    push   %edi
	    push   %esi
	    push   %ebx
	    xor    %ebx,%ebx                # carry = 0
	    mov    12(%ebp),%ecx            # ecx = a_len
	    mov    16(%ebp),%edi            # edi = ps
	    test   %ecx,%ecx
	    jz     31f                      # a_len == 0: nothing to square
	    mov    8(%ebp),%esi             # esi = pa
	    cld                             # lodsl/stosl must step upward
	30:
	    lodsl                           # eax = *pa++
	    mull   %eax                     # edx:eax = pa_i squared
	    add    %ebx,%eax                # add the incoming carry
	    adc    $0,%edx
	    add    0(%edi),%eax             # add the low word already in ps
	    stosl                           # store it; CF survives, edi += 4
	    adc    0(%edi),%edx             # add the high word of ps plus CF
	    movl   $0,%ebx                  # mov, not xor: CF must survive
	    mov    %edx,%eax
	    adc    $0,%ebx                  # carry = CF from the high-word add
	    stosl                           # store the high word; edi += 4
	    dec    %ecx                     # --a_len
	    jnz    30b
	31:
	    test   %ebx,%ebx                # any carry left over?
	    jz     34f
	    mov    0(%edi),%eax
	    add    %ebx,%eax                # ps[2*a_len] += carry, sets CF
	    stosl                           # store; stosl leaves CF untouched
	    jnc    34f
	32:                                 # CF is always set on entry here
	    mov    0(%edi),%eax
	    adc    $0,%eax                  # ripple the carry one word further
	    stosl                           # store; stosl leaves CF untouched
	    jc     32b
	34:
	    pop    %ebx
	    pop    %esi
	    pop    %edi
	    leave
	    ret
	    nop
	_s_mpv_sqr_add_prop_sse2:
	    push   %ebp
	    mov    %esp,%ebp
	    push   %edi
	    push   %esi
	    push   %ebx
	    pxor   %mm2,%mm2                # carry = 0
	    mov    12(%ebp),%ecx            # ecx = a_len
	    mov    16(%ebp),%edi            # edi = ps
	    test   %ecx,%ecx
	    jz     36f                      # a_len == 0: nothing to square
	    mov    8(%ebp),%esi             # esi = pa
	    cld                             # stosl in the tail must step upward
	35:
	    movd   0(%esi),%mm0             # mm0 = pa[i]
	    movd   0(%edi),%mm3             # mm3 = low word already in ps
	    add    $4,%esi
	    pmuludq %mm0,%mm0               # mm0 = pa[i] squared (64 bits)
	    paddq  %mm0,%mm2                # mm2 = carry + square
	    paddq  %mm3,%mm2                # mm2 += low word of ps
	    movd   4(%edi),%mm3             # mm3 = high word already in ps
	    movd   %mm2,0(%edi)             # store the new low word
	    psrlq  $32,%mm2                 # move on to the high half
	    paddq  %mm3,%mm2                # mm2 += high word of ps
	    movd   %mm2,4(%edi)             # store the new high word
	    psrlq  $32,%mm2                 # what remains is the carry
	    add    $8,%edi
	    dec    %ecx                     # --a_len
	    jnz    35b
	36:
	    movd   %mm2,%ebx                # ebx = final carry
	    test   %ebx,%ebx                # any carry left over?
	    jz     38f
	    mov    0(%edi),%eax
	    add    %ebx,%eax                # ps[2*a_len] += carry, sets CF
	    stosl                           # store; stosl leaves CF untouched
	    jnc    38f
	37:                                 # CF is always set on entry here
	    mov    0(%edi),%eax
	    adc    $0,%eax                  # ripple the carry one word further
	    stosl                           # store; stosl leaves CF untouched
	    jc     37b
	38:
	    emms                            # leave the x87/MMX state clean
	    pop    %ebx
	    pop    %esi
	    pop    %edi
	    leave
	    ret
	    nop

	#
	# mp_err _s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
	#                         mp_digit *qp, mp_digit *rp)
	#
	# Divides the 64-bit value Nhi:Nlo by the 32-bit divisor with a single
	# unsigned div, stores the quotient to *qp and the remainder to *rp and
	# returns 0 in eax.  The original note says the divisor must be
	# normalized so its high bit is 1, and that this code is from NSPR.
	#
	# div raises a divide error when the divisor is 0 or the quotient does
	# not fit in 32 bits (Nhi >= divisor); nothing here guards against that.
	# NOTE(review): presumably callers guarantee Nhi < divisor -- confirm.
	#
	# NOTE(review): _s_mpv_div_2dx1d is also defined earlier in this file;
	# the assembler will reject the duplicate unless one copy is dropped.
	#
	# Stack after the push below:
	# esp + 0: caller's ebx
	# esp + 4: return address
	# esp + 8: Nhi argument
	# esp + 12: Nlo argument
	# esp + 16: divisor argument
	# esp + 20: qp argument
	# esp + 24: rp argument
	# registers:
	# edx:eax: dividend in; remainder (edx) and quotient (eax) out
	# ebx: output pointer (qp, then rp); saved and restored
	#

	.globl _s_mpv_div_2dx1d
	.type _s_mpv_div_2dx1d,@function
	_s_mpv_div_2dx1d:
	    push   %ebx
	    mov    12(%esp),%eax            # eax = Nlo
	    mov    8(%esp),%edx             # edx = Nhi
	    divl   16(%esp)                 # eax = quotient, edx = remainder
	    mov    20(%esp),%ebx            # ebx = qp
	    mov    %eax,(%ebx)              # *qp = quotient
	    mov    24(%esp),%ebx            # ebx = rp
	    mov    %edx,(%ebx)              # *rp = remainder
	    xor    %eax,%eax                # return zero
	    pop    %ebx
	    ret
	    nop