 ; nss-3.41/nss/lib/freebl/mpi/hppa20.s - manifest_repos/nss - Git at Google

 ; This Source Code Form is subject to the terms of the Mozilla Public
 ; License, v. 2.0. If a copy of the MPL was not distributed with this
 ; file, You can obtain one at http://mozilla.org/MPL/2.0/.

 ; Assembler setup.  The architecture level is 2.0W when __LP64__ is
 ; defined and 2.0 otherwise.  Everything that follows is placed in the
 ; $CODE$ subspace of the $TEXT$ space.
 #ifdef __LP64__
         .LEVEL   2.0W
 #else
 ;       .LEVEL   1.1
 ;       .ALLOW   2.0N
         .LEVEL   2.0
 #endif
         .SPACE   $TEXT$,SORT=8
         .SUBSPA  $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24

 ; ***************************************************************
 ;
 ;                 maxpy_[little/big]
 ;
 ; ***************************************************************

 ; Select the addressing direction.  Little-wordian is the default; build
 ; with -DBIG_WORDIAN to get the big-wordian routines instead.  If both
 ; macros are defined, little-wordian wins, which matches the way the
 ; entry-point labels and .EXPORT directives in this file are selected
 ; (they all test LITTLE_WORDIAN first).
 #ifndef LITTLE_WORDIAN
 #ifndef BIG_WORDIAN
 #define LITTLE_WORDIAN 1
 #endif
 #endif

 ; Byte offsets used to step through the vectors.  The plain names move
 ; toward more significant words, the UN_ names move back toward less
 ; significant words.  Exactly one of the two sets is defined, so the
 ; macros are never redefined with conflicting values.
 #ifdef LITTLE_WORDIAN
 #define EIGHT 8
 #define SIXTEEN 16
 #define THIRTY_TWO 32
 #define UN_EIGHT -8
 #define UN_SIXTEEN -16
 #define UN_TWENTY_FOUR -24
 #else
 #define EIGHT -8
 #define SIXTEEN -16
 #define THIRTY_TWO -32
 #define UN_EIGHT 8
 #define UN_SIXTEEN 16
 #define UN_TWENTY_FOUR 24
 #endif

 ; This performs a multiple-precision integer version of "daxpy",
 ; Using the selected addressing direction.  "Little-wordian" means that
 ; the least significant word of a number is stored at the lowest address.
 ; "Big-wordian" means that the most significant word is at the lowest
 ; address.  Either way, the incoming address of the vector is that
 ; of the least significant word.  That means that, for little-wordian
 ; addressing, we move the address upward as we propagate carries
 ; from the least significant word to the most significant.  For
 ; big-wordian we move the address downward.

 ; We use the following registers:
 ;
 ;     r2   return PC, of course
 ;     r26 = arg1 =  length
 ;     r25 = arg2 =  address of scalar
 ;     r24 = arg3 =  multiplicand vector
 ;     r23 = arg4 =  result vector
 ;
 ;     fr9 = scalar loaded once only from r25

 ; The cycle counts shown in the bodies below are simply the result of a
 ; scheduling by hand.  The actual PCX-U hardware does it differently.
 ; The intention is that the overall speed is the same.

 ; The pipeline startup and shutdown code is constructed in the usual way,
 ; by taking the loop bodies and removing unnecessary instructions.
 ; We have left the comments describing cycle numbers in the code.
 ; These are intended for reference when comparing with the main loop,
 ; and have no particular relationship to actual cycle numbers.

 ; maxpy_little / maxpy_big
 ;
 ; Multiplies a vector of doublewords by one doubleword scalar and adds
 ; the product into the result vector, then propagates any final carry.
 ;
 ;   In:   r26 = length N, r25 = address of scalar,
 ;         r24 = multiplicand vector, r23 = result vector
 ;   Uses: r1, r3, r4, r19-r22, r28, r29, r31 as integer temporaries and
 ;         fr9, fr24-fr31 as floating-point temporaries.
 ;   Stack: r3 and r4 are saved at entry and reloaded at $L0.  The slots
 ;         -48(%sp) through -104(%sp) carry partial products from the
 ;         floating-point side (FSTD) to the integer side (LDD).
 ;
 ; Each doubleword product is built from four XMPYU partial products
 ; (scalar R/L halves times multiplicand R/L halves).  The two cross
 ; products are summed and split with SHRPD ...,32 before being added
 ; to the low and high products.
 ;
 ; NOTE(review): r3 and r4 are saved with word stores (STW) but are
 ; loaded with doubleword loads (LDD) in the body -- confirm that the
 ; upper halves need not be preserved for the caller.
 #ifdef LITTLE_WORDIAN
 maxpy_little
 #else
 maxpy_big
 #endif
         .PROC
         .CALLINFO FRAME=120,ENTRY_GR=4
         .ENTRY
 ; Save r3 at the old sp and advance sp by 128; r4 goes 4 bytes above it.
         STW,MA  %r3,128(%sp)
         STW     %r4,-124(%sp)

 ; r26 becomes N-1 here.  The FLDD is in the delay slot of the branch, so
 ; the scalar is loaded on the N = 0 path as well.
         ADDIB,< -1,%r26,$L0         ; If N = 0, exit immediately.
         FLDD    0(%r25),%fr9        ; fr9 = scalar

 ; First startup

         FLDD    0(%r24),%fr24       ; Cycle 1
         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
 ; r26 holds N-1, so the branch is taken when N-1 < 3.
         CMPIB,> 3,%r26,$N_IS_SMALL  ; Pick out cases N = 1, 2, or 3
         XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
         FLDD    EIGHT(%r24),%fr28   ; Cycle 8
         XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
         FSTD    %fr24,-96(%sp)
         XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
         FSTD    %fr25,-80(%sp)
         LDO     SIXTEEN(%r24),%r24  ; Cycle 12
         FSTD    %fr31,-64(%sp)
         XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
         FSTD    %fr27,-48(%sp)

 ; Second startup

         XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
         FSTD    %fr30,-56(%sp)
         FLDD    0(%r24),%fr24

         FSTD    %fr26,-88(%sp)      ; Cycle 2

         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
         FSTD    %fr28,-104(%sp)

         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
         LDD     -96(%sp),%r3
         FSTD    %fr29,-72(%sp)

         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
         LDD     -64(%sp),%r19
         LDD     -80(%sp),%r21

         XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
         LDD     -56(%sp),%r20
         ADD     %r21,%r3,%r3

         ADD,DC  %r20,%r19,%r19      ; Cycle 7
         LDD     -88(%sp),%r4
         SHRPD   %r3,%r0,32,%r21
         LDD     -48(%sp),%r1

         FLDD    EIGHT(%r24),%fr28   ; Cycle 8
         LDD     -104(%sp),%r31
         ADD,DC  %r0,%r0,%r20
         SHRPD   %r19,%r3,32,%r3

         LDD     -72(%sp),%r29       ; Cycle 9
         SHRPD   %r20,%r19,32,%r20
         ADD     %r21,%r1,%r1

         XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
         ADD,DC  %r3,%r4,%r4
         FSTD    %fr24,-96(%sp)

         XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
         ADD,DC  %r0,%r20,%r20
         LDD     0(%r23),%r3
         FSTD    %fr25,-80(%sp)

         LDO     SIXTEEN(%r24),%r24  ; Cycle 12
         FSTD    %fr31,-64(%sp)

         XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
         ADD     %r0,%r0,%r0         ; clear the carry bit
 ; r26 drops by 4 here; skip the main loop when the result is <= 0.
         ADDIB,<= -4,%r26,$ENDLOOP   ; actually happens in cycle 12
         FSTD    %fr27,-48(%sp)
 ;        MFCTL   %cr16,%r21         ; for timing
 ;        STD     %r21,-112(%sp)

 ; Here is the loop.
 ; Each pass advances both r24 and r23 by SIXTEEN (two doublewords) and
 ; decrements r26 by 2.  The ADD,DC at the top of the loop consumes the
 ; carry left by the plain ADD in cycle 13 of the previous pass.

 $LOOP   XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
         ADD,DC  %r29,%r4,%r4
         FSTD    %fr30,-56(%sp)
         FLDD    0(%r24),%fr24

         LDO     SIXTEEN(%r23),%r23  ; Cycle 2
         ADD,DC  %r0,%r20,%r20
         FSTD    %fr26,-88(%sp)

         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
         ADD     %r3,%r1,%r1
         FSTD    %fr28,-104(%sp)
         LDD     UN_EIGHT(%r23),%r21

         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
         ADD,DC  %r21,%r4,%r28
         FSTD    %fr29,-72(%sp)
         LDD     -96(%sp),%r3

         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
         ADD,DC  %r20,%r31,%r22
         LDD     -64(%sp),%r19
         LDD     -80(%sp),%r21

         XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
         ADD     %r21,%r3,%r3
         LDD     -56(%sp),%r20
         STD     %r1,UN_SIXTEEN(%r23)

         ADD,DC  %r20,%r19,%r19      ; Cycle 7
         SHRPD   %r3,%r0,32,%r21
         LDD     -88(%sp),%r4
         LDD     -48(%sp),%r1

         ADD,DC  %r0,%r0,%r20        ; Cycle 8
         SHRPD   %r19,%r3,32,%r3
         FLDD    EIGHT(%r24),%fr28
         LDD     -104(%sp),%r31

         SHRPD   %r20,%r19,32,%r20   ; Cycle 9
         ADD     %r21,%r1,%r1
         STD     %r28,UN_EIGHT(%r23)
         LDD     -72(%sp),%r29

         XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
         ADD,DC  %r3,%r4,%r4
         FSTD    %fr24,-96(%sp)

         XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
         ADD,DC  %r0,%r20,%r20
         FSTD    %fr25,-80(%sp)
         LDD     0(%r23),%r3

         LDO     SIXTEEN(%r24),%r24  ; Cycle 12
         FSTD    %fr31,-64(%sp)

         XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
         ADD     %r22,%r1,%r1
         ADDIB,> -2,%r26,$LOOP       ; actually happens in cycle 12
         FSTD    %fr27,-48(%sp)

 $ENDLOOP

 ; Shutdown code, first stage.
 ; A counter of exactly 0 means the unrolled loop left one extra word to
 ; process; that case is finished at $ONEMORE.

 ;        MFCTL   %cr16,%r21         ; for timing
 ;        STD     %r21,UN_SIXTEEN(%r23)
 ;        LDD     -112(%sp),%r21
 ;        STD     %r21,UN_EIGHT(%r23)

         XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
         ADD,DC  %r29,%r4,%r4
         CMPIB,= 0,%r26,$ONEMORE
         FSTD    %fr30,-56(%sp)

         LDO     SIXTEEN(%r23),%r23  ; Cycle 2
         ADD,DC  %r0,%r20,%r20
         FSTD    %fr26,-88(%sp)

         ADD     %r3,%r1,%r1         ; Cycle 3
         FSTD    %fr28,-104(%sp)
         LDD     UN_EIGHT(%r23),%r21

         ADD,DC  %r21,%r4,%r28       ; Cycle 4
         FSTD    %fr29,-72(%sp)
         STD     %r28,UN_EIGHT(%r23) ; moved up from cycle 9
         LDD     -96(%sp),%r3

         ADD,DC  %r20,%r31,%r22      ; Cycle 5
         STD     %r1,UN_SIXTEEN(%r23)
 ; Entry point for the N = 2 case, which arrives with r22 = 0 and r3
 ; already loaded from -96(%sp).
 $JOIN4
         LDD     -64(%sp),%r19
         LDD     -80(%sp),%r21

         ADD     %r21,%r3,%r3        ; Cycle 6
         LDD     -56(%sp),%r20

         ADD,DC  %r20,%r19,%r19      ; Cycle 7
         SHRPD   %r3,%r0,32,%r21
         LDD     -88(%sp),%r4
         LDD     -48(%sp),%r1

         ADD,DC  %r0,%r0,%r20        ; Cycle 8
         SHRPD   %r19,%r3,32,%r3
         LDD     -104(%sp),%r31

         SHRPD   %r20,%r19,32,%r20   ; Cycle 9
         ADD     %r21,%r1,%r1
         LDD     -72(%sp),%r29

         ADD,DC  %r3,%r4,%r4         ; Cycle 10

         ADD,DC  %r0,%r20,%r20       ; Cycle 11
         LDD     0(%r23),%r3

         ADD     %r22,%r1,%r1        ; Cycle 13

 ; Shutdown code, second stage.

         ADD,DC  %r29,%r4,%r4        ; Cycle 1

         LDO     SIXTEEN(%r23),%r23  ; Cycle 2
         ADD,DC  %r0,%r20,%r20

         LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
         ADD     %r3,%r1,%r1

         ADD,DC  %r21,%r4,%r28       ; Cycle 4

         ADD,DC  %r20,%r31,%r22      ; Cycle 5

         STD     %r1,UN_SIXTEEN(%r23); Cycle 6

         STD     %r28,UN_EIGHT(%r23) ; Cycle 9

         LDD     0(%r23),%r3         ; Cycle 11

 ; Shutdown code, third stage.

         LDO     SIXTEEN(%r23),%r23
         ADD     %r3,%r22,%r1
 ; r21 captures the carry out of the last add.  The final word in r1 is
 ; stored from the delay slot whether or not the branch is taken.
 $JOIN1  ADD,DC  %r0,%r0,%r21
         CMPIB,*= 0,%r21,$L0         ; if no overflow, exit
         STD     %r1,UN_SIXTEEN(%r23)

 ; Final carry propagation
 ; Adds 1 to successive result words for as long as the incremented word
 ; wraps to zero.  The store sits in the delay slot of the loop branch.

 $FINAL1 LDO     EIGHT(%r23),%r23
         LDD     UN_SIXTEEN(%r23),%r21
         ADDI    1,%r21,%r21
         CMPIB,*= 0,%r21,$FINAL1     ; Keep looping if there is a carry.
         STD     %r21,UN_SIXTEEN(%r23)
         B       $L0
         NOP

 ; Here is the code that handles the difficult cases N=1, N=2, and N=3.
 ; We do the usual trick -- branch out of the startup code at appropriate
 ; points, and branch into the shutdown code.

 ; r26 holds N-1 on entry: 0 selects $N_IS_ONE, 2 selects $N_IS_THREE,
 ; and the remaining value falls through to the N = 2 code.
 $N_IS_SMALL
         CMPIB,= 0,%r26,$N_IS_ONE
         FSTD    %fr24,-96(%sp)      ; Cycle 10
         FLDD    EIGHT(%r24),%fr28   ; Cycle 8
         XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
         XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
         FSTD    %fr25,-80(%sp)
         FSTD    %fr31,-64(%sp)      ; Cycle 12
         XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
         FSTD    %fr27,-48(%sp)
         XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
         CMPIB,= 2,%r26,$N_IS_THREE
         FSTD    %fr30,-56(%sp)

 ; N = 2
         FSTD    %fr26,-88(%sp)      ; Cycle 2
         FSTD    %fr28,-104(%sp)     ; Cycle 3
         LDD     -96(%sp),%r3        ; Cycle 4
         FSTD    %fr29,-72(%sp)
 ; The delay-slot ADD zeroes r22 and clears the carry before joining.
         B       $JOIN4
         ADD     %r0,%r0,%r22

 $N_IS_THREE
         FLDD    SIXTEEN(%r24),%fr24
         FSTD    %fr26,-88(%sp)      ; Cycle 2
         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
         FSTD    %fr28,-104(%sp)
         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
         LDD     -96(%sp),%r3
         FSTD    %fr29,-72(%sp)
         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
         LDD     -64(%sp),%r19
         LDD     -80(%sp),%r21
         B       $JOIN3
         ADD     %r0,%r0,%r22

 $N_IS_ONE
         FSTD    %fr25,-80(%sp)
         FSTD    %fr27,-48(%sp)
         FSTD    %fr26,-88(%sp)      ; Cycle 2
         B       $JOIN5
         ADD     %r0,%r0,%r22

 ; We came out of the unrolled loop with wrong parity.  Do one more
 ; single cycle.  This is quite tricky, because of the way the
 ; carry chains and SHRPD chains have been chopped up.

 $ONEMORE

         FLDD    0(%r24),%fr24

         LDO     SIXTEEN(%r23),%r23  ; Cycle 2
         ADD,DC  %r0,%r20,%r20
         FSTD    %fr26,-88(%sp)

         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
         FSTD    %fr28,-104(%sp)
         LDD     UN_EIGHT(%r23),%r21
         ADD     %r3,%r1,%r1

         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
         ADD,DC  %r21,%r4,%r28
         STD     %r28,UN_EIGHT(%r23) ; moved from cycle 9
         LDD     -96(%sp),%r3
         FSTD    %fr29,-72(%sp)

         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
         ADD,DC  %r20,%r31,%r22
         LDD     -64(%sp),%r19
         LDD     -80(%sp),%r21

         STD     %r1,UN_SIXTEEN(%r23); Cycle 6
 ; Entry point for the N = 3 case, which arrives with r22 = 0 and with
 ; r3, r19 and r21 already loaded.
 $JOIN3
         XMPYU   %fr9L,%fr24R,%fr24
         LDD     -56(%sp),%r20
         ADD     %r21,%r3,%r3

         ADD,DC  %r20,%r19,%r19      ; Cycle 7
         LDD     -88(%sp),%r4
         SHRPD   %r3,%r0,32,%r21
         LDD     -48(%sp),%r1

         LDD     -104(%sp),%r31      ; Cycle 8
         ADD,DC  %r0,%r0,%r20
         SHRPD   %r19,%r3,32,%r3

         LDD     -72(%sp),%r29       ; Cycle 9
         SHRPD   %r20,%r19,32,%r20
         ADD     %r21,%r1,%r1

         ADD,DC  %r3,%r4,%r4         ; Cycle 10
         FSTD    %fr24,-96(%sp)

         ADD,DC  %r0,%r20,%r20       ; Cycle 11
         LDD     0(%r23),%r3
         FSTD    %fr25,-80(%sp)

         ADD     %r22,%r1,%r1        ; Cycle 13
         FSTD    %fr27,-48(%sp)

 ; Shutdown code, stage 1-1/2.

         ADD,DC  %r29,%r4,%r4        ; Cycle 1

         LDO     SIXTEEN(%r23),%r23  ; Cycle 2
         ADD,DC  %r0,%r20,%r20
         FSTD    %fr26,-88(%sp)

         LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
         ADD     %r3,%r1,%r1

         ADD,DC  %r21,%r4,%r28       ; Cycle 4
         STD     %r28,UN_EIGHT(%r23) ; moved from cycle 9

         ADD,DC  %r20,%r31,%r22      ; Cycle 5
         STD     %r1,UN_SIXTEEN(%r23)
 ; Entry point for the N = 1 case, which arrives with r22 = 0.  Only one
 ; multiplicand word is left, so a single set of partial products
 ; (-96, -80, -88, -48) is combined here.
 $JOIN5
         LDD     -96(%sp),%r3        ; moved from cycle 4
         LDD     -80(%sp),%r21
         ADD     %r21,%r3,%r3        ; Cycle 6
         ADD,DC  %r0,%r0,%r19        ; Cycle 7
         LDD     -88(%sp),%r4
         SHRPD   %r3,%r0,32,%r21
         LDD     -48(%sp),%r1
         SHRPD   %r19,%r3,32,%r3     ; Cycle 8
         ADD     %r21,%r1,%r1        ; Cycle 9
         ADD,DC  %r3,%r4,%r4         ; Cycle 10
         LDD     0(%r23),%r3         ; Cycle 11
         ADD     %r22,%r1,%r1        ; Cycle 13

 ; Shutdown code, stage 2-1/2.

         ADD,DC  %r0,%r4,%r4         ; Cycle 1
         LDO     SIXTEEN(%r23),%r23  ; Cycle 2
         LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
         ADD     %r3,%r1,%r1
         STD     %r1,UN_SIXTEEN(%r23)
         ADD,DC  %r21,%r4,%r1
 ; The delay-slot LDO moves r23 one more word so that the store at
 ; $JOIN1 (UN_SIXTEEN) lands on the word that was just loaded into r21.
         B       $JOIN1
         LDO     EIGHT(%r23),%r23

 ; exit
 ; Reload r4, return through r2, and let the delay-slot LDW,MB step sp
 ; back by 128 and reload r3.

 $L0
         LDW     -124(%sp),%r4
         BVE     (%r2)
         .EXIT
         LDW,MB  -128(%sp),%r3

         .PROCEND

 ; ***************************************************************
 ;
 ;                 add_diag_[little/big]
 ;
 ; ***************************************************************

 ; The arguments are as follows:
 ;     r2   return PC, of course
 ;     r26 = arg1 =  length
 ;     r25 = arg2 =  vector to square
 ;     r24 = arg3 =  result vector

 ; add_diag_little / add_diag_big
 ;
 ; For each doubleword of the input vector, forms its square and adds it
 ; into two consecutive doublewords of the result vector, then
 ; propagates any final carry further up the result.
 ;
 ;   In:   r26 = length N, r25 = vector to square, r24 = result vector
 ;   Uses: r1, r3, r4, r19-r22, r26, r28, r29, r31 as integer temporaries
 ;         and fr7, fr24, fr27-fr31 as floating-point temporaries.
 ;   Stack: r3 and r4 are saved at entry and reloaded at $Z0.  The slots
 ;         -64(%sp) through -104(%sp) carry partial products from the
 ;         floating-point side (FSTD) to the integer side (LDD).
 ;
 ; Each square is built from three XMPYU partial products: R*R, L*R and
 ; L*L.  The single cross product is split by the SHRPD ...,31 pair into
 ; a low and a high doubleword, which positions it shifted left by 33
 ; bits, i.e. doubled at its weight of 2^32.
 #ifdef LITTLE_WORDIAN
 add_diag_little
 #else
 add_diag_big
 #endif
         .PROC
         .CALLINFO FRAME=120,ENTRY_GR=4
         .ENTRY
 ; Save r3 at the old sp and advance sp by 128; r4 goes 4 bytes above it.
         STW,MA  %r3,128(%sp)
         STW     %r4,-124(%sp)

 ; r26 becomes N-1 here.
         ADDIB,< -1,%r26,$Z0         ; If N=0, exit immediately.
         NOP

 ; Startup code

         FLDD    0(%r25),%fr7        ; Cycle 2 (alternate body)
         XMPYU   %fr7R,%fr7R,%fr29   ; Cycle 4
         XMPYU   %fr7L,%fr7R,%fr27   ; Cycle 5
         XMPYU   %fr7L,%fr7L,%fr30
         LDO     SIXTEEN(%r25),%r25  ; Cycle 6
         FSTD    %fr29,-88(%sp)
         FSTD    %fr27,-72(%sp)      ; Cycle 7
 ; r26 = N-1 = 0 means a single input word; the delay-slot FSTD still
 ; stores the L*L product before the branch target runs.
         CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body)
         FSTD    %fr30,-96(%sp)
         FLDD    UN_EIGHT(%r25),%fr7 ; Cycle 2
         LDD     -88(%sp),%r22       ; Cycle 3
         LDD     -72(%sp),%r31       ; Cycle 4
         XMPYU   %fr7R,%fr7R,%fr28
         XMPYU   %fr7L,%fr7R,%fr24   ; Cycle 5
         XMPYU   %fr7L,%fr7L,%fr31
         LDD     -96(%sp),%r20       ; Cycle 6
         FSTD    %fr28,-80(%sp)
         ADD     %r0,%r0,%r0         ; clear the carry bit
 ; r26 drops by 2 here; skip the main loop when the result is <= 0.
         ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7
         FSTD    %fr24,-64(%sp)

 ; Here is the loop.  It is unrolled twice, modelled after the "alternate body" and then the "main body".
 ; Each pass consumes two input doublewords (r25 advances by SIXTEEN),
 ; updates four result doublewords (r24 advances by THIRTY_TWO) and
 ; decrements r26 by 2.

 $DIAGLOOP
         SHRPD   %r31,%r0,31,%r3     ; Cycle 1 (alternate body)
         LDO     SIXTEEN(%r25),%r25
         LDD     0(%r24),%r1
         FSTD    %fr31,-104(%sp)
         SHRPD   %r0,%r31,31,%r4     ; Cycle 2
         ADD,DC  %r22,%r3,%r3
         FLDD    UN_SIXTEEN(%r25),%fr7
         ADD,DC  %r0,%r20,%r20       ; Cycle 3
         ADD     %r1,%r3,%r3
         XMPYU   %fr7R,%fr7R,%fr29   ; Cycle 4
         LDD     -80(%sp),%r21
         STD     %r3,0(%r24)
         XMPYU   %fr7L,%fr7R,%fr27   ; Cycle 5
         XMPYU   %fr7L,%fr7L,%fr30
         LDD     -64(%sp),%r29
         LDD     EIGHT(%r24),%r1
         ADD,DC  %r4,%r20,%r20       ; Cycle 6
         LDD     -104(%sp),%r19
         FSTD    %fr29,-88(%sp)
         ADD     %r20,%r1,%r1        ; Cycle 7
         FSTD    %fr27,-72(%sp)
         SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
         LDO     THIRTY_TWO(%r24),%r24
         LDD     UN_SIXTEEN(%r24),%r28
         FSTD    %fr30,-96(%sp)
         SHRPD   %r0,%r29,31,%r3     ; Cycle 2
         ADD,DC  %r21,%r4,%r4
         FLDD    UN_EIGHT(%r25),%fr7
         STD     %r1,UN_TWENTY_FOUR(%r24)
         ADD,DC  %r0,%r19,%r19       ; Cycle 3
         ADD     %r28,%r4,%r4
         XMPYU   %fr7R,%fr7R,%fr28   ; Cycle 4
         LDD     -88(%sp),%r22
         STD     %r4,UN_SIXTEEN(%r24)
         XMPYU   %fr7L,%fr7R,%fr24   ; Cycle 5
         XMPYU   %fr7L,%fr7L,%fr31
         LDD     -72(%sp),%r31
         LDD     UN_EIGHT(%r24),%r28
         ADD,DC  %r3,%r19,%r19       ; Cycle 6
         LDD     -96(%sp),%r20
         FSTD    %fr28,-80(%sp)
         ADD     %r19,%r28,%r28      ; Cycle 7
         FSTD    %fr24,-64(%sp)
         ADDIB,> -2,%r26,$DIAGLOOP   ; Cycle 8
         STD     %r28,UN_EIGHT(%r24)

 $ENDDIAGLOOP

 ; Fold the pending carry into r22.  A counter of exactly 0 means one
 ; extra input word remains, which is handled at $ONEMOREDIAG.  The
 ; delay-slot SHRPD runs on both paths.
         ADD,DC  %r0,%r22,%r22
         CMPIB,= 0,%r26,$ONEMOREDIAG
         SHRPD   %r31,%r0,31,%r3

 ; Shutdown code, first stage.

         FSTD    %fr31,-104(%sp)     ; Cycle 1 (alternate body)
         LDD     0(%r24),%r28
         SHRPD   %r0,%r31,31,%r4     ; Cycle 2
         ADD     %r3,%r22,%r3
         ADD,DC  %r0,%r20,%r20       ; Cycle 3
         LDD     -80(%sp),%r21
         ADD     %r3,%r28,%r3
         LDD     -64(%sp),%r29       ; Cycle 4
         STD     %r3,0(%r24)
         LDD     EIGHT(%r24),%r1     ; Cycle 5
         LDO     SIXTEEN(%r25),%r25  ; Cycle 6
         LDD     -104(%sp),%r19
         ADD,DC  %r4,%r20,%r20
         ADD     %r20,%r1,%r1        ; Cycle 7
         ADD,DC  %r0,%r21,%r21       ; Cycle 8
         STD     %r1,EIGHT(%r24)

 ; Shutdown code, second stage.

         SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
         LDO     THIRTY_TWO(%r24),%r24
         LDD     UN_SIXTEEN(%r24),%r1
         SHRPD   %r0,%r29,31,%r3      ; Cycle 2
         ADD     %r4,%r21,%r4
         ADD,DC  %r0,%r19,%r19       ; Cycle 3
         ADD     %r4,%r1,%r4
         STD     %r4,UN_SIXTEEN(%r24); Cycle 4
         LDD     UN_EIGHT(%r24),%r28 ; Cycle 5
         ADD,DC  %r3,%r19,%r19       ; Cycle 6
         ADD     %r19,%r28,%r28      ; Cycle 7
 ; r22 captures the carry out of the last add.  The final word in r28 is
 ; stored from the delay slot whether or not the branch is taken.
         ADD,DC  %r0,%r0,%r22        ; Cycle 8
         CMPIB,*= 0,%r22,$Z0         ; if no overflow, exit
         STD     %r28,UN_EIGHT(%r24)

 ; Final carry propagation
 ; Adds 1 to successive result words for as long as the incremented word
 ; wraps to zero.  The store sits in the delay slot of the loop branch.

 $FDIAG2
         LDO     EIGHT(%r24),%r24
         LDD     UN_EIGHT(%r24),%r26
         ADDI    1,%r26,%r26
         CMPIB,*= 0,%r26,$FDIAG2     ; Keep looping if there is a carry.
         STD     %r26,UN_EIGHT(%r24)

         B   $Z0
         NOP

 ; Here is the code that handles the difficult case N=1.
 ; We do the usual trick -- branch out of the startup code at appropriate
 ; points, and branch into the shutdown code.

 ; Reload the three partial products of the only input word; the third
 ; load runs in the delay slot of the branch.
 $DIAG_N_IS_ONE

         LDD     -88(%sp),%r22
         LDD     -72(%sp),%r31
         B       $JOINDIAG
         LDD     -96(%sp),%r20

 ; We came out of the unrolled loop with wrong parity.  Do one more
 ; single cycle.  This is the "alternate body".  It will, of course,
 ; give us opposite registers from the other case, so we need
 ; completely different shutdown code.

 $ONEMOREDIAG
         FSTD    %fr31,-104(%sp)     ; Cycle 1 (alternate body)
         LDD     0(%r24),%r28
         FLDD    0(%r25),%fr7        ; Cycle 2
         SHRPD   %r0,%r31,31,%r4
         ADD     %r3,%r22,%r3
         ADD,DC  %r0,%r20,%r20       ; Cycle 3
         LDD     -80(%sp),%r21
         ADD     %r3,%r28,%r3
         LDD     -64(%sp),%r29       ; Cycle 4
         STD     %r3,0(%r24)
         XMPYU   %fr7R,%fr7R,%fr29
         LDD     EIGHT(%r24),%r1     ; Cycle 5
         XMPYU   %fr7L,%fr7R,%fr27
         XMPYU   %fr7L,%fr7L,%fr30
         LDD     -104(%sp),%r19      ; Cycle 6
         FSTD    %fr29,-88(%sp)
         ADD,DC  %r4,%r20,%r20
         FSTD    %fr27,-72(%sp)      ; Cycle 7
         ADD     %r20,%r1,%r1
         ADD,DC  %r0,%r21,%r21       ; Cycle 8
         STD     %r1,EIGHT(%r24)

 ; Shutdown code, first stage.

         SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
         LDO     THIRTY_TWO(%r24),%r24
         FSTD    %fr30,-96(%sp)
         LDD     UN_SIXTEEN(%r24),%r1
         SHRPD   %r0,%r29,31,%r3     ; Cycle 2
         ADD     %r4,%r21,%r4
         ADD,DC  %r0,%r19,%r19       ; Cycle 3
         LDD     -88(%sp),%r22
         ADD     %r4,%r1,%r4
         LDD     -72(%sp),%r31       ; Cycle 4
         STD     %r4,UN_SIXTEEN(%r24)
         LDD     UN_EIGHT(%r24),%r28 ; Cycle 5
         LDD     -96(%sp),%r20       ; Cycle 6
         ADD,DC  %r3,%r19,%r19
         ADD     %r19,%r28,%r28      ; Cycle 7
         ADD,DC  %r0,%r22,%r22       ; Cycle 8
         STD     %r28,UN_EIGHT(%r24)

 ; Shutdown code, second stage.
 ; Also the entry point for N = 1.  Expects the R*R product in r22, the
 ; cross product in r31 and the L*L product in r20, and adds the square
 ; into 0(%r24) and EIGHT(%r24).

 $JOINDIAG
         SHRPD   %r31,%r0,31,%r3     ; Cycle 1 (alternate body)
         LDD     0(%r24),%r28
         SHRPD   %r0,%r31,31,%r4     ; Cycle 2
         ADD     %r3,%r22,%r3
         ADD,DC  %r0,%r20,%r20       ; Cycle 3
         ADD     %r3,%r28,%r3
         STD     %r3,0(%r24)         ; Cycle 4
         LDD     EIGHT(%r24),%r1     ; Cycle 5
         ADD,DC  %r4,%r20,%r20
         ADD     %r20,%r1,%r1        ; Cycle 7
         ADD,DC  %r0,%r0,%r21        ; Cycle 8
         CMPIB,*= 0,%r21,$Z0         ; if no overflow, exit
         STD     %r1,EIGHT(%r24)

 ; Final carry propagation
 ; Same scheme as $FDIAG2, but addressed with EIGHT because r24 has not
 ; been advanced past the words just written.  Falls through to $Z0 when
 ; the carry stops.

 $FDIAG1
         LDO     EIGHT(%r24),%r24
         LDD     EIGHT(%r24),%r26
         ADDI    1,%r26,%r26
         CMPIB,*= 0,%r26,$FDIAG1    ; Keep looping if there is a carry.
         STD     %r26,EIGHT(%r24)

 ; Exit: reload r4, return through r2, and let the delay-slot LDW,MB
 ; step sp back by 128 and reload r3.
 $Z0
         LDW     -124(%sp),%r4
         BVE     (%r2)
         .EXIT
         LDW,MB  -128(%sp),%r3
         .PROCEND
 ;	.ALLOW

         .SPACE         $TEXT$
         .SUBSPA        $CODE$
 #ifdef LITTLE_WORDIAN
 #ifdef __GNUC__
 ; GNU-as (as of 2.19) does not support LONG_RETURN
         .EXPORT        maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
         .EXPORT        add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
 #else
         .EXPORT        maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
         .EXPORT        add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
 #endif
 #else
         .EXPORT        maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
         .EXPORT        add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
 #endif
         .END


 ; How to use "maxpy_PA20_little" and "maxpy_PA20_big"
 ; (exported from this file as "maxpy_little" and "maxpy_big")
 ;
 ; The routine "maxpy_PA20_little" or "maxpy_PA20_big"
 ; performs a 64-bit x any-size multiply, and adds the
 ; result to an area of memory.  That is, it performs
 ; something like
 ;
 ;      A B C D
 ;    *       Z
 ;   __________
 ;    P Q R S T
 ;
 ; and then adds the "PQRST" vector into an area of memory,
 ; handling all carries.
 ;
 ; Digression on nomenclature and endian-ness:
 ;
 ; Each of the capital letters in the above represents a 64-bit
 ; quantity.  That is, you could think of the discussion as
 ; being in terms of radix-16-quintillion arithmetic.  The data
 ; type being manipulated is "unsigned long long int".  This
 ; requires the 64-bit extension of the HP-UX C compiler,
 ; available at release 10.  You need these compiler flags to
 ; enable these extensions:
 ;
 ;       -Aa +e +DA2.0 +DS2.0
 ;
 ; (The first specifies ANSI C, the second enables the
 ; extensions, which are beyond ANSI C, and the third and
 ; fourth tell the compiler to use whatever features of the
 ; PA2.0 architecture it wishes, in order to make the code more
 ; efficient.  Since the presence of the assembly code will
 ; make the program unable to run on anything less than PA2.0,
 ; you might as well gain the performance enhancements in the C
 ; code as well.)
 ;
 ; Questions of "endian-ness" often come up, usually in the
 ; context of byte ordering in a word.  These routines have a
 ; similar issue, that could be called "wordian-ness".
 ; Independent of byte ordering (PA is always big-endian), one
 ; can make two choices when representing extremely large
 ; numbers as arrays of 64-bit doublewords in memory.
 ;
 ; "Little-wordian" layout means that the least significant
 ; word of a number is stored at the lowest address.
 ;
 ;   MSW     LSW
 ;    |       |
 ;    V       V
 ;
 ;    A B C D E
 ;
 ;    ^     ^ ^
 ;    |     | |____ address 0
 ;    |     |
 ;    |     |_______address 8
 ;    |
 ;    address 32
 ;
 ; "Big-wordian" means that the most significant word is at the
 ; lowest address.
 ;
 ;   MSW     LSW
 ;    |       |
 ;    V       V
 ;
 ;    A B C D E
 ;
 ;    ^     ^ ^
 ;    |     | |____ address 32
 ;    |     |
 ;    |     |_______address 24
 ;    |
 ;    address 0
 ;
 ; When you compile the file, you must specify one or the other, with
 ; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN".
 ;
 ;     Incidentally, you assemble this file as part of your
 ;     project with the same C compiler as the rest of the program.
 ;     My "makefile" for a superprecision arithmetic package has
 ;     the following stuff:
 ;
 ;     # definitions:
 ;     CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1
 ;     CFLAGS = +O3
 ;     LDFLAGS = -L /usr/lib -Wl,-aarchive
 ;
 ;     # general build rule for ".s" files:
 ;     .s.o:
 ;             $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN
 ;
 ;     # Now any bind step that calls for pa20.o will assemble pa20.s
 ;
 ; End of digression, back to arithmetic:
 ;
 ; The way we multiply two huge numbers is, of course, to multiply
 ; the "ABCD" vector by each of the "WXYZ" doublewords, adding
 ; the result vectors with increasing offsets, the way we learned
 ; in school, back before we all used calculators:
 ;
 ;            A B C D
 ;          * W X Y Z
 ;         __________
 ;          P Q R S T
 ;        E F G H I
 ;      M N O P Q
 ;  + R S T U V
 ;    _______________
 ;    F I N A L S U M
 ;
 ; So we call maxpy_PA20_big (in my case; my package is
 ; big-wordian) repeatedly, giving the W, X, Y, and Z arguments
 ; in turn as the "scalar", and giving the "ABCD" vector each
 ; time.  We direct it to add its result into an area of memory
 ; that we have cleared at the start.  We skew the exact
 ; location into that area with each call.
 ;
 ; The prototype for the function is
 ;
 ; extern void maxpy_PA20_big(
 ;    int length,        /* Number of doublewords in the multiplicand vector. */
 ;    const long long int *scalaraddr,    /* Address to fetch the scalar. */
 ;    const long long int *multiplicand,  /* The multiplicand vector. */
 ;    long long int *result);             /* Where to accumulate the result. */
 ;
 ; (You should place a copy of this prototype in an include file
 ; or in your C file.)
 ;
 ; Now, IN ALL CASES, the given address for the multiplicand or
 ; the result is that of the LEAST SIGNIFICANT DOUBLEWORD.
 ; That word is, of course, the word at which the routine
 ; starts processing.  "maxpy_PA20_little" then increases the
 ; addresses as it computes.  "maxpy_PA20_big" decreases them.
 ;
 ; In our example above, "length" would be 4 in each case.
 ; "multiplicand" would be the "ABCD" vector.  Specifically,
 ; the address of the element "D".  "scalaraddr" would be the
 ; address of "W", "X", "Y", or "Z" on the four calls that we
 ; would make.  (The order doesn't matter, of course.)
 ; "result" would be the appropriate address in the result
 ; area.  When multiplying by "Z", that would be the least
 ; significant word.  When multiplying by "Y", it would be the
 ; next higher word (8 bytes higher if little-wordian; 8 bytes
 ; lower if big-wordian), and so on.  The size of the result
 ; area must be the sum of the sizes of the multiplicand
 ; and multiplier vectors, and must be initialized to zero
 ; before we start.
 ;
 ; Whenever the routine adds its partial product into the result
 ; vector, it follows carry chains as far as they need to go.
 ;
 ; Here is the super-precision multiply routine that I use for
 ; my package.  The package is big-wordian.  I have taken out
 ; handling of exponents (it's a floating point package):
 ;
 ; static void mul_PA20(
 ;   int size,
 ;   const long long int *arg1,
 ;   const long long int *arg2,
 ;   long long int *result)
 ; {
 ;    int i;
 ;
 ;    for (i=0 ; i<2*size ; i++) result[i] = 0ULL;
 ;
 ;    for (i=0 ; i<size ; i++) {
 ;       maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]);
 ;    }
 ; }