| dnl IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the |
| dnl result from a second limb vector. |
| |
| dnl Contributed to the GNU project by Torbjorn Granlund. |
| |
| dnl Copyright 2000-2004 Free Software Foundation, Inc. |
| |
| dnl This file is part of the GNU MP Library. |
| dnl |
| dnl The GNU MP Library is free software; you can redistribute it and/or modify |
| dnl it under the terms of either: |
| dnl |
| dnl * the GNU Lesser General Public License as published by the Free |
| dnl Software Foundation; either version 3 of the License, or (at your |
| dnl option) any later version. |
| dnl |
| dnl or |
| dnl |
| dnl * the GNU General Public License as published by the Free Software |
| dnl Foundation; either version 2 of the License, or (at your option) any |
| dnl later version. |
| dnl |
| dnl or both in parallel, as here. |
| dnl |
| dnl The GNU MP Library is distributed in the hope that it will be useful, but |
| dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| dnl for more details. |
| dnl |
| dnl You should have received copies of the GNU General Public License and the |
| dnl GNU Lesser General Public License along with the GNU MP Library. If not, |
| dnl see https://www.gnu.org/licenses/. |
| |
| include(`../config.m4') |
| |
| C cycles/limb |
| C Itanium: 4.0 |
| C Itanium 2: 2.25 (alignment dependent, sometimes it seems to need 3 c/l) |
| |
| C TODO |
| C * Optimize feed-in and wind-down code, both for speed and code size. |
| C * Handle low limb input and results specially, using a common stf8 in the |
| C epilogue. |
| C * Delay r8, r10 initialization, put cmp-p6 in 1st bundle and br .Ldone in |
| C 2nd bundle. This will allow the bbb bundle to be one cycle earlier and |
| C save a cycle. |
| |
| C INPUT PARAMETERS |
| define(`rp', `r32') |
| define(`up', `r33') |
| define(`n', `r34') |
| define(`vl', `r35') |
| |
| ASM_START() |
| C mpn_submul_1 entry and dispatch. |
| C |
| C Register roles established below: |
| C   r10     copy of rp; store pointer for the st8 instructions |
| C   r9      copy of up; pointer for the ld8 loads of up limbs as integers |
| C   rp, up  post-incremented by the ldf8 loads that feed the xma multiplies |
| C   f8, f7  first limb from rp and first limb from up |
| C   f6      negated vl, the multiplier operand of every xma |
| C   r8      carry limb, starts at zero |
| C   r2      saved ar.lc, restored before each br.ret |
| C   r19     (n - 1) >> 2, becomes the ar.lc count consumed by br.cloop |
| C   r14     n & 3, selects one of the four feed-in paths |
| PROLOGUE(mpn_submul_1) |
| .prologue |
| .save ar.lc, r2 |
| .body |
| |
| C With HAVE_ABI_32 defined, addp4 is applied to both pointers and zxt4 |
| C zero-extends n from 32 bits before anything else uses them. |
| ifdef(`HAVE_ABI_32', |
| ` addp4 rp = 0, rp C M I |
| addp4 up = 0, up C M I |
| zxt4 n = n C I |
| ;; |
| ') |
| C Keep integer copies of both pointers and negate the multiplier. |
| {.mmi |
| mov r10 = rp C M I |
| mov r9 = up C M I |
| sub vl = r0, vl C M I negate vl |
| } |
| C Load the first limb pair; rp and up each advance by 8 bytes. |
| {.mmi |
| ldf8 f8 = [rp], 8 C M |
| ldf8 f7 = [up], 8 C M |
| add r19 = -1, n C M I n - 1 |
| ;; |
| } |
| C p6 is set when the negated vl is zero, which is the case only for vl == 0. |
| {.mmi |
| cmp.eq p6, p0 = 0, vl C M I |
| mov r8 = 0 C M I zero cylimb |
| mov r2 = ar.lc C I0 |
| } |
| C Move the negated vl to f6; split n into n & 3 (r14) and (n - 1) >> 2 (r19). |
| {.mmi |
| setf.sig f6 = vl C M2 M3 |
| and r14 = 3, n C M I |
| shr.u r19 = r19, 2 C I0 |
| ;; |
| } |
| C vl == 0 leaves through .Ldone: r8 is already zero and nothing has been |
| C stored through r10. |
| {.mmb |
| nop 0 |
| cmp.eq p10, p0 = 0, r14 C M I |
| (p6) br.spnt .Ldone C B vl == 0 |
| } |
| {.mmi |
| cmp.eq p11, p0 = 2, r14 C M I |
| cmp.eq p12, p0 = 3, r14 C M I |
| mov ar.lc = r19 C I0 |
| } |
| C Dispatch on n & 3: 0 goes to .Lb00, 2 to .Lb10, 3 to .Lb11; the remaining |
| C value 1 falls through to .Lb01. |
| {.bbb |
| (p10) br.dptk .Lb00 C B |
| (p11) br.dptk .Lb10 C B |
| (p12) br.dptk .Lb11 C B |
| ;; |
| } |
| |
| C .Lb01: feed-in for n & 3 == 1. f7 and f8 already hold the first limb pair. |
| C br.cloop is taken while ar.lc is nonzero; ar.lc was loaded with |
| C (n - 1) >> 2, so the fall-through here is the n == 1 case. |
| .Lb01: br.cloop.dptk .grt1 |
| |
| C n == 1: form the lo and hi words of up[0] * f6 + rp[0], reload up[0] as an |
| C integer into r20, and finish in the last wind-down step. |
| xma.l f39 = f7, f6, f8 |
| xma.hu f43 = f7, f6, f8 |
| ;; |
| getf.sig r27 = f39 C lo |
| getf.sig r31 = f43 C hi |
| ld8 r20 = [r9], 8 |
| br .Lcj1 |
| |
| C .grt1: more limbs follow. Preload four more limb pairs (rp limbs into |
| C f44-f47, up limbs into f32-f35) while the first multiplies are issued. |
| .grt1: ldf8 f44 = [rp], 8 |
| ldf8 f32 = [up], 8 |
| ;; |
| ldf8 f45 = [rp], 8 |
| ldf8 f33 = [up], 8 |
| ;; |
| ldf8 f46 = [rp], 8 |
| xma.l f39 = f7, f6, f8 |
| ldf8 f34 = [up], 8 |
| xma.hu f43 = f7, f6, f8 |
| ;; |
| ldf8 f47 = [rp], 8 |
| xma.l f36 = f32, f6, f44 |
| ldf8 f35 = [up], 8 |
| xma.hu f40 = f32, f6, f44 |
| br.cloop.dptk .grt5 |
| ;; |
| |
| C ar.lc ran out (n == 5): issue the remaining multiplies, fetch lo/hi words |
| C and integer up limbs, then join the wind-down at .Lcj5. |
| getf.sig r27 = f39 C lo |
| xma.l f37 = f33, f6, f45 |
| ld8 r20 = [r9], 8 |
| xma.hu f41 = f33, f6, f45 |
| ;; |
| getf.sig r31 = f43 C hi |
| getf.sig r24 = f36 C lo |
| xma.l f38 = f34, f6, f46 |
| ld8 r21 = [r9], 8 |
| xma.hu f42 = f34, f6, f46 |
| ;; |
| getf.sig r28 = f40 C hi |
| getf.sig r25 = f37 C lo |
| xma.l f39 = f35, f6, f47 |
| ld8 r22 = [r9], 8 |
| xma.hu f43 = f35, f6, f47 |
| ;; |
| getf.sig r29 = f41 C hi |
| getf.sig r26 = f38 C lo |
| ld8 r23 = [r9], 8 |
| br .Lcj5 |
| |
| C .grt5: n >= 9. Keep loading ahead of the multiplies, then enter .Loop at |
| C its top if ar.lc is still nonzero, otherwise drain through .Lend. |
| .grt5: ldf8 f44 = [rp], 8 |
| ldf8 f32 = [up], 8 |
| ;; |
| getf.sig r27 = f39 C lo |
| xma.l f37 = f33, f6, f45 |
| ld8 r20 = [r9], 8 |
| xma.hu f41 = f33, f6, f45 |
| ;; |
| ldf8 f45 = [rp], 8 |
| getf.sig r31 = f43 C hi |
| ldf8 f33 = [up], 8 |
| ;; |
| getf.sig r24 = f36 C lo |
| xma.l f38 = f34, f6, f46 |
| ld8 r21 = [r9], 8 |
| xma.hu f42 = f34, f6, f46 |
| ;; |
| ldf8 f46 = [rp], 8 |
| getf.sig r28 = f40 C hi |
| ldf8 f34 = [up], 8 |
| ;; |
| getf.sig r25 = f37 C lo |
| xma.l f39 = f35, f6, f47 |
| ld8 r22 = [r9], 8 |
| xma.hu f43 = f35, f6, f47 |
| ;; |
| ldf8 f47 = [rp], 8 |
| getf.sig r29 = f41 C hi |
| ldf8 f35 = [up], 8 |
| ;; |
| getf.sig r26 = f38 C lo |
| xma.l f36 = f32, f6, f44 |
| ld8 r23 = [r9], 8 |
| xma.hu f40 = f32, f6, f44 |
| br.cloop.dptk .Loop |
| br .Lend |
| |
| |
| C .Lb10: feed-in for n & 3 == 2. f7 and f8 hold the first limb pair; the |
| C second pair is loaded into f35 (up) and f47 (rp). br.cloop is taken while |
| C ar.lc, loaded with (n - 1) >> 2, is nonzero, so the fall-through is n == 2. |
| .Lb10: ldf8 f47 = [rp], 8 |
| ldf8 f35 = [up], 8 |
| br.cloop.dptk .grt2 |
| |
| C n == 2: multiply both limb pairs, fetch lo/hi words and the integer copies |
| C of the up limbs, then join the wind-down at .Lcj2. |
| xma.l f38 = f7, f6, f8 |
| xma.hu f42 = f7, f6, f8 |
| ;; |
| xma.l f39 = f35, f6, f47 |
| xma.hu f43 = f35, f6, f47 |
| ;; |
| getf.sig r26 = f38 C lo |
| getf.sig r30 = f42 C hi |
| ld8 r23 = [r9], 8 |
| ;; |
| getf.sig r27 = f39 C lo |
| getf.sig r31 = f43 C hi |
| ld8 r20 = [r9], 8 |
| br .Lcj2 |
| |
| C .grt2: more limbs follow. Preload four more limb pairs (rp limbs into |
| C f44-f47, up limbs into f32-f35) while the first multiplies are issued. |
| .grt2: ldf8 f44 = [rp], 8 |
| ldf8 f32 = [up], 8 |
| ;; |
| ldf8 f45 = [rp], 8 |
| ldf8 f33 = [up], 8 |
| xma.l f38 = f7, f6, f8 |
| xma.hu f42 = f7, f6, f8 |
| ;; |
| ldf8 f46 = [rp], 8 |
| ldf8 f34 = [up], 8 |
| xma.l f39 = f35, f6, f47 |
| xma.hu f43 = f35, f6, f47 |
| ;; |
| ldf8 f47 = [rp], 8 |
| ldf8 f35 = [up], 8 |
| ;; |
| getf.sig r26 = f38 C lo |
| xma.l f36 = f32, f6, f44 |
| ld8 r23 = [r9], 8 |
| xma.hu f40 = f32, f6, f44 |
| br.cloop.dptk .grt6 |
| |
| C ar.lc ran out (n == 6): issue the remaining multiplies, fetch lo/hi words |
| C and integer up limbs, then join the wind-down at .Lcj6. |
| getf.sig r30 = f42 C hi |
| ;; |
| getf.sig r27 = f39 C lo |
| xma.l f37 = f33, f6, f45 |
| ld8 r20 = [r9], 8 |
| xma.hu f41 = f33, f6, f45 |
| ;; |
| getf.sig r31 = f43 C hi |
| getf.sig r24 = f36 C lo |
| xma.l f38 = f34, f6, f46 |
| ld8 r21 = [r9], 8 |
| xma.hu f42 = f34, f6, f46 |
| ;; |
| getf.sig r28 = f40 C hi |
| getf.sig r25 = f37 C lo |
| xma.l f39 = f35, f6, f47 |
| ld8 r22 = [r9], 8 |
| xma.hu f43 = f35, f6, f47 |
| br .Lcj6 |
| |
| C .grt6: n >= 10. Keep loading ahead of the multiplies, then join the main |
| C loop at its .LL10 entry point. |
| .grt6: ldf8 f44 = [rp], 8 |
| getf.sig r30 = f42 C hi |
| ldf8 f32 = [up], 8 |
| ;; |
| getf.sig r27 = f39 C lo |
| xma.l f37 = f33, f6, f45 |
| ld8 r20 = [r9], 8 |
| xma.hu f41 = f33, f6, f45 |
| ;; |
| ldf8 f45 = [rp], 8 |
| getf.sig r31 = f43 C hi |
| ldf8 f33 = [up], 8 |
| ;; |
| getf.sig r24 = f36 C lo |
| xma.l f38 = f34, f6, f46 |
| ld8 r21 = [r9], 8 |
| xma.hu f42 = f34, f6, f46 |
| ;; |
| ldf8 f46 = [rp], 8 |
| getf.sig r28 = f40 C hi |
| ldf8 f34 = [up], 8 |
| ;; |
| getf.sig r25 = f37 C lo |
| xma.l f39 = f35, f6, f47 |
| ld8 r22 = [r9], 8 |
| xma.hu f43 = f35, f6, f47 |
| br .LL10 |
| |
| |
| C .Lb11: feed-in for n & 3 == 3. f7 and f8 hold the first limb pair; the |
| C next two pairs are loaded into f34/f46 and f35/f47 (up/rp). br.cloop is |
| C taken while ar.lc, loaded with (n - 1) >> 2, is nonzero, so the |
| C fall-through is n == 3. |
| .Lb11: ldf8 f46 = [rp], 8 |
| ldf8 f34 = [up], 8 |
| ;; |
| ldf8 f47 = [rp], 8 |
| ldf8 f35 = [up], 8 |
| br.cloop.dptk .grt3 |
| |
| C n == 3: multiply the three limb pairs, fetch lo/hi words and the integer |
| C copies of the up limbs, then join the wind-down at .Lcj3. |
| xma.l f37 = f7, f6, f8 |
| xma.hu f41 = f7, f6, f8 |
| ;; |
| xma.l f38 = f34, f6, f46 |
| xma.hu f42 = f34, f6, f46 |
| ;; |
| getf.sig r25 = f37 C lo |
| xma.l f39 = f35, f6, f47 |
| xma.hu f43 = f35, f6, f47 |
| ;; |
| getf.sig r29 = f41 C hi |
| ld8 r22 = [r9], 8 |
| ;; |
| getf.sig r26 = f38 C lo |
| getf.sig r30 = f42 C hi |
| ld8 r23 = [r9], 8 |
| ;; |
| getf.sig r27 = f39 C lo |
| getf.sig r31 = f43 C hi |
| ld8 r20 = [r9], 8 |
| br .Lcj3 |
| |
| C .grt3: more limbs follow. Preload four more limb pairs (rp limbs into |
| C f44-f47, up limbs into f32-f35) while the first multiplies are issued. |
| .grt3: ldf8 f44 = [rp], 8 |
| xma.l f37 = f7, f6, f8 |
| ldf8 f32 = [up], 8 |
| xma.hu f41 = f7, f6, f8 |
| ;; |
| ldf8 f45 = [rp], 8 |
| xma.l f38 = f34, f6, f46 |
| ldf8 f33 = [up], 8 |
| xma.hu f42 = f34, f6, f46 |
| ;; |
| ldf8 f46 = [rp], 8 |
| ldf8 f34 = [up], 8 |
| ;; |
| getf.sig r25 = f37 C lo |
| xma.l f39 = f35, f6, f47 |
| ld8 r22 = [r9], 8 |
| xma.hu f43 = f35, f6, f47 |
| ;; |
| ldf8 f47 = [rp], 8 |
| getf.sig r29 = f41 C hi |
| ldf8 f35 = [up], 8 |
| ;; |
| getf.sig r26 = f38 C lo |
| xma.l f36 = f32, f6, f44 |
| ld8 r23 = [r9], 8 |
| xma.hu f40 = f32, f6, f44 |
| br.cloop.dptk .grt7 |
| ;; |
| |
| C ar.lc ran out (n == 7): issue the remaining multiplies, fetch lo/hi words |
| C and integer up limbs, then join the wind-down at .Lcj7. |
| getf.sig r30 = f42 C hi |
| getf.sig r27 = f39 C lo |
| xma.l f37 = f33, f6, f45 |
| ld8 r20 = [r9], 8 |
| xma.hu f41 = f33, f6, f45 |
| ;; |
| getf.sig r31 = f43 C hi |
| getf.sig r24 = f36 C lo |
| xma.l f38 = f34, f6, f46 |
| ld8 r21 = [r9], 8 |
| xma.hu f42 = f34, f6, f46 |
| br .Lcj7 |
| |
| C .grt7: n >= 11. Keep loading ahead of the multiplies, then join the main |
| C loop at its .LL11 entry point. |
| .grt7: ldf8 f44 = [rp], 8 |
| getf.sig r30 = f42 C hi |
| ldf8 f32 = [up], 8 |
| ;; |
| getf.sig r27 = f39 C lo |
| xma.l f37 = f33, f6, f45 |
| ld8 r20 = [r9], 8 |
| xma.hu f41 = f33, f6, f45 |
| ;; |
| ldf8 f45 = [rp], 8 |
| getf.sig r31 = f43 C hi |
| ldf8 f33 = [up], 8 |
| ;; |
| getf.sig r24 = f36 C lo |
| xma.l f38 = f34, f6, f46 |
| ld8 r21 = [r9], 8 |
| xma.hu f42 = f34, f6, f46 |
| br .LL11 |
| |
| |
| C .Lb00: feed-in for n & 3 == 0. f7 and f8 hold the first limb pair; the |
| C next three pairs are loaded into f33/f45, f34/f46 and f35/f47 (up/rp). |
| C br.cloop is taken while ar.lc, loaded with (n - 1) >> 2, is nonzero, so |
| C the fall-through is n == 4. |
| .Lb00: ldf8 f45 = [rp], 8 |
| ldf8 f33 = [up], 8 |
| ;; |
| ldf8 f46 = [rp], 8 |
| ldf8 f34 = [up], 8 |
| ;; |
| ldf8 f47 = [rp], 8 |
| xma.l f36 = f7, f6, f8 |
| ldf8 f35 = [up], 8 |
| xma.hu f40 = f7, f6, f8 |
| br.cloop.dptk .grt4 |
| |
| C n == 4: multiply the remaining three limb pairs, fetch lo/hi words and the |
| C integer copies of the up limbs, then join the wind-down at .Lcj4. |
| xma.l f37 = f33, f6, f45 |
| xma.hu f41 = f33, f6, f45 |
| ;; |
| getf.sig r24 = f36 C lo |
| xma.l f38 = f34, f6, f46 |
| ld8 r21 = [r9], 8 |
| xma.hu f42 = f34, f6, f46 |
| ;; |
| getf.sig r28 = f40 C hi |
| xma.l f39 = f35, f6, f47 |
| getf.sig r25 = f37 C lo |
| ld8 r22 = [r9], 8 |
| xma.hu f43 = f35, f6, f47 |
| ;; |
| getf.sig r29 = f41 C hi |
| getf.sig r26 = f38 C lo |
| ld8 r23 = [r9], 8 |
| ;; |
| getf.sig r30 = f42 C hi |
| getf.sig r27 = f39 C lo |
| ld8 r20 = [r9], 8 |
| br .Lcj4 |
| |
| C .grt4: more limbs follow. Preload four more limb pairs (rp limbs into |
| C f44-f47, up limbs into f32-f35) while the earlier multiplies are issued. |
| .grt4: ldf8 f44 = [rp], 8 |
| xma.l f37 = f33, f6, f45 |
| ldf8 f32 = [up], 8 |
| xma.hu f41 = f33, f6, f45 |
| ;; |
| ldf8 f45 = [rp], 8 |
| ldf8 f33 = [up], 8 |
| xma.l f38 = f34, f6, f46 |
| getf.sig r24 = f36 C lo |
| ld8 r21 = [r9], 8 |
| xma.hu f42 = f34, f6, f46 |
| ;; |
| ldf8 f46 = [rp], 8 |
| getf.sig r28 = f40 C hi |
| ldf8 f34 = [up], 8 |
| xma.l f39 = f35, f6, f47 |
| getf.sig r25 = f37 C lo |
| ld8 r22 = [r9], 8 |
| xma.hu f43 = f35, f6, f47 |
| ;; |
| ldf8 f47 = [rp], 8 |
| getf.sig r29 = f41 C hi |
| ldf8 f35 = [up], 8 |
| ;; |
| getf.sig r26 = f38 C lo |
| xma.l f36 = f32, f6, f44 |
| ld8 r23 = [r9], 8 |
| xma.hu f40 = f32, f6, f44 |
| br.cloop.dptk .grt8 |
| ;; |
| |
| C ar.lc ran out (n == 8): issue one more multiply pair, fetch the pending |
| C lo/hi words, then join the wind-down at .Lcj8. |
| getf.sig r30 = f42 C hi |
| getf.sig r27 = f39 C lo |
| xma.l f37 = f33, f6, f45 |
| ld8 r20 = [r9], 8 |
| xma.hu f41 = f33, f6, f45 |
| br .Lcj8 |
| |
| C .grt8: n >= 12. Load one more limb pair ahead, then join the main loop at |
| C its .LL00 entry point. |
| .grt8: ldf8 f44 = [rp], 8 |
| getf.sig r30 = f42 C hi |
| ldf8 f32 = [up], 8 |
| ;; |
| getf.sig r27 = f39 C lo |
| xma.l f37 = f33, f6, f45 |
| ld8 r20 = [r9], 8 |
| xma.hu f41 = f33, f6, f45 |
| br .LL00 |
| |
| C Main loop, four limbs per iteration, repeated by br.cloop while ar.lc is |
| C nonzero. Each quarter of the loop does the following for one limb: |
| C   p6  = lo < r8 (unsigned)     borrow caused by the incoming carry limb |
| C   r14 = lo - r8                result limb, stored through r10 |
| C   r8  = u - hi, plus 1 if p6   carry limb for the next position |
| C Here lo and hi are the getf.sig copies (r24-r27 and r28-r31) of earlier |
| C xma.l and xma.hu results, and u is the up limb reloaded as an integer by |
| C ld8 (r20-r23). |
| C |
| C Why u - hi is the carry: f6 holds the negated vl, i.e. 2^64 - vl, so |
| C u * f6 + r equals u * 2^64 - (u * vl - r). Its low word is r - u * vl |
| C modulo 2^64, and u minus its high word is the high word of u * vl plus the |
| C borrow out of that low-word subtraction. |
| C |
| C The same bundles also load limbs for later iterations (ldf8 through rp and |
| C up, ld8 through r9) and issue their multiplies. .LL00, .LL11 and .LL10 are |
| C mid-loop entry points used by the feed-in code. |
| ALIGN(32) |
| .Loop: |
| {.mmi |
| ldf8 f44 = [rp], 8 |
| cmp.ltu p6, p0 = r27, r8 C lo cmp |
| sub r14 = r27, r8 C lo sub |
| } |
| {.mmi |
| getf.sig r30 = f42 C hi |
| ldf8 f32 = [up], 8 |
| sub r8 = r20, r31 C hi sub |
| ;; C 01 |
| } |
| {.mmf |
| getf.sig r27 = f39 C lo |
| st8 [r10] = r14, 8 |
| xma.l f37 = f33, f6, f45 |
| } |
| {.mfi |
| ld8 r20 = [r9], 8 |
| xma.hu f41 = f33, f6, f45 |
| (p6) add r8 = 1, r8 |
| ;; C 02 |
| } |
| {.mmi |
| .LL00: ldf8 f45 = [rp], 8 |
| cmp.ltu p6, p0 = r24, r8 |
| sub r14 = r24, r8 |
| } |
| {.mmi |
| getf.sig r31 = f43 C hi |
| ldf8 f33 = [up], 8 |
| sub r8 = r21, r28 |
| ;; C 03 |
| } |
| {.mmf |
| getf.sig r24 = f36 C lo |
| st8 [r10] = r14, 8 |
| xma.l f38 = f34, f6, f46 |
| } |
| {.mfi |
| ld8 r21 = [r9], 8 |
| xma.hu f42 = f34, f6, f46 |
| (p6) add r8 = 1, r8 |
| ;; C 04 |
| } |
| {.mmi |
| .LL11: ldf8 f46 = [rp], 8 |
| cmp.ltu p6, p0 = r25, r8 |
| sub r14 = r25, r8 |
| } |
| {.mmi |
| getf.sig r28 = f40 C hi |
| ldf8 f34 = [up], 8 |
| sub r8 = r22, r29 |
| ;; C 05 |
| } |
| {.mmf |
| getf.sig r25 = f37 C lo |
| st8 [r10] = r14, 8 |
| xma.l f39 = f35, f6, f47 |
| } |
| {.mfi |
| ld8 r22 = [r9], 8 |
| xma.hu f43 = f35, f6, f47 |
| (p6) add r8 = 1, r8 |
| ;; C 06 |
| } |
| {.mmi |
| .LL10: ldf8 f47 = [rp], 8 |
| cmp.ltu p6, p0 = r26, r8 |
| sub r14 = r26, r8 |
| } |
| {.mmi |
| getf.sig r29 = f41 C hi |
| ldf8 f35 = [up], 8 |
| sub r8 = r23, r30 |
| ;; C 07 |
| } |
| {.mmf |
| getf.sig r26 = f38 C lo |
| st8 [r10] = r14, 8 |
| xma.l f36 = f32, f6, f44 |
| } |
| {.mfi |
| ld8 r23 = [r9], 8 |
| xma.hu f40 = f32, f6, f44 |
| (p6) add r8 = 1, r8 |
| } |
| br.cloop.dptk .Loop |
| ;; |
| |
| C Wind-down. No further ldf8 loads are issued. Each step repeats the per-limb |
| C sequence of the main loop (cmp.ltu into p6, sub into r14, st8 through r10, |
| C r8 = u - hi plus 1 if p6), with fewer look-ahead xma, getf.sig and ld8 |
| C instructions at each step as the pipeline drains. |
| C Entering at .LcjK leaves exactly K limbs still to be stored; .Lend is the |
| C entry used when the main loop (or the .grt5 feed-in) runs out of ar.lc. |
| .Lend: |
| cmp.ltu p6, p0 = r27, r8 |
| sub r14 = r27, r8 |
| getf.sig r30 = f42 |
| sub r8 = r20, r31 |
| ;; |
| getf.sig r27 = f39 |
| st8 [r10] = r14, 8 |
| xma.l f37 = f33, f6, f45 |
| ld8 r20 = [r9], 8 |
| xma.hu f41 = f33, f6, f45 |
| (p6) add r8 = 1, r8 |
| ;; |
| .Lcj8: |
| cmp.ltu p6, p0 = r24, r8 |
| sub r14 = r24, r8 |
| getf.sig r31 = f43 |
| sub r8 = r21, r28 |
| ;; |
| getf.sig r24 = f36 |
| st8 [r10] = r14, 8 |
| xma.l f38 = f34, f6, f46 |
| ld8 r21 = [r9], 8 |
| xma.hu f42 = f34, f6, f46 |
| (p6) add r8 = 1, r8 |
| ;; |
| .Lcj7: |
| cmp.ltu p6, p0 = r25, r8 |
| sub r14 = r25, r8 |
| getf.sig r28 = f40 |
| sub r8 = r22, r29 |
| ;; |
| getf.sig r25 = f37 |
| st8 [r10] = r14, 8 |
| xma.l f39 = f35, f6, f47 |
| ld8 r22 = [r9], 8 |
| xma.hu f43 = f35, f6, f47 |
| (p6) add r8 = 1, r8 |
| ;; |
| C From .Lcj6 on, all multiplies have been issued; only result fetches |
| C (getf.sig) and integer up loads (ld8) remain as look-ahead work. |
| .Lcj6: |
| cmp.ltu p6, p0 = r26, r8 |
| sub r14 = r26, r8 |
| getf.sig r29 = f41 |
| sub r8 = r23, r30 |
| ;; |
| getf.sig r26 = f38 |
| st8 [r10] = r14, 8 |
| ld8 r23 = [r9], 8 |
| (p6) add r8 = 1, r8 |
| ;; |
| .Lcj5: |
| cmp.ltu p6, p0 = r27, r8 |
| sub r14 = r27, r8 |
| getf.sig r30 = f42 |
| sub r8 = r20, r31 |
| ;; |
| getf.sig r27 = f39 |
| st8 [r10] = r14, 8 |
| ld8 r20 = [r9], 8 |
| (p6) add r8 = 1, r8 |
| ;; |
| .Lcj4: |
| cmp.ltu p6, p0 = r24, r8 |
| sub r14 = r24, r8 |
| getf.sig r31 = f43 |
| sub r8 = r21, r28 |
| ;; |
| st8 [r10] = r14, 8 |
| (p6) add r8 = 1, r8 |
| ;; |
| C From .Lcj3 on, every lo, hi and u value is already in an integer register. |
| .Lcj3: |
| cmp.ltu p6, p0 = r25, r8 |
| sub r14 = r25, r8 |
| sub r8 = r22, r29 |
| ;; |
| st8 [r10] = r14, 8 |
| (p6) add r8 = 1, r8 |
| ;; |
| .Lcj2: |
| cmp.ltu p6, p0 = r26, r8 |
| sub r14 = r26, r8 |
| sub r8 = r23, r30 |
| ;; |
| st8 [r10] = r14, 8 |
| (p6) add r8 = 1, r8 |
| ;; |
| C Last limb: store it, restore the ar.lc value saved in r2, and return with |
| C the final carry limb in r8 (presumably the function result, r8 being the |
| C usual IA-64 integer return register). |
| .Lcj1: |
| cmp.ltu p6, p0 = r27, r8 |
| sub r14 = r27, r8 |
| sub r8 = r20, r31 |
| ;; |
| st8 [r10] = r14, 8 |
| mov ar.lc = r2 |
| (p6) add r8 = 1, r8 |
| br.ret.sptk.many b0 |
| C .Ldone: exit taken when vl == 0. Restores ar.lc from r2 and returns without |
| C storing anything; r8 still holds the zero written at entry. |
| .Ldone: mov ar.lc = r2 |
| br.ret.sptk.many b0 |
| EPILOGUE() |
| ASM_END() |