| dnl IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and |
| dnl store the result in a second limb vector. |
| |
| dnl Contributed to the GNU project by Torbjorn Granlund. |
| |
| dnl Copyright 2000-2004, 2006, 2007 Free Software Foundation, Inc. |
| |
| dnl This file is part of the GNU MP Library. |
| dnl |
| dnl The GNU MP Library is free software; you can redistribute it and/or modify |
| dnl it under the terms of either: |
| dnl |
| dnl * the GNU Lesser General Public License as published by the Free |
| dnl Software Foundation; either version 3 of the License, or (at your |
| dnl option) any later version. |
| dnl |
| dnl or |
| dnl |
| dnl * the GNU General Public License as published by the Free Software |
| dnl Foundation; either version 2 of the License, or (at your option) any |
| dnl later version. |
| dnl |
| dnl or both in parallel, as here. |
| dnl |
| dnl The GNU MP Library is distributed in the hope that it will be useful, but |
| dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| dnl for more details. |
| dnl |
| dnl You should have received copies of the GNU General Public License and the |
| dnl GNU Lesser General Public License along with the GNU MP Library. If not, |
| dnl see https://www.gnu.org/licenses/. |
| |
| include(`../config.m4') |
| |
| C cycles/limb |
| C Itanium: 4.0 |
| C Itanium 2: 2.0 |
| |
| C TODO |
| C * Further optimize feed-in and wind-down code, both for speed and code size. |
| C * Handle low limb input and results specially, using a common stf8 in the |
| C epilogue. |
| C * Use 1 c/l carry propagation scheme in wind-down code. |
| C * Use extra pointer register for `up' to speed up feed-in loads. |
| C * Work out final differences with addmul_1.asm. |
| |
| C INPUT PARAMETERS |
| C Symbolic names for the five incoming registers r32-r36, as used below: |
| C rp is the address the result limbs are stored to |
| C up is the address the source limbs are loaded from |
| C n is the limb count; n mod 4 picks the feed-in path, (n-1)/4 the loop count |
| C vl is the single multiplier limb |
| C cy is the limb added into the first product; only mpn_mul_1c reads it |
| define(`rp', `r32') |
| define(`up', `r33') |
| define(`n', `r34') |
| define(`vl', `r35') |
| define(`cy', `r36') C for mpn_mul_1c |
| |
| ASM_START() |
| C mpn_mul_1 -- multiply the n-limb vector at up by the single limb vl, store |
| C the n low result limbs at rp and leave the final high limb in r8. |
| C |
| C Register roles as used by the code below: |
| C f6 multiplier vl, moved to the FP side with setf.sig |
| C f7 first source limb, loaded before the dispatch |
| C f9 addend for the first product: f0 here, cy in mpn_mul_1c |
| C f32-f35 source limbs in flight |
| C f36-f39 low product halves produced by xma.l |
| C f40-f43 high product halves produced by xma.hu |
| C r16-r19 low halves moved to integer registers |
| C r20-r23 high halves moved to integer registers |
| C r24 low half + previous high half + carry, the limb stored to rp |
| C p6/p7 and p8/p9 carry set / carry clear, the two pairs alternate |
| C r2 saved ar.lc, restored before every return |
| C r31 loop count (n-1)/4 placed in ar.lc |
| PROLOGUE(mpn_mul_1) |
| .prologue |
| .save ar.lc, r2 |
| .body |
| |
| C With the 32-bit ABI, convert both pointers with addp4 and zero-extend n |
| ifdef(`HAVE_ABI_32', |
| ` addp4 rp = 0, rp C M I |
| addp4 up = 0, up C M I |
| zxt4 n = n C I |
| ;; |
| ') |
| C r15 = n-1, f9 = f0 so the first product has no addend, r2 = caller ar.lc |
| {.mfi |
| adds r15 = -1, n C M I |
| mov f9 = f0 C F |
| mov.i r2 = ar.lc C I0 |
| } |
| C Load the first source limb early; r14 = n mod 4 selects the feed-in path |
| {.mmi |
| ldf8 f7 = [up], 8 C M |
| nop.m 0 C M |
| and r14 = 3, n C M I |
| ;; |
| } |
| C Shared entry: mpn_mul_1c branches here after its own setup of f9 |
| .Lcommon: |
| C f6 = vl, r31 = (n-1)/4, p10 set when n mod 4 is 0 |
| {.mii |
| setf.sig f6 = vl C M2 M3 |
| shr.u r31 = r15, 2 C I0 |
| cmp.eq p10, p0 = 0, r14 C M I |
| } |
| C p11 set when n mod 4 is 2, p12 set when n mod 4 is 3 |
| {.mii |
| cmp.eq p11, p0 = 2, r14 C M I |
| cmp.eq p12, p0 = 3, r14 C M I |
| nop.i 0 C I |
| ;; |
| } |
| C r0 != r0 is never true, so p6 and p8 start clear while p7 and p9 start set, |
| C which means no carry is pending. ar.lc receives the loop count. |
| {.mii |
| cmp.ne p6, p7 = r0, r0 C M I |
| mov.i ar.lc = r31 C I0 |
| cmp.ne p8, p9 = r0, r0 C M I |
| } |
| C Dispatch on n mod 4; the remaining case, 1, falls through to .Lb01 |
| {.bbb |
| (p10) br.dptk .Lb00 C B |
| (p11) br.dptk .Lb10 C B |
| (p12) br.dptk .Lb11 C B |
| ;; |
| } |
| |
| C Feed-in for n mod 4 = 1. r20 = 0 stands in for the high half that would |
| C precede the first product when the loop is entered at .LL01. |
| .Lb01: mov r20 = 0 |
| br.cloop.dptk .grt1 C B |
| |
| C n = 1: a single product; low half goes to rp, high half is returned in r8 |
| xma.l f39 = f7, f6, f9 C F |
| xma.hu f43 = f7, f6, f9 C F |
| ;; |
| getf.sig r8 = f43 C M2 |
| stf8 [rp] = f39 C M2 M3 |
| mov.i ar.lc = r2 C I0 |
| br.ret.sptk.many b0 C B |
| |
| C n mod 4 = 1 and n > 1: load four more limbs and start the first product |
| .grt1: |
| ldf8 f32 = [up], 8 |
| ;; |
| ldf8 f33 = [up], 8 |
| ;; |
| ldf8 f34 = [up], 8 |
| xma.l f39 = f7, f6, f9 |
| xma.hu f43 = f7, f6, f9 |
| ;; |
| ldf8 f35 = [up], 8 |
| br.cloop.dptk .grt5 |
| |
| C n = 5: issue the remaining products, store the first low half directly |
| C and join the wind-down at .Lcj5 |
| xma.l f36 = f32, f6, f0 |
| xma.hu f40 = f32, f6, f0 |
| ;; |
| stf8 [rp] = f39, 8 |
| xma.l f37 = f33, f6, f0 |
| xma.hu f41 = f33, f6, f0 |
| ;; |
| getf.sig r21 = f43 |
| getf.sig r18 = f36 |
| xma.l f38 = f34, f6, f0 |
| xma.hu f42 = f34, f6, f0 |
| ;; |
| getf.sig r22 = f40 |
| getf.sig r19 = f37 |
| xma.l f39 = f35, f6, f0 |
| xma.hu f43 = f35, f6, f0 |
| ;; |
| getf.sig r23 = f41 |
| getf.sig r16 = f38 |
| br .Lcj5 |
| |
| C n >= 9: keep loading while priming the pipeline, then enter at .LL01 |
| .grt5: |
| xma.l f36 = f32, f6, f0 |
| xma.hu f40 = f32, f6, f0 |
| ;; |
| getf.sig r17 = f39 |
| ldf8 f32 = [up], 8 |
| xma.l f37 = f33, f6, f0 |
| xma.hu f41 = f33, f6, f0 |
| ;; |
| getf.sig r21 = f43 |
| ldf8 f33 = [up], 8 |
| xma.l f38 = f34, f6, f0 |
| ;; |
| getf.sig r18 = f36 |
| xma.hu f42 = f34, f6, f0 |
| ;; |
| getf.sig r22 = f40 |
| ldf8 f34 = [up], 8 |
| xma.l f39 = f35, f6, f0 |
| ;; |
| getf.sig r19 = f37 |
| xma.hu f43 = f35, f6, f0 |
| br .LL01 |
| |
| |
| C Feed-in for n mod 4 = 2. r23 = 0 stands in for the high half that would |
| C precede the first product when the loop is entered at .LL10. |
| .Lb10: ldf8 f35 = [up], 8 |
| mov r23 = 0 |
| br.cloop.dptk .grt2 |
| |
| C n = 2: the second xma pair takes the first high half f42 as its addend, |
| C so no integer carry handling is needed; r8 gets the final high half |
| xma.l f38 = f7, f6, f9 |
| xma.hu f42 = f7, f6, f9 |
| ;; |
| stf8 [rp] = f38, 8 |
| xma.l f39 = f35, f6, f42 |
| xma.hu f43 = f35, f6, f42 |
| ;; |
| getf.sig r8 = f43 |
| stf8 [rp] = f39 |
| mov.i ar.lc = r2 |
| br.ret.sptk.many b0 |
| |
| |
| C n mod 4 = 2 and n > 2: load four more limbs and start the first products |
| .grt2: |
| ldf8 f32 = [up], 8 |
| ;; |
| ldf8 f33 = [up], 8 |
| xma.l f38 = f7, f6, f9 |
| xma.hu f42 = f7, f6, f9 |
| ;; |
| ldf8 f34 = [up], 8 |
| xma.l f39 = f35, f6, f0 |
| xma.hu f43 = f35, f6, f0 |
| ;; |
| ldf8 f35 = [up], 8 |
| br.cloop.dptk .grt6 |
| |
| C n = 6: store the first low half directly and join the wind-down at .Lcj6 |
| stf8 [rp] = f38, 8 |
| xma.l f36 = f32, f6, f0 |
| xma.hu f40 = f32, f6, f0 |
| ;; |
| getf.sig r20 = f42 |
| getf.sig r17 = f39 |
| xma.l f37 = f33, f6, f0 |
| xma.hu f41 = f33, f6, f0 |
| ;; |
| getf.sig r21 = f43 |
| getf.sig r18 = f36 |
| xma.l f38 = f34, f6, f0 |
| xma.hu f42 = f34, f6, f0 |
| ;; |
| getf.sig r22 = f40 |
| getf.sig r19 = f37 |
| xma.l f39 = f35, f6, f0 |
| xma.hu f43 = f35, f6, f0 |
| br .Lcj6 |
| |
| C n >= 10: prime the pipeline and enter the loop at .LL10 |
| .grt6: |
| getf.sig r16 = f38 |
| xma.l f36 = f32, f6, f0 |
| xma.hu f40 = f32, f6, f0 |
| ;; |
| getf.sig r20 = f42 |
| ldf8 f32 = [up], 8 |
| xma.l f37 = f33, f6, f0 |
| ;; |
| getf.sig r17 = f39 |
| xma.hu f41 = f33, f6, f0 |
| ;; |
| getf.sig r21 = f43 |
| ldf8 f33 = [up], 8 |
| xma.l f38 = f34, f6, f0 |
| ;; |
| getf.sig r18 = f36 |
| xma.hu f42 = f34, f6, f0 |
| br .LL10 |
| |
| |
| C Feed-in for n mod 4 = 3. r22 = 0 stands in for the high half that would |
| C precede the first product when the loop is entered at .LL11. |
| .Lb11: ldf8 f34 = [up], 8 |
| mov r22 = 0 |
| ;; |
| ldf8 f35 = [up], 8 |
| br.cloop.dptk .grt3 |
| ;; |
| |
| C n = 3: issue all three products, store the first low half directly and |
| C let .Lcj3 sum the other two limbs; r8 gets the last high half |
| xma.l f37 = f7, f6, f9 |
| xma.hu f41 = f7, f6, f9 |
| xma.l f38 = f34, f6, f0 |
| xma.hu f42 = f34, f6, f0 |
| xma.l f39 = f35, f6, f0 |
| xma.hu f43 = f35, f6, f0 |
| ;; |
| getf.sig r23 = f41 |
| stf8 [rp] = f37, 8 |
| getf.sig r16 = f38 |
| getf.sig r20 = f42 |
| getf.sig r17 = f39 |
| getf.sig r8 = f43 |
| br .Lcj3 |
| |
| C n mod 4 = 3 and n > 3: load four more limbs and start the first products |
| .grt3: |
| ldf8 f32 = [up], 8 |
| xma.l f37 = f7, f6, f9 |
| xma.hu f41 = f7, f6, f9 |
| ;; |
| ldf8 f33 = [up], 8 |
| xma.l f38 = f34, f6, f0 |
| xma.hu f42 = f34, f6, f0 |
| ;; |
| getf.sig r19 = f37 |
| ldf8 f34 = [up], 8 |
| xma.l f39 = f35, f6, f0 |
| xma.hu f43 = f35, f6, f0 |
| ;; |
| getf.sig r23 = f41 |
| ldf8 f35 = [up], 8 |
| br.cloop.dptk .grt7 |
| |
| C n = 7: store the first low half from r19 and join the wind-down at .Lcj7 |
| getf.sig r16 = f38 |
| xma.l f36 = f32, f6, f0 |
| getf.sig r20 = f42 |
| xma.hu f40 = f32, f6, f0 |
| ;; |
| getf.sig r17 = f39 |
| xma.l f37 = f33, f6, f0 |
| getf.sig r21 = f43 |
| xma.hu f41 = f33, f6, f0 |
| ;; |
| getf.sig r18 = f36 |
| st8 [rp] = r19, 8 |
| xma.l f38 = f34, f6, f0 |
| xma.hu f42 = f34, f6, f0 |
| br .Lcj7 |
| |
| C n >= 11: prime the pipeline and enter the loop at .LL11 |
| .grt7: |
| getf.sig r16 = f38 |
| xma.l f36 = f32, f6, f0 |
| xma.hu f40 = f32, f6, f0 |
| ;; |
| getf.sig r20 = f42 |
| ldf8 f32 = [up], 8 |
| xma.l f37 = f33, f6, f0 |
| ;; |
| getf.sig r17 = f39 |
| xma.hu f41 = f33, f6, f0 |
| br .LL11 |
| |
| |
| C Feed-in for n mod 4 = 0. r21 = 0 stands in for the high half that would |
| C precede the first product when the loop is entered at .LL00. |
| .Lb00: ldf8 f33 = [up], 8 |
| mov r21 = 0 |
| ;; |
| ldf8 f34 = [up], 8 |
| ;; |
| ldf8 f35 = [up], 8 |
| xma.l f36 = f7, f6, f9 |
| xma.hu f40 = f7, f6, f9 |
| br.cloop.dptk .grt4 |
| |
| C n = 4: store the first low half directly and join the wind-down at .Lcj4 |
| xma.l f37 = f33, f6, f0 |
| xma.hu f41 = f33, f6, f0 |
| xma.l f38 = f34, f6, f0 |
| xma.hu f42 = f34, f6, f0 |
| ;; |
| getf.sig r22 = f40 |
| stf8 [rp] = f36, 8 |
| xma.l f39 = f35, f6, f0 |
| getf.sig r19 = f37 |
| xma.hu f43 = f35, f6, f0 |
| ;; |
| getf.sig r23 = f41 |
| getf.sig r16 = f38 |
| getf.sig r20 = f42 |
| getf.sig r17 = f39 |
| br .Lcj4 |
| |
| C n mod 4 = 0 and n > 4: keep loading and start the next products |
| .grt4: |
| ldf8 f32 = [up], 8 |
| xma.l f37 = f33, f6, f0 |
| xma.hu f41 = f33, f6, f0 |
| ;; |
| getf.sig r18 = f36 |
| ldf8 f33 = [up], 8 |
| xma.l f38 = f34, f6, f0 |
| xma.hu f42 = f34, f6, f0 |
| ;; |
| getf.sig r22 = f40 |
| ldf8 f34 = [up], 8 |
| xma.l f39 = f35, f6, f0 |
| ;; |
| getf.sig r19 = f37 |
| getf.sig r23 = f41 |
| xma.hu f43 = f35, f6, f0 |
| ldf8 f35 = [up], 8 |
| br.cloop.dptk .grt8 |
| |
| C n = 8: store the first low half from r18 and join the wind-down at .Lcj8 |
| getf.sig r16 = f38 |
| xma.l f36 = f32, f6, f0 |
| getf.sig r20 = f42 |
| xma.hu f40 = f32, f6, f0 |
| ;; |
| getf.sig r17 = f39 |
| st8 [rp] = r18, 8 |
| xma.l f37 = f33, f6, f0 |
| xma.hu f41 = f33, f6, f0 |
| br .Lcj8 |
| |
| C n >= 12: enter the loop at .LL00 |
| .grt8: |
| getf.sig r16 = f38 |
| xma.l f36 = f32, f6, f0 |
| xma.hu f40 = f32, f6, f0 |
| br .LL00 |
| |
| |
| C One pass is eight instruction groups and stores four limbs. Each pair of |
| C groups moves one low and one high product half to integer registers, issues |
| C the next xma.l/xma.hu pair, loads one source limb, forms r24 = low half + |
| C previous high half + carry, stores r24, and derives the next carry. |
| C Carry rule: with a carry in, the sum wrapped when r24 <= low half (cmp.leu); |
| C without a carry in, it wrapped when r24 < low half (cmp.ltu). |
| C .LL00, .LL11, .LL10 and .LL01 are the entry points used by the feed-in code. |
| C *** MAIN LOOP START *** |
| ALIGN(32) |
| .Loop: |
| .pred.rel "mutex",p6,p7 |
| getf.sig r16 = f38 |
| xma.l f36 = f32, f6, f0 |
| (p6) cmp.leu p8, p9 = r24, r17 |
| st8 [rp] = r24, 8 |
| xma.hu f40 = f32, f6, f0 |
| (p7) cmp.ltu p8, p9 = r24, r17 |
| ;; |
| C Loop entry for n mod 4 = 0 |
| .LL00: |
| .pred.rel "mutex",p8,p9 |
| getf.sig r20 = f42 |
| (p8) add r24 = r18, r21, 1 |
| nop.b 0 |
| ldf8 f32 = [up], 8 |
| (p9) add r24 = r18, r21 |
| nop.b 0 |
| ;; |
| .pred.rel "mutex",p8,p9 |
| getf.sig r17 = f39 |
| xma.l f37 = f33, f6, f0 |
| (p8) cmp.leu p6, p7 = r24, r18 |
| st8 [rp] = r24, 8 |
| xma.hu f41 = f33, f6, f0 |
| (p9) cmp.ltu p6, p7 = r24, r18 |
| ;; |
| C Loop entry for n mod 4 = 3 |
| .LL11: |
| .pred.rel "mutex",p6,p7 |
| getf.sig r21 = f43 |
| (p6) add r24 = r19, r22, 1 |
| nop.b 0 |
| ldf8 f33 = [up], 8 |
| (p7) add r24 = r19, r22 |
| nop.b 0 |
| ;; |
| .pred.rel "mutex",p6,p7 |
| getf.sig r18 = f36 |
| xma.l f38 = f34, f6, f0 |
| (p6) cmp.leu p8, p9 = r24, r19 |
| st8 [rp] = r24, 8 |
| xma.hu f42 = f34, f6, f0 |
| (p7) cmp.ltu p8, p9 = r24, r19 |
| ;; |
| C Loop entry for n mod 4 = 2 |
| .LL10: |
| .pred.rel "mutex",p8,p9 |
| getf.sig r22 = f40 |
| (p8) add r24 = r16, r23, 1 |
| nop.b 0 |
| ldf8 f34 = [up], 8 |
| (p9) add r24 = r16, r23 |
| nop.b 0 |
| ;; |
| .pred.rel "mutex",p8,p9 |
| getf.sig r19 = f37 |
| xma.l f39 = f35, f6, f0 |
| (p8) cmp.leu p6, p7 = r24, r16 |
| st8 [rp] = r24, 8 |
| xma.hu f43 = f35, f6, f0 |
| (p9) cmp.ltu p6, p7 = r24, r16 |
| ;; |
| C Loop entry for n mod 4 = 1 |
| .LL01: |
| .pred.rel "mutex",p6,p7 |
| getf.sig r23 = f41 |
| (p6) add r24 = r17, r20, 1 |
| nop.b 0 |
| ldf8 f35 = [up], 8 |
| (p7) add r24 = r17, r20 |
| br.cloop.dptk .Loop |
| C *** MAIN LOOP END *** |
| ;; |
| |
| C Wind-down: the same add, store and carry steps as the loop but with no |
| C further loads. .Lcj9 is reached by falling out of the loop; the feed-in |
| C code for n = 8 down to n = 3 enters at .Lcj8 down to .Lcj3. |
| .Lcj9: |
| .pred.rel "mutex",p6,p7 |
| getf.sig r16 = f38 |
| xma.l f36 = f32, f6, f0 |
| (p6) cmp.leu p8, p9 = r24, r17 |
| st8 [rp] = r24, 8 |
| xma.hu f40 = f32, f6, f0 |
| (p7) cmp.ltu p8, p9 = r24, r17 |
| ;; |
| .pred.rel "mutex",p8,p9 |
| getf.sig r20 = f42 |
| (p8) add r24 = r18, r21, 1 |
| (p9) add r24 = r18, r21 |
| ;; |
| .pred.rel "mutex",p8,p9 |
| getf.sig r17 = f39 |
| xma.l f37 = f33, f6, f0 |
| (p8) cmp.leu p6, p7 = r24, r18 |
| st8 [rp] = r24, 8 |
| xma.hu f41 = f33, f6, f0 |
| (p9) cmp.ltu p6, p7 = r24, r18 |
| ;; |
| C Entry for n = 8 |
| .Lcj8: |
| .pred.rel "mutex",p6,p7 |
| getf.sig r21 = f43 |
| (p6) add r24 = r19, r22, 1 |
| (p7) add r24 = r19, r22 |
| ;; |
| .pred.rel "mutex",p6,p7 |
| getf.sig r18 = f36 |
| xma.l f38 = f34, f6, f0 |
| (p6) cmp.leu p8, p9 = r24, r19 |
| st8 [rp] = r24, 8 |
| xma.hu f42 = f34, f6, f0 |
| (p7) cmp.ltu p8, p9 = r24, r19 |
| ;; |
| C Entry for n = 7 |
| .Lcj7: |
| .pred.rel "mutex",p8,p9 |
| getf.sig r22 = f40 |
| (p8) add r24 = r16, r23, 1 |
| (p9) add r24 = r16, r23 |
| ;; |
| .pred.rel "mutex",p8,p9 |
| getf.sig r19 = f37 |
| xma.l f39 = f35, f6, f0 |
| (p8) cmp.leu p6, p7 = r24, r16 |
| st8 [rp] = r24, 8 |
| xma.hu f43 = f35, f6, f0 |
| (p9) cmp.ltu p6, p7 = r24, r16 |
| ;; |
| C Entry for n = 6; from here on no further products are issued |
| .Lcj6: |
| .pred.rel "mutex",p6,p7 |
| getf.sig r23 = f41 |
| (p6) add r24 = r17, r20, 1 |
| (p7) add r24 = r17, r20 |
| ;; |
| .pred.rel "mutex",p6,p7 |
| (p6) cmp.leu p8, p9 = r24, r17 |
| (p7) cmp.ltu p8, p9 = r24, r17 |
| getf.sig r16 = f38 |
| st8 [rp] = r24, 8 |
| ;; |
| C Entry for n = 5 |
| .Lcj5: |
| .pred.rel "mutex",p8,p9 |
| getf.sig r20 = f42 |
| (p8) add r24 = r18, r21, 1 |
| (p9) add r24 = r18, r21 |
| ;; |
| .pred.rel "mutex",p8,p9 |
| (p8) cmp.leu p6, p7 = r24, r18 |
| (p9) cmp.ltu p6, p7 = r24, r18 |
| getf.sig r17 = f39 |
| st8 [rp] = r24, 8 |
| ;; |
| C Entry for n = 4; r8 picks up the high half of the last product |
| .Lcj4: |
| .pred.rel "mutex",p6,p7 |
| getf.sig r8 = f43 |
| (p6) add r24 = r19, r22, 1 |
| (p7) add r24 = r19, r22 |
| ;; |
| .pred.rel "mutex",p6,p7 |
| st8 [rp] = r24, 8 |
| (p6) cmp.leu p8, p9 = r24, r19 |
| (p7) cmp.ltu p8, p9 = r24, r19 |
| ;; |
| C Entry for n = 3, which has already placed the last high half in r8 |
| .Lcj3: |
| .pred.rel "mutex",p8,p9 |
| (p8) add r24 = r16, r23, 1 |
| (p9) add r24 = r16, r23 |
| ;; |
| .pred.rel "mutex",p8,p9 |
| st8 [rp] = r24, 8 |
| (p8) cmp.leu p6, p7 = r24, r16 |
| (p9) cmp.ltu p6, p7 = r24, r16 |
| ;; |
| C Last stored limb |
| .Lcj2: |
| .pred.rel "mutex",p6,p7 |
| (p6) add r24 = r17, r20, 1 |
| (p7) add r24 = r17, r20 |
| ;; |
| .pred.rel "mutex",p6,p7 |
| st8 [rp] = r24, 8 |
| (p6) cmp.leu p8, p9 = r24, r17 |
| (p7) cmp.ltu p8, p9 = r24, r17 |
| ;; |
| C Fold the final carry into the high limb in r8, restore ar.lc and return |
| (p8) add r8 = 1, r8 |
| mov.i ar.lc = r2 |
| br.ret.sptk.many b0 |
| EPILOGUE() |
| |
| C mpn_mul_1c -- entry point that takes an extra limb cy. Its setup matches |
| C mpn_mul_1 except that f9, the addend of the first product, is loaded with |
| C cy instead of f0. Control then joins mpn_mul_1 at .Lcommon, which does all |
| C remaining work including the restore of ar.lc and the return. |
| PROLOGUE(mpn_mul_1c) |
| .prologue |
| .save ar.lc, r2 |
| .body |
| |
| C With the 32-bit ABI, convert both pointers with addp4 and zero-extend n |
| ifdef(`HAVE_ABI_32', |
| ` addp4 rp = 0, rp C M I |
| addp4 up = 0, up C M I |
| zxt4 n = n C I |
| ;; |
| ') |
| C r15 = n-1, f9 = cy as the addend of the first product, r2 = caller ar.lc |
| {.mmi |
| adds r15 = -1, n C M I |
| setf.sig f9 = cy C M2 M3 |
| mov.i r2 = ar.lc C I0 |
| } |
| C Load the first source limb, r14 = n mod 4, then continue in mpn_mul_1 |
| {.mmb |
| ldf8 f7 = [up], 8 C M |
| and r14 = 3, n C M I |
| br.sptk .Lcommon |
| ;; |
| } |
| EPILOGUE() |
| ASM_END() |