| dnl ARM Neon mpn_sec_tabselect. |
| |
| dnl Contributed to the GNU project by Torbjรถrn Granlund. |
| |
| dnl Copyright 2011-2013 Free Software Foundation, Inc. |
| |
| dnl This file is part of the GNU MP Library. |
| dnl |
| dnl The GNU MP Library is free software; you can redistribute it and/or modify |
| dnl it under the terms of either: |
| dnl |
| dnl * the GNU Lesser General Public License as published by the Free |
| dnl Software Foundation; either version 3 of the License, or (at your |
| dnl option) any later version. |
| dnl |
| dnl or |
| dnl |
| dnl * the GNU General Public License as published by the Free Software |
| dnl Foundation; either version 2 of the License, or (at your option) any |
| dnl later version. |
| dnl |
| dnl or both in parallel, as here. |
| dnl |
| dnl The GNU MP Library is distributed in the hope that it will be useful, but |
| dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| dnl for more details. |
| dnl |
| dnl You should have received copies of the GNU General Public License and the |
| dnl GNU Lesser General Public License along with the GNU MP Library. If not, |
| dnl see https://www.gnu.org/licenses/. |
| |
| include(`../config.m4') |
| |
| |
| C cycles/limb |
| C StrongARM - |
| C XScale - |
| C Cortex-A7 ? |
| C Cortex-A8 ? |
| C Cortex-A9 1.15 |
| C Cortex-A15 0.65 |
| |
| define(`rp', `r0') |
| define(`tp', `r1') |
| define(`n', `r2') |
| define(`nents', `r3') |
| C define(`which', on stack) |
| |
| define(`i', `r4') |
| define(`j', `r5') |
| |
| define(`maskq', `q10') |
| define(`maskd', `d20') |
| |
| ASM_START() |
| PROLOGUE(mpn_sec_tabselect) |
| push {r4-r5} |
| |
| add r4, sp, #8 |
| vld1.32 {d30[], d31[]}, [r4] C 4 `which' copies |
| vmov.i32 q14, #1 C 4 copies of 1 |
| |
| subs j, n, #8 |
| bmi L(outer_end) |
| |
| L(outer_top): |
| mov i, nents |
| mov r12, tp C preserve tp |
| veor q13, q13, q13 C 4 counter copies |
| veor q2, q2, q2 |
| veor q3, q3, q3 |
| ALIGN(16) |
| L(top): vceq.i32 maskq, q13, q15 C compare idx copies to `which' copies |
| vld1.32 {q0,q1}, [tp] |
| vadd.i32 q13, q13, q14 |
| vbit q2, q0, maskq |
| vbit q3, q1, maskq |
| add tp, tp, n, lsl #2 |
| subs i, i, #1 |
| bne L(top) |
| vst1.32 {q2,q3}, [rp]! |
| add tp, r12, #32 C restore tp, point to next slice |
| subs j, j, #8 |
| bpl L(outer_top) |
| L(outer_end): |
| |
| tst n, #4 |
| beq L(b0xx) |
| L(b1xx):mov i, nents |
| mov r12, tp |
| veor q13, q13, q13 |
| veor q2, q2, q2 |
| ALIGN(16) |
| L(tp4): vceq.i32 maskq, q13, q15 |
| vld1.32 {q0}, [tp] |
| vadd.i32 q13, q13, q14 |
| vbit q2, q0, maskq |
| add tp, tp, n, lsl #2 |
| subs i, i, #1 |
| bne L(tp4) |
| vst1.32 {q2}, [rp]! |
| add tp, r12, #16 |
| |
| L(b0xx):tst n, #2 |
| beq L(b00x) |
| L(b01x):mov i, nents |
| mov r12, tp |
| veor d26, d26, d26 |
| veor d4, d4, d4 |
| ALIGN(16) |
| L(tp2): vceq.i32 maskd, d26, d30 |
| vld1.32 {d0}, [tp] |
| vadd.i32 d26, d26, d28 |
| vbit d4, d0, maskd |
| add tp, tp, n, lsl #2 |
| subs i, i, #1 |
| bne L(tp2) |
| vst1.32 {d4}, [rp]! |
| add tp, r12, #8 |
| |
| L(b00x):tst n, #1 |
| beq L(b000) |
| L(b001):mov i, nents |
| mov r12, tp |
| veor d26, d26, d26 |
| veor d4, d4, d4 |
| ALIGN(16) |
| L(tp1): vceq.i32 maskd, d26, d30 |
| vld1.32 {d0[0]}, [tp] |
| vadd.i32 d26, d26, d28 |
| vbit d4, d0, maskd |
| add tp, tp, n, lsl #2 |
| subs i, i, #1 |
| bne L(tp1) |
| vst1.32 {d4[0]}, [rp] |
| |
| L(b000):pop {r4-r5} |
| bx r14 |
| EPILOGUE() |