| /* This Source Code Form is subject to the terms of the Mozilla Public |
| * License, v. 2.0. If a copy of the MPL was not distributed with this |
| * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
| |
| /* Multiplication performance enhancements for sparc v8+vis CPUs. */ |
| |
| #include "mpi-priv.h" |
| #include <stddef.h> |
| #include <sys/systeminfo.h> |
| #include <strings.h> |
| |
| /* In the functions below, */ |
| /* vector y must be 8-byte aligned, and n must be even */ |
| /* returns carry out of high order word of result */ |
| /* maximum n is 256 */ |
| |
| /* vector x += vector y * scalar a; where y is of length n words. */ |
| extern mp_digit mul_add_inp(mp_digit *x, const mp_digit *y, int n, mp_digit a); |
| |
| /* vector z = vector x + vector y * scalar a; where y is of length n words. */ |
| extern mp_digit mul_add(mp_digit *z, const mp_digit *x, const mp_digit *y, |
| int n, mp_digit a); |
| |
| /* v8 versions of these functions run on any Sparc v8 CPU. */ |
| |
/*
 * MP_MUL_DxD(a, b, Phi, Plo): double-width product of two digits.
 *
 * Computes a * b in an unsigned long long, then stores the low digit of the
 * product in Plo and the bits above MP_DIGIT_BIT in Phi.  Phi and Plo must
 * be assignable mp_digit lvalues.
 *
 * This trick works on Sparc V8 CPUs with the Workshop compilers.
 *
 * Hygiene notes:
 *  - every argument is parenthesized so expression arguments bind correctly;
 *  - the body is wrapped in do { } while (0) so the macro acts as a single
 *    statement (safe in an unbraced if/else); invoke it with a trailing ';';
 *  - the temporary has a long, macro-specific name so it cannot shadow a
 *    caller variable that is passed in as an argument.
 */
#define MP_MUL_DxD(a, b, Phi, Plo)                                          \
    do {                                                                    \
        unsigned long long mp_mul_dxd_product_ =                            \
            (unsigned long long)(a) * (b);                                  \
        (Plo) = (mp_digit)mp_mul_dxd_product_;                              \
        (Phi) = (mp_digit)(mp_mul_dxd_product_ >> MP_DIGIT_BIT);            \
    } while (0)
| |
| /* c = a * b */ |
| static void |
| v8_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) |
| { |
| #if !defined(MP_NO_MP_WORD) |
| mp_digit d = 0; |
| |
| /* Inner product: Digits of a */ |
| while (a_len--) { |
| mp_word w = ((mp_word)b * *a++) + d; |
| *c++ = ACCUM(w); |
| d = CARRYOUT(w); |
| } |
| *c = d; |
| #else |
| mp_digit carry = 0; |
| while (a_len--) { |
| mp_digit a_i = *a++; |
| mp_digit a0b0, a1b1; |
| |
| MP_MUL_DxD(a_i, b, a1b1, a0b0); |
| |
| a0b0 += carry; |
| if (a0b0 < carry) |
| ++a1b1; |
| *c++ = a0b0; |
| carry = a1b1; |
| } |
| *c = carry; |
| #endif |
| } |
| |
| /* c += a * b */ |
| static void |
| v8_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) |
| { |
| #if !defined(MP_NO_MP_WORD) |
| mp_digit d = 0; |
| |
| /* Inner product: Digits of a */ |
| while (a_len--) { |
| mp_word w = ((mp_word)b * *a++) + *c + d; |
| *c++ = ACCUM(w); |
| d = CARRYOUT(w); |
| } |
| *c = d; |
| #else |
| mp_digit carry = 0; |
| while (a_len--) { |
| mp_digit a_i = *a++; |
| mp_digit a0b0, a1b1; |
| |
| MP_MUL_DxD(a_i, b, a1b1, a0b0); |
| |
| a0b0 += carry; |
| if (a0b0 < carry) |
| ++a1b1; |
| a0b0 += a_i = *c; |
| if (a0b0 < a_i) |
| ++a1b1; |
| *c++ = a0b0; |
| carry = a1b1; |
| } |
| *c = carry; |
| #endif |
| } |
| |
| /* Presently, this is only used by the Montgomery arithmetic code. */ |
| /* c += a * b */ |
| static void |
| v8_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) |
| { |
| #if !defined(MP_NO_MP_WORD) |
| mp_digit d = 0; |
| |
| /* Inner product: Digits of a */ |
| while (a_len--) { |
| mp_word w = ((mp_word)b * *a++) + *c + d; |
| *c++ = ACCUM(w); |
| d = CARRYOUT(w); |
| } |
| |
| while (d) { |
| mp_word w = (mp_word)*c + d; |
| *c++ = ACCUM(w); |
| d = CARRYOUT(w); |
| } |
| #else |
| mp_digit carry = 0; |
| while (a_len--) { |
| mp_digit a_i = *a++; |
| mp_digit a0b0, a1b1; |
| |
| MP_MUL_DxD(a_i, b, a1b1, a0b0); |
| |
| a0b0 += carry; |
| if (a0b0 < carry) |
| ++a1b1; |
| |
| a0b0 += a_i = *c; |
| if (a0b0 < a_i) |
| ++a1b1; |
| |
| *c++ = a0b0; |
| carry = a1b1; |
| } |
| while (carry) { |
| mp_digit c_i = *c; |
| carry += c_i; |
| *c++ = carry; |
| carry = carry < c_i; |
| } |
| #endif |
| } |
| |
| /* These functions run only on v8plus+vis or v9+vis CPUs. */ |
| |
| /* c = a * b */ |
| void |
| s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) |
| { |
| mp_digit d; |
| mp_digit x[258]; |
| if (a_len <= 256) { |
| if (a == c || ((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) { |
| mp_digit *px; |
| px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x; |
| memcpy(px, a, a_len * sizeof(*a)); |
| a = px; |
| if (a_len & 1) { |
| px[a_len] = 0; |
| } |
| } |
| s_mp_setz(c, a_len + 1); |
| d = mul_add_inp(c, a, a_len, b); |
| c[a_len] = d; |
| } else { |
| v8_mpv_mul_d(a, a_len, b, c); |
| } |
| } |
| |
| /* c += a * b, where a is a_len words long. */ |
| void |
| s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) |
| { |
| mp_digit d; |
| mp_digit x[258]; |
| if (a_len <= 256) { |
| if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) { |
| mp_digit *px; |
| px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x; |
| memcpy(px, a, a_len * sizeof(*a)); |
| a = px; |
| if (a_len & 1) { |
| px[a_len] = 0; |
| } |
| } |
| d = mul_add_inp(c, a, a_len, b); |
| c[a_len] = d; |
| } else { |
| v8_mpv_mul_d_add(a, a_len, b, c); |
| } |
| } |
| |
| /* c += a * b, where a is y words long. */ |
| void |
| s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) |
| { |
| mp_digit d; |
| mp_digit x[258]; |
| if (a_len <= 256) { |
| if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) { |
| mp_digit *px; |
| px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x; |
| memcpy(px, a, a_len * sizeof(*a)); |
| a = px; |
| if (a_len & 1) { |
| px[a_len] = 0; |
| } |
| } |
| d = mul_add_inp(c, a, a_len, b); |
| if (d) { |
| c += a_len; |
| do { |
| mp_digit sum = d + *c; |
| *c++ = sum; |
| d = sum < d; |
| } while (d); |
| } |
| } else { |
| v8_mpv_mul_d_add_prop(a, a_len, b, c); |
| } |
| } |