/* nss-3.41/nss/lib/freebl/mpi/mpi_sparc.c - manifest_repos/nss - Git at Google */

 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

 /* Multiplication performance enhancements for sparc v8+vis CPUs. */

 #include "mpi-priv.h"
 #include <stddef.h>
 #include <sys/systeminfo.h>
 #include <strings.h>

 /* In the functions below, */
 /* vector y must be 8-byte aligned, and n must be even */
 /* returns carry out of high order word of result */
 /* maximum n is 256 */

 /* vector x += vector y * scalar a; where y is of length n words. */
 /* Returns the carry out of the high-order word of the result. */
 extern mp_digit mul_add_inp(mp_digit *x, const mp_digit *y, int n, mp_digit a);

 /* vector z = vector x + vector y * scalar a; where y is of length n words. */
 /* Returns the carry out of the high-order word of the result. */
 extern mp_digit mul_add(mp_digit *z, const mp_digit *x, const mp_digit *y,
                         int n, mp_digit a);

 /* v8 versions of these functions run on any Sparc v8 CPU. */

 /*
  * MP_MUL_DxD(a, b, Phi, Plo): multiply the digits a and b through an
  * unsigned long long product; Plo receives the product truncated to an
  * mp_digit and Phi receives the product shifted right by MP_DIGIT_BIT.
  * This trick works on Sparc V8 CPUs with the Workshop compilers.
  *
  * Every argument is parenthesized so that expression arguments are
  * multiplied as a unit, and the body is wrapped in do/while(0) so the
  * macro behaves as one statement (safe in an unbraced if/else).
  */
 #define MP_MUL_DxD(a, b, Phi, Plo)                                          \
     do {                                                                    \
         unsigned long long mp_dxd_product_ = (unsigned long long)(a) * (b); \
         (Plo) = (mp_digit)mp_dxd_product_;                                  \
         (Phi) = (mp_digit)(mp_dxd_product_ >> MP_DIGIT_BIT);                \
     } while (0)

 /*
  * c = a * b
  *
  * Multiplies the a_len digits at a by the single digit b.  One result
  * digit is written to c per input digit, and the final carry digit is
  * stored in c[a_len].
  */
 static void
 v8_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
 {
 #if !defined(MP_NO_MP_WORD)
     mp_digit carry = 0;

     /* One double-width multiply-accumulate per digit of a. */
     for (; a_len != 0; --a_len) {
         mp_word acc = (mp_word)b * *a + carry;

         ++a;
         *c = ACCUM(acc);
         ++c;
         carry = CARRYOUT(acc);
     }
     *c = carry;
 #else
     mp_digit hi_prev = 0;

     for (; a_len != 0; --a_len) {
         mp_digit digit = *a;
         mp_digit lo, hi;

         ++a;
         MP_MUL_DxD(digit, b, hi, lo);

         /* Add the previous high half; wraparound means a carry into hi. */
         lo += hi_prev;
         if (lo < hi_prev) {
             ++hi;
         }
         *c = lo;
         ++c;
         hi_prev = hi;
     }
     *c = hi_prev;
 #endif
 }

 /*
  * c += a * b
  *
  * Adds the product of the a_len digits at a and the digit b into the
  * first a_len digits of c.  The final carry digit is then stored into
  * c[a_len] (assigned, not added to what was there).
  */
 static void
 v8_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
 {
 #if !defined(MP_NO_MP_WORD)
     mp_digit carry = 0;

     /* One double-width multiply-accumulate per digit of a. */
     for (; a_len != 0; --a_len) {
         mp_word acc = (mp_word)b * *a + *c + carry;

         ++a;
         *c = ACCUM(acc);
         ++c;
         carry = CARRYOUT(acc);
     }
     *c = carry;
 #else
     mp_digit hi_prev = 0;

     for (; a_len != 0; --a_len) {
         mp_digit digit = *a;
         mp_digit addend;
         mp_digit lo, hi;

         ++a;
         MP_MUL_DxD(digit, b, hi, lo);

         /* Add the previous high half; wraparound means a carry into hi. */
         lo += hi_prev;
         if (lo < hi_prev) {
             ++hi;
         }
         /* Add the existing digit of c with the same carry detection. */
         addend = *c;
         lo += addend;
         if (lo < addend) {
             ++hi;
         }
         *c = lo;
         ++c;
         hi_prev = hi;
     }
     *c = hi_prev;
 #endif
 }

 /*
  * c += a * b, with carry propagation.
  * Presently, this is only used by the Montgomery arithmetic code.
  *
  * Adds the product of the a_len digits at a and the digit b into c, then
  * keeps adding the remaining carry into successive digits of c until the
  * carry becomes zero.
  */
 static void
 v8_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
 {
 #if !defined(MP_NO_MP_WORD)
     mp_digit carry = 0;

     /* One double-width multiply-accumulate per digit of a. */
     for (; a_len != 0; --a_len) {
         mp_word acc = (mp_word)b * *a + *c + carry;

         ++a;
         *c = ACCUM(acc);
         ++c;
         carry = CARRYOUT(acc);
     }

     /* Ripple the leftover carry through the higher digits of c. */
     while (carry) {
         mp_word acc = (mp_word)*c + carry;

         *c = ACCUM(acc);
         ++c;
         carry = CARRYOUT(acc);
     }
 #else
     mp_digit hi_prev = 0;

     for (; a_len != 0; --a_len) {
         mp_digit digit = *a;
         mp_digit addend;
         mp_digit lo, hi;

         ++a;
         MP_MUL_DxD(digit, b, hi, lo);

         /* Add the previous high half; wraparound means a carry into hi. */
         lo += hi_prev;
         if (lo < hi_prev) {
             ++hi;
         }

         /* Add the existing digit of c with the same carry detection. */
         addend = *c;
         lo += addend;
         if (lo < addend) {
             ++hi;
         }

         *c = lo;
         ++c;
         hi_prev = hi;
     }

     /* Ripple the leftover carry through the higher digits of c. */
     while (hi_prev) {
         mp_digit old = *c;
         mp_digit sum = hi_prev + old;

         *c = sum;
         ++c;
         hi_prev = sum < old;
     }
 #endif
 }

 /* These functions run only on v8plus+vis or v9+vis CPUs. */

 /*
  * c = a * b
  *
  * For a_len <= 256 the work is done by mul_add_inp() on a cleared c.
  * The input is first copied into an 8-byte aligned scratch buffer when
  * a aliases c, when a is not 8-byte aligned, or when a_len is odd (the
  * copy is then padded with one zero digit).  Longer inputs are handed
  * to v8_mpv_mul_d().
  */
 void
 s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
 {
     mp_digit scratch[258]; /* room for the copy plus alignment slack */
     mp_digit carry_out;
     int odd_len;

     if (a_len > 256) {
         v8_mpv_mul_d(a, a_len, b, c);
         return;
     }

     odd_len = (a_len & 1) != 0;
     if (a == c || ((ptrdiff_t)a & 0x7) != 0 || odd_len) {
         mp_digit *copy = scratch;

         /* Skip one digit if the scratch array itself is not 8-byte aligned. */
         if (((ptrdiff_t)scratch & 0x7) != 0) {
             copy = scratch + 1;
         }
         memcpy(copy, a, a_len * sizeof(*a));
         if (odd_len) {
             copy[a_len] = 0;
         }
         a = copy;
     }

     /* c is reset via s_mp_setz() first because mul_add_inp() accumulates. */
     s_mp_setz(c, a_len + 1);
     carry_out = mul_add_inp(c, a, a_len, b);
     c[a_len] = carry_out;
 }

 /*
  * c += a * b, where a is a_len words long.
  *
  * For a_len <= 256 the work is done by mul_add_inp(), and its returned
  * carry is stored in c[a_len].  The input is first copied into an 8-byte
  * aligned scratch buffer when a is not 8-byte aligned or a_len is odd
  * (the copy is then padded with one zero digit).  Longer inputs are
  * handed to v8_mpv_mul_d_add().
  */
 void
 s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
 {
     mp_digit scratch[258]; /* room for the copy plus alignment slack */
     mp_digit carry_out;
     int odd_len;

     if (a_len > 256) {
         v8_mpv_mul_d_add(a, a_len, b, c);
         return;
     }

     odd_len = (a_len & 1) != 0;
     if (((ptrdiff_t)a & 0x7) != 0 || odd_len) {
         mp_digit *copy = scratch;

         /* Skip one digit if the scratch array itself is not 8-byte aligned. */
         if (((ptrdiff_t)scratch & 0x7) != 0) {
             copy = scratch + 1;
         }
         memcpy(copy, a, a_len * sizeof(*a));
         if (odd_len) {
             copy[a_len] = 0;
         }
         a = copy;
     }

     carry_out = mul_add_inp(c, a, a_len, b);
     c[a_len] = carry_out;
 }

 /*
  * c += a * b, where a is a_len words long, with carry propagation.
  *
  * For a_len <= 256 the work is done by mul_add_inp(); a non-zero carry
  * returned by it is then added into c[a_len] and rippled upward until it
  * dies out.  The input is first copied into an 8-byte aligned scratch
  * buffer when a is not 8-byte aligned or a_len is odd (the copy is then
  * padded with one zero digit).  Longer inputs are handed to
  * v8_mpv_mul_d_add_prop().
  */
 void
 s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
 {
     mp_digit scratch[258]; /* room for the copy plus alignment slack */
     mp_digit carry_out;
     int odd_len;

     if (a_len > 256) {
         v8_mpv_mul_d_add_prop(a, a_len, b, c);
         return;
     }

     odd_len = (a_len & 1) != 0;
     if (((ptrdiff_t)a & 0x7) != 0 || odd_len) {
         mp_digit *copy = scratch;

         /* Skip one digit if the scratch array itself is not 8-byte aligned. */
         if (((ptrdiff_t)scratch & 0x7) != 0) {
             copy = scratch + 1;
         }
         memcpy(copy, a, a_len * sizeof(*a));
         if (odd_len) {
             copy[a_len] = 0;
         }
         a = copy;
     }

     carry_out = mul_add_inp(c, a, a_len, b);
     if (carry_out != 0) {
         mp_digit *top = c + a_len;

         /* Unsigned wraparound (sum < carry) signals a further carry. */
         while (carry_out != 0) {
             mp_digit sum = carry_out + *top;

             *top = sum;
             ++top;
             carry_out = sum < carry_out;
         }
     }
 }
	/* This Source Code Form is subject to the terms of the Mozilla Public
	* License, v. 2.0. If a copy of the MPL was not distributed with this
	* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

	/* Multiplication performance enhancements for sparc v8+vis CPUs. */

	#include "mpi-priv.h"
	#include <stddef.h>
	#include <sys/systeminfo.h>
	#include <strings.h>

	/* In the functions below, */
	/* vector y must be 8-byte aligned, and n must be even */
	/* returns carry out of high order word of result */
	/* maximum n is 256 */

	/* vector x += vector y * scalar a; where y is of length n words. */
	/* Returns the carry out of the high-order word of the result. */
	extern mp_digit mul_add_inp(mp_digit *x, const mp_digit *y, int n, mp_digit a);

	/* vector z = vector x + vector y * scalar a; where y is of length n words. */
	/* Returns the carry out of the high-order word of the result. */
	extern mp_digit mul_add(mp_digit *z, const mp_digit *x, const mp_digit *y,
	                        int n, mp_digit a);

	/* v8 versions of these functions run on any Sparc v8 CPU. */

	/*
	 * MP_MUL_DxD(a, b, Phi, Plo): multiply the digits a and b through an
	 * unsigned long long product; Plo receives the product truncated to an
	 * mp_digit and Phi receives the product shifted right by MP_DIGIT_BIT.
	 * This trick works on Sparc V8 CPUs with the Workshop compilers.
	 *
	 * Every argument is parenthesized so that expression arguments are
	 * multiplied as a unit, and the body is wrapped in do/while(0) so the
	 * macro behaves as one statement (safe in an unbraced if/else).
	 */
	#define MP_MUL_DxD(a, b, Phi, Plo)                                          \
	    do {                                                                    \
	        unsigned long long mp_dxd_product_ = (unsigned long long)(a) * (b); \
	        (Plo) = (mp_digit)mp_dxd_product_;                                  \
	        (Phi) = (mp_digit)(mp_dxd_product_ >> MP_DIGIT_BIT);                \
	    } while (0)

	/*
	 * c = a * b
	 *
	 * Multiplies the a_len digits at a by the single digit b.  One result
	 * digit is written to c per input digit, and the final carry digit is
	 * stored in c[a_len].
	 */
	static void
	v8_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
	{
	#if !defined(MP_NO_MP_WORD)
	    mp_digit d = 0;

	    /* Inner product: Digits of a */
	    while (a_len--) {
	        mp_word w = ((mp_word)b * *a++) + d;
	        *c++ = ACCUM(w);
	        d = CARRYOUT(w);
	    }
	    *c = d;
	#else
	    mp_digit carry = 0;
	    while (a_len--) {
	        mp_digit a_i = *a++;
	        mp_digit a0b0, a1b1;

	        MP_MUL_DxD(a_i, b, a1b1, a0b0);

	        /* Unsigned wraparound after the add means a carry into the high half. */
	        a0b0 += carry;
	        if (a0b0 < carry)
	            ++a1b1;
	        *c++ = a0b0;
	        carry = a1b1;
	    }
	    *c = carry;
	#endif
	}

	/*
	 * c += a * b
	 *
	 * Adds the product of the a_len digits at a and the digit b into the
	 * first a_len digits of c.  The final carry digit is then stored into
	 * c[a_len] (assigned, not added to what was there).
	 */
	static void
	v8_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
	{
	#if !defined(MP_NO_MP_WORD)
	    mp_digit d = 0;

	    /* Inner product: Digits of a */
	    while (a_len--) {
	        mp_word w = ((mp_word)b * *a++) + *c + d;
	        *c++ = ACCUM(w);
	        d = CARRYOUT(w);
	    }
	    *c = d;
	#else
	    mp_digit carry = 0;
	    while (a_len--) {
	        mp_digit a_i = *a++;
	        mp_digit a0b0, a1b1;

	        MP_MUL_DxD(a_i, b, a1b1, a0b0);

	        /* Unsigned wraparound after each add means a carry into the high half. */
	        a0b0 += carry;
	        if (a0b0 < carry)
	            ++a1b1;
	        a0b0 += a_i = *c; /* a_i is reused to hold the existing digit of c */
	        if (a0b0 < a_i)
	            ++a1b1;
	        *c++ = a0b0;
	        carry = a1b1;
	    }
	    *c = carry;
	#endif
	}

	/*
	 * c += a * b, with carry propagation.
	 * Presently, this is only used by the Montgomery arithmetic code.
	 *
	 * Adds the product of the a_len digits at a and the digit b into c, then
	 * keeps adding the remaining carry into successive digits of c until the
	 * carry becomes zero.
	 */
	static void
	v8_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
	{
	#if !defined(MP_NO_MP_WORD)
	    mp_digit d = 0;

	    /* Inner product: Digits of a */
	    while (a_len--) {
	        mp_word w = ((mp_word)b * *a++) + *c + d;
	        *c++ = ACCUM(w);
	        d = CARRYOUT(w);
	    }

	    /* Ripple the leftover carry through the higher digits of c. */
	    while (d) {
	        mp_word w = (mp_word)*c + d;
	        *c++ = ACCUM(w);
	        d = CARRYOUT(w);
	    }
	#else
	    mp_digit carry = 0;
	    while (a_len--) {
	        mp_digit a_i = *a++;
	        mp_digit a0b0, a1b1;

	        MP_MUL_DxD(a_i, b, a1b1, a0b0);

	        /* Unsigned wraparound after each add means a carry into the high half. */
	        a0b0 += carry;
	        if (a0b0 < carry)
	            ++a1b1;

	        a0b0 += a_i = *c; /* a_i is reused to hold the existing digit of c */
	        if (a0b0 < a_i)
	            ++a1b1;

	        *c++ = a0b0;
	        carry = a1b1;
	    }
	    /* Ripple the leftover carry through the higher digits of c. */
	    while (carry) {
	        mp_digit c_i = *c;
	        carry += c_i;
	        *c++ = carry;
	        carry = carry < c_i;
	    }
	#endif
	}

	/* These functions run only on v8plus+vis or v9+vis CPUs. */

	/*
	 * c = a * b
	 *
	 * For a_len <= 256 the work is done by mul_add_inp() on a cleared c.
	 * The input is first copied into an 8-byte aligned scratch buffer when
	 * a aliases c, when a is not 8-byte aligned, or when a_len is odd (the
	 * copy is then padded with one zero digit).  Longer inputs are handed
	 * to v8_mpv_mul_d().
	 */
	void
	s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
	{
	    mp_digit d;
	    mp_digit x[258];
	    if (a_len <= 256) {
	        if (a == c || ((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) {
	            mp_digit *px;
	            /* Skip one digit if x itself is not 8-byte aligned. */
	            px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x;
	            memcpy(px, a, a_len * sizeof(*a));
	            a = px;
	            if (a_len & 1) {
	                px[a_len] = 0;
	            }
	        }
	        /* c is reset via s_mp_setz() first because mul_add_inp() accumulates. */
	        s_mp_setz(c, a_len + 1);
	        d = mul_add_inp(c, a, a_len, b);
	        c[a_len] = d;
	    } else {
	        v8_mpv_mul_d(a, a_len, b, c);
	    }
	}

	/*
	 * c += a * b, where a is a_len words long.
	 *
	 * For a_len <= 256 the work is done by mul_add_inp(), and its returned
	 * carry is stored in c[a_len].  The input is first copied into an 8-byte
	 * aligned scratch buffer when a is not 8-byte aligned or a_len is odd
	 * (the copy is then padded with one zero digit).  Longer inputs are
	 * handed to v8_mpv_mul_d_add().
	 */
	void
	s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
	{
	    mp_digit d;
	    mp_digit x[258];
	    if (a_len <= 256) {
	        if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) {
	            mp_digit *px;
	            /* Skip one digit if x itself is not 8-byte aligned. */
	            px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x;
	            memcpy(px, a, a_len * sizeof(*a));
	            a = px;
	            if (a_len & 1) {
	                px[a_len] = 0;
	            }
	        }
	        d = mul_add_inp(c, a, a_len, b);
	        c[a_len] = d;
	    } else {
	        v8_mpv_mul_d_add(a, a_len, b, c);
	    }
	}

	/*
	 * c += a * b, where a is a_len words long, with carry propagation.
	 *
	 * For a_len <= 256 the work is done by mul_add_inp(); a non-zero carry
	 * returned by it is then added into c[a_len] and rippled upward until it
	 * dies out.  The input is first copied into an 8-byte aligned scratch
	 * buffer when a is not 8-byte aligned or a_len is odd (the copy is then
	 * padded with one zero digit).  Longer inputs are handed to
	 * v8_mpv_mul_d_add_prop().
	 */
	void
	s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
	{
	    mp_digit d;
	    mp_digit x[258];
	    if (a_len <= 256) {
	        if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) {
	            mp_digit *px;
	            /* Skip one digit if x itself is not 8-byte aligned. */
	            px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x;
	            memcpy(px, a, a_len * sizeof(*a));
	            a = px;
	            if (a_len & 1) {
	                px[a_len] = 0;
	            }
	        }
	        d = mul_add_inp(c, a, a_len, b);
	        if (d) {
	            c += a_len;
	            /* Unsigned wraparound (sum < d) signals a further carry. */
	            do {
	                mp_digit sum = d + *c;
	                *c++ = sum;
	                d = sum < d;
	            } while (d);
	        }
	    } else {
	        v8_mpv_mul_d_add_prop(a, a_len, b, c);
	    }
	}