/* nss-3.73/nss/lib/freebl/sha256-x86.c - manifest_repos/nss - Git at Google */

 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

 #ifdef USE_HW_SHA2

 #include <immintrin.h>

 #ifdef FREEBL_NO_DEPEND
 #include "stubs.h"
 #endif

 #include "blapii.h"
 #include "prcpucfg.h"
 #include "prtypes.h" /* for PRUintXX */
 #include "prlong.h"
 #include "blapi.h"
 #include "sha256.h"

 /*
  * SHA-256 constants, K256: the table of 64 32-bit round constants.
  *
  * The functions in this file do not index the table word by word; they
  * reinterpret it as sixteen 128-bit vectors (four constants each) and load
  * them with the aligned load _mm_load_si128 into k0..k15.  That load
  * requires 16-byte alignment, which is presumably what the project's
  * pre_align/post_align macros provide (TODO confirm against their
  * definition).
  */
 pre_align static const PRUint32 K256[64] post_align = {
     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
     0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
     0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
     0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
     0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
     0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
     0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
     0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
     0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
     0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
     0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
     0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
     0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
     0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
     0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
 };

 /*
  * ROUND(n, a, b, c, d): one four-word step of the compression loop.
  *
  *   n          - step number, written as a literal so that k##n pastes to
  *                the name of a constant vector (k0..k15 at the call sites
  *                in this file).
  *   a, b, c, d - the four 128-bit message-schedule vectors.  `a` is the one
  *                consumed by this step and, for n < 12, it is overwritten,
  *                so it must be an assignable variable.
  *
  * The macro is not self-contained: it reads and writes the variables w0 and
  * w1 and reads k<n>, all of which must be in scope where it is expanded.
  *
  * What it does:
  *   - t = a + k<n> (lane-wise 32-bit add).
  *   - _mm_sha256rnds2_epu32 is applied twice, first updating w1 and then
  *     w0.  Between the calls, _mm_shuffle_epi32(t, 0x0e) moves the upper
  *     two lanes of t into the lower two, so the second call consumes the
  *     other half of the sums.
  *   - For n < 12 only, `a` is replaced by a new schedule vector computed
  *     from a, b, c and d with _mm_sha256msg1_epu32, an add of
  *     _mm_alignr_epi8(d, c, 4), and _mm_sha256msg2_epu32.  The last four
  *     steps (12..15) skip this.
  *
  * The expansion is a bare { ... } block rather than do { } while (0); the
  * call sites in this file invoke it without a trailing semicolon.
  */
 #define ROUND(n, a, b, c, d)                                \
     {                                                       \
         __m128i t = _mm_add_epi32(a, k##n);                 \
         w1 = _mm_sha256rnds2_epu32(w1, w0, t);              \
         t = _mm_shuffle_epi32(t, 0x0e);                     \
         w0 = _mm_sha256rnds2_epu32(w0, w1, t);              \
         if (n < 12) {                                       \
             a = _mm_sha256msg1_epu32(a, b);                 \
             a = _mm_add_epi32(a, _mm_alignr_epi8(d, c, 4)); \
             a = _mm_sha256msg2_epu32(a, d);                 \
         }                                                   \
     }

 /*
  * Apply the SHA-256 compression function, via the x86 SHA-extension
  * intrinsics, to the single 64-byte block buffered in ctx->u.b and fold the
  * result into the chaining state stored at ctx->h.
  *
  * ctx - hash context.  64 bytes are read from ctx->u.b; the 32 bytes of
  *       state at ctx->h (two 128-bit halves, at ctx->h and ctx->h + 4) are
  *       read and rewritten.  No other field is accessed here.
  */
 void
 SHA256_Compress_Native(SHA256Context *ctx)
 {
     __m128i h0, h1, th;
     __m128i a, b, c, d;
     __m128i w0, w1;
     /* Byte permutation that reverses the four bytes inside every 32-bit
      * lane, i.e. a per-word byte swap of the message data. */
     const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);

     /* View the constant table as 16 vectors.  K256 is a const object, so
      * the cast keeps the qualifier instead of stripping and re-adding it.
      * The names k0..k15 are required: ROUND(n, ...) refers to k##n. */
     const __m128i *K = (const __m128i *)K256;
     const __m128i k0 = _mm_load_si128(K);
     const __m128i k1 = _mm_load_si128(K + 1);
     const __m128i k2 = _mm_load_si128(K + 2);
     const __m128i k3 = _mm_load_si128(K + 3);
     const __m128i k4 = _mm_load_si128(K + 4);
     const __m128i k5 = _mm_load_si128(K + 5);
     const __m128i k6 = _mm_load_si128(K + 6);
     const __m128i k7 = _mm_load_si128(K + 7);
     const __m128i k8 = _mm_load_si128(K + 8);
     const __m128i k9 = _mm_load_si128(K + 9);
     const __m128i k10 = _mm_load_si128(K + 10);
     const __m128i k11 = _mm_load_si128(K + 11);
     const __m128i k12 = _mm_load_si128(K + 12);
     const __m128i k13 = _mm_load_si128(K + 13);
     const __m128i k14 = _mm_load_si128(K + 14);
     const __m128i k15 = _mm_load_si128(K + 15);

     /* The buffered block, addressed as four 16-byte pieces. */
     const __m128i *input = (const __m128i *)ctx->u.b;

     /* Unaligned loads: no alignment is assumed for the context. */
     h0 = _mm_loadu_si128((__m128i *)(ctx->h));
     h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));

     /* Regroup the state for the SHA instructions:
      * H0123:4567 -> H0145:2367 */
     th = _mm_shuffle_epi32(h0, 0xb1);
     h1 = _mm_shuffle_epi32(h1, 0x1b);
     h0 = _mm_alignr_epi8(th, h1, 8);
     h1 = _mm_blend_epi16(h1, th, 0xf0);

     /* Load the 16 message words, byte-swapping each one. */
     a = _mm_shuffle_epi8(_mm_loadu_si128(input), shuffle);
     b = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), shuffle);
     c = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), shuffle);
     d = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), shuffle);

     /* w0/w1 are the working state that ROUND updates; h0/h1 keep the
      * incoming state for the final addition. */
     w0 = h0;
     w1 = h1;

     /* Sixteen steps; the schedule vectors rotate a -> b -> c -> d. */
     ROUND(0, a, b, c, d)
     ROUND(1, b, c, d, a)
     ROUND(2, c, d, a, b)
     ROUND(3, d, a, b, c)
     ROUND(4, a, b, c, d)
     ROUND(5, b, c, d, a)
     ROUND(6, c, d, a, b)
     ROUND(7, d, a, b, c)
     ROUND(8, a, b, c, d)
     ROUND(9, b, c, d, a)
     ROUND(10, c, d, a, b)
     ROUND(11, d, a, b, c)
     ROUND(12, a, b, c, d)
     ROUND(13, b, c, d, a)
     ROUND(14, c, d, a, b)
     ROUND(15, d, a, b, c)

     /* Add the working state back into the incoming state. */
     h0 = _mm_add_epi32(h0, w0);
     h1 = _mm_add_epi32(h1, w1);

     /* Undo the regrouping: H0145:2367 -> H0123:4567 */
     th = _mm_shuffle_epi32(h0, 0x1b);
     h1 = _mm_shuffle_epi32(h1, 0xb1);
     h0 = _mm_blend_epi16(th, h1, 0xf0);
     h1 = _mm_alignr_epi8(h1, th, 8);

     _mm_storeu_si128((__m128i *)ctx->h, h0);
     _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
 }

 /*
  * Absorb inputLen bytes from input into the hash state in ctx, using the
  * x86 SHA-extension intrinsics for every complete 64-byte block.
  *
  * ctx      - hash context.  Updated fields: sizeLo/sizeHi (running byte
  *            count, with carry from sizeLo into sizeHi), u.b (pending
  *            partial block) and h (chaining state).
  * input    - bytes to hash.  Read with unaligned loads and memcpy, so no
  *            alignment is required.
  * inputLen - number of bytes available at input; 0 returns immediately
  *            without touching ctx.
  */
 void
 SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input,
                      unsigned int inputLen)
 {
     __m128i h0, h1, th;
     /* Byte permutation that reverses the four bytes inside every 32-bit
      * lane, i.e. a per-word byte swap of the message data. */
     const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);

     /* View the constant table as 16 vectors.  K256 is a const object, so
      * the cast keeps the qualifier instead of stripping and re-adding it.
      * The names k0..k15 are required: ROUND(n, ...) refers to k##n. */
     const __m128i *K = (const __m128i *)K256;
     const __m128i k0 = _mm_load_si128(K);
     const __m128i k1 = _mm_load_si128(K + 1);
     const __m128i k2 = _mm_load_si128(K + 2);
     const __m128i k3 = _mm_load_si128(K + 3);
     const __m128i k4 = _mm_load_si128(K + 4);
     const __m128i k5 = _mm_load_si128(K + 5);
     const __m128i k6 = _mm_load_si128(K + 6);
     const __m128i k7 = _mm_load_si128(K + 7);
     const __m128i k8 = _mm_load_si128(K + 8);
     const __m128i k9 = _mm_load_si128(K + 9);
     const __m128i k10 = _mm_load_si128(K + 10);
     const __m128i k11 = _mm_load_si128(K + 11);
     const __m128i k12 = _mm_load_si128(K + 12);
     const __m128i k13 = _mm_load_si128(K + 13);
     const __m128i k14 = _mm_load_si128(K + 14);
     const __m128i k15 = _mm_load_si128(K + 15);

     /* Bytes already pending in ctx->u.b: the byte count modulo 64, taken
      * before the count is advanced below. */
     unsigned int inBuf = ctx->sizeLo & 0x3f;
     if (!inputLen) {
         return;
     }

     /* Add inputLen into the count of bytes processed, before processing.
      * Unsigned wrap-around of sizeLo is detected and carried into sizeHi. */
     if ((ctx->sizeLo += inputLen) < inputLen) {
         ctx->sizeHi++;
     }

     /* if data already in buffer, attempt to fill rest of buffer */
     if (inBuf) {
         unsigned int todo = SHA256_BLOCK_LENGTH - inBuf;
         if (inputLen < todo) {
             todo = inputLen;
         }
         memcpy(ctx->u.b + inBuf, input, todo);
         input += todo;
         inputLen -= todo;
         /* Only compress once the buffered block is complete. */
         if (inBuf + todo == SHA256_BLOCK_LENGTH) {
             SHA256_Compress_Native(ctx);
         }
     }

     /* Load the state after the call above, which may have rewritten it. */
     h0 = _mm_loadu_si128((__m128i *)(ctx->h));
     h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));

     /* Regroup the state for the SHA instructions:
      * H0123:4567 -> H0145:2367 */
     th = _mm_shuffle_epi32(h0, 0xb1);
     h1 = _mm_shuffle_epi32(h1, 0x1b);
     h0 = _mm_alignr_epi8(th, h1, 8);
     h1 = _mm_blend_epi16(h1, th, 0xf0);

     /* if enough data to fill one or more whole buffers, process them
      * directly from the caller's memory, without copying into ctx->u.b. */
     while (inputLen >= SHA256_BLOCK_LENGTH) {
         __m128i a, b, c, d;
         __m128i w0, w1;
         /* input points at const data; keep the qualifier on the casts. */
         a = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)input), shuffle);
         b = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(input + 16)), shuffle);
         c = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(input + 32)), shuffle);
         d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(input + 48)), shuffle);
         input += SHA256_BLOCK_LENGTH;
         inputLen -= SHA256_BLOCK_LENGTH;

         /* w0/w1 are the working state that ROUND updates; h0/h1 keep the
          * state from before this block for the addition below. */
         w0 = h0;
         w1 = h1;

         ROUND(0, a, b, c, d)
         ROUND(1, b, c, d, a)
         ROUND(2, c, d, a, b)
         ROUND(3, d, a, b, c)
         ROUND(4, a, b, c, d)
         ROUND(5, b, c, d, a)
         ROUND(6, c, d, a, b)
         ROUND(7, d, a, b, c)
         ROUND(8, a, b, c, d)
         ROUND(9, b, c, d, a)
         ROUND(10, c, d, a, b)
         ROUND(11, d, a, b, c)
         ROUND(12, a, b, c, d)
         ROUND(13, b, c, d, a)
         ROUND(14, c, d, a, b)
         ROUND(15, d, a, b, c)

         h0 = _mm_add_epi32(h0, w0);
         h1 = _mm_add_epi32(h1, w1);
     }

     /* Undo the regrouping: H0145:2367 -> H0123:4567 */
     th = _mm_shuffle_epi32(h0, 0x1b);
     h1 = _mm_shuffle_epi32(h1, 0xb1);
     h0 = _mm_blend_epi16(th, h1, 0xf0);
     h1 = _mm_alignr_epi8(h1, th, 8);

     _mm_storeu_si128((__m128i *)ctx->h, h0);
     _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);

     /* if data left over, fill it into buffer */
     if (inputLen) {
         memcpy(ctx->u.b, input, inputLen);
     }
 }

 #endif /* USE_HW_SHA2 */
	/* This Source Code Form is subject to the terms of the Mozilla Public
	* License, v. 2.0. If a copy of the MPL was not distributed with this
	* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

	#ifdef USE_HW_SHA2

	#include <immintrin.h>

	#ifdef FREEBL_NO_DEPEND
	#include "stubs.h"
	#endif

	#include "blapii.h"
	#include "prcpucfg.h"
	#include "prtypes.h" /* for PRUintXX */
	#include "prlong.h"
	#include "blapi.h"
	#include "sha256.h"

	/*
	 * SHA-256 constants, K256: the table of 64 32-bit round constants.
	 *
	 * The functions in this file do not index the table word by word; they
	 * reinterpret it as sixteen 128-bit vectors (four constants each) and
	 * load them with the aligned load _mm_load_si128 into k0..k15.  That load
	 * requires 16-byte alignment, which is presumably what the project's
	 * pre_align/post_align macros provide (TODO confirm against their
	 * definition).
	 */
	pre_align static const PRUint32 K256[64] post_align = {
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
	};

	/*
	 * ROUND(n, a, b, c, d): one four-word step of the compression loop.
	 *
	 *   n          - step number, written as a literal so that k##n pastes to
	 *                the name of a constant vector (k0..k15 at the call sites
	 *                in this file).
	 *   a, b, c, d - the four 128-bit message-schedule vectors.  `a` is the
	 *                one consumed by this step and, for n < 12, it is
	 *                overwritten, so it must be an assignable variable.
	 *
	 * The macro is not self-contained: it reads and writes the variables w0
	 * and w1 and reads k<n>, all of which must be in scope where it is
	 * expanded.
	 *
	 * What it does:
	 *   - t = a + k<n> (lane-wise 32-bit add).
	 *   - _mm_sha256rnds2_epu32 is applied twice, first updating w1 and then
	 *     w0.  Between the calls, _mm_shuffle_epi32(t, 0x0e) moves the upper
	 *     two lanes of t into the lower two, so the second call consumes the
	 *     other half of the sums.
	 *   - For n < 12 only, `a` is replaced by a new schedule vector computed
	 *     from a, b, c and d with _mm_sha256msg1_epu32, an add of
	 *     _mm_alignr_epi8(d, c, 4), and _mm_sha256msg2_epu32.  The last four
	 *     steps (12..15) skip this.
	 *
	 * The expansion is a bare { ... } block rather than do { } while (0); the
	 * call sites in this file invoke it without a trailing semicolon.
	 */
	#define ROUND(n, a, b, c, d) \
	{ \
	__m128i t = _mm_add_epi32(a, k##n); \
	w1 = _mm_sha256rnds2_epu32(w1, w0, t); \
	t = _mm_shuffle_epi32(t, 0x0e); \
	w0 = _mm_sha256rnds2_epu32(w0, w1, t); \
	if (n < 12) { \
	a = _mm_sha256msg1_epu32(a, b); \
	a = _mm_add_epi32(a, _mm_alignr_epi8(d, c, 4)); \
	a = _mm_sha256msg2_epu32(a, d); \
	} \
	}

	/*
	 * Apply the SHA-256 compression function, via the x86 SHA-extension
	 * intrinsics, to the single 64-byte block buffered in ctx->u.b and fold
	 * the result into the chaining state stored at ctx->h.
	 *
	 * ctx - hash context.  64 bytes are read from ctx->u.b; the 32 bytes of
	 *       state at ctx->h (two 128-bit halves, at ctx->h and ctx->h + 4)
	 *       are read and rewritten.  No other field is accessed here.
	 */
	void
	SHA256_Compress_Native(SHA256Context *ctx)
	{
	    __m128i h0, h1, th;
	    __m128i a, b, c, d;
	    __m128i w0, w1;
	    /* Byte permutation that reverses the four bytes inside every 32-bit
	     * lane, i.e. a per-word byte swap of the message data. */
	    const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);

	    /* K must be a pointer: it is offset (K + 1, ...) and handed to
	     * _mm_load_si128.  K256 is a const object, so the cast keeps the
	     * qualifier.  The names k0..k15 are required: ROUND(n, ...) refers
	     * to k##n. */
	    const __m128i *K = (const __m128i *)K256;
	    const __m128i k0 = _mm_load_si128(K);
	    const __m128i k1 = _mm_load_si128(K + 1);
	    const __m128i k2 = _mm_load_si128(K + 2);
	    const __m128i k3 = _mm_load_si128(K + 3);
	    const __m128i k4 = _mm_load_si128(K + 4);
	    const __m128i k5 = _mm_load_si128(K + 5);
	    const __m128i k6 = _mm_load_si128(K + 6);
	    const __m128i k7 = _mm_load_si128(K + 7);
	    const __m128i k8 = _mm_load_si128(K + 8);
	    const __m128i k9 = _mm_load_si128(K + 9);
	    const __m128i k10 = _mm_load_si128(K + 10);
	    const __m128i k11 = _mm_load_si128(K + 11);
	    const __m128i k12 = _mm_load_si128(K + 12);
	    const __m128i k13 = _mm_load_si128(K + 13);
	    const __m128i k14 = _mm_load_si128(K + 14);
	    const __m128i k15 = _mm_load_si128(K + 15);

	    /* The buffered block, addressed as four 16-byte pieces (pointer, not
	     * a vector value: it is passed to _mm_loadu_si128 below). */
	    const __m128i *input = (const __m128i *)ctx->u.b;

	    /* Unaligned loads: no alignment is assumed for the context. */
	    h0 = _mm_loadu_si128((__m128i *)(ctx->h));
	    h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));

	    /* Regroup the state for the SHA instructions:
	     * H0123:4567 -> H0145:2367 */
	    th = _mm_shuffle_epi32(h0, 0xb1);
	    h1 = _mm_shuffle_epi32(h1, 0x1b);
	    h0 = _mm_alignr_epi8(th, h1, 8);
	    h1 = _mm_blend_epi16(h1, th, 0xf0);

	    /* Load the 16 message words, byte-swapping each one. */
	    a = _mm_shuffle_epi8(_mm_loadu_si128(input), shuffle);
	    b = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), shuffle);
	    c = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), shuffle);
	    d = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), shuffle);

	    /* w0/w1 are the working state that ROUND updates; h0/h1 keep the
	     * incoming state for the final addition. */
	    w0 = h0;
	    w1 = h1;

	    /* Sixteen steps; the schedule vectors rotate a -> b -> c -> d. */
	    ROUND(0, a, b, c, d)
	    ROUND(1, b, c, d, a)
	    ROUND(2, c, d, a, b)
	    ROUND(3, d, a, b, c)
	    ROUND(4, a, b, c, d)
	    ROUND(5, b, c, d, a)
	    ROUND(6, c, d, a, b)
	    ROUND(7, d, a, b, c)
	    ROUND(8, a, b, c, d)
	    ROUND(9, b, c, d, a)
	    ROUND(10, c, d, a, b)
	    ROUND(11, d, a, b, c)
	    ROUND(12, a, b, c, d)
	    ROUND(13, b, c, d, a)
	    ROUND(14, c, d, a, b)
	    ROUND(15, d, a, b, c)

	    /* Add the working state back into the incoming state. */
	    h0 = _mm_add_epi32(h0, w0);
	    h1 = _mm_add_epi32(h1, w1);

	    /* Undo the regrouping: H0145:2367 -> H0123:4567 */
	    th = _mm_shuffle_epi32(h0, 0x1b);
	    h1 = _mm_shuffle_epi32(h1, 0xb1);
	    h0 = _mm_blend_epi16(th, h1, 0xf0);
	    h1 = _mm_alignr_epi8(h1, th, 8);

	    _mm_storeu_si128((__m128i *)ctx->h, h0);
	    _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
	}

	/*
	 * Absorb inputLen bytes from input into the hash state in ctx, using the
	 * x86 SHA-extension intrinsics for every complete 64-byte block.
	 *
	 * ctx      - hash context (pointer: the body updates it through ctx->).
	 *            Updated fields: sizeLo/sizeHi (running byte count, with
	 *            carry from sizeLo into sizeHi), u.b (pending partial block)
	 *            and h (chaining state).
	 * input    - bytes to hash (pointer: the body advances it and copies
	 *            from it).  Read with unaligned loads and memcpy, so no
	 *            alignment is required.
	 * inputLen - number of bytes available at input; 0 returns immediately
	 *            without touching ctx.
	 */
	void
	SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input,
	                     unsigned int inputLen)
	{
	    __m128i h0, h1, th;
	    /* Byte permutation that reverses the four bytes inside every 32-bit
	     * lane, i.e. a per-word byte swap of the message data. */
	    const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);

	    /* K must be a pointer: it is offset (K + 1, ...) and handed to
	     * _mm_load_si128.  K256 is a const object, so the cast keeps the
	     * qualifier.  The names k0..k15 are required: ROUND(n, ...) refers
	     * to k##n. */
	    const __m128i *K = (const __m128i *)K256;
	    const __m128i k0 = _mm_load_si128(K);
	    const __m128i k1 = _mm_load_si128(K + 1);
	    const __m128i k2 = _mm_load_si128(K + 2);
	    const __m128i k3 = _mm_load_si128(K + 3);
	    const __m128i k4 = _mm_load_si128(K + 4);
	    const __m128i k5 = _mm_load_si128(K + 5);
	    const __m128i k6 = _mm_load_si128(K + 6);
	    const __m128i k7 = _mm_load_si128(K + 7);
	    const __m128i k8 = _mm_load_si128(K + 8);
	    const __m128i k9 = _mm_load_si128(K + 9);
	    const __m128i k10 = _mm_load_si128(K + 10);
	    const __m128i k11 = _mm_load_si128(K + 11);
	    const __m128i k12 = _mm_load_si128(K + 12);
	    const __m128i k13 = _mm_load_si128(K + 13);
	    const __m128i k14 = _mm_load_si128(K + 14);
	    const __m128i k15 = _mm_load_si128(K + 15);

	    /* Bytes already pending in ctx->u.b: the byte count modulo 64, taken
	     * before the count is advanced below. */
	    unsigned int inBuf = ctx->sizeLo & 0x3f;
	    if (!inputLen) {
	        return;
	    }

	    /* Add inputLen into the count of bytes processed, before processing.
	     * Unsigned wrap-around of sizeLo is detected and carried into sizeHi. */
	    if ((ctx->sizeLo += inputLen) < inputLen) {
	        ctx->sizeHi++;
	    }

	    /* if data already in buffer, attempt to fill rest of buffer */
	    if (inBuf) {
	        unsigned int todo = SHA256_BLOCK_LENGTH - inBuf;
	        if (inputLen < todo) {
	            todo = inputLen;
	        }
	        memcpy(ctx->u.b + inBuf, input, todo);
	        input += todo;
	        inputLen -= todo;
	        /* Only compress once the buffered block is complete. */
	        if (inBuf + todo == SHA256_BLOCK_LENGTH) {
	            SHA256_Compress_Native(ctx);
	        }
	    }

	    /* Load the state after the call above, which may have rewritten it. */
	    h0 = _mm_loadu_si128((__m128i *)(ctx->h));
	    h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));

	    /* Regroup the state for the SHA instructions:
	     * H0123:4567 -> H0145:2367 */
	    th = _mm_shuffle_epi32(h0, 0xb1);
	    h1 = _mm_shuffle_epi32(h1, 0x1b);
	    h0 = _mm_alignr_epi8(th, h1, 8);
	    h1 = _mm_blend_epi16(h1, th, 0xf0);

	    /* if enough data to fill one or more whole buffers, process them
	     * directly from the caller's memory, without copying into ctx->u.b. */
	    while (inputLen >= SHA256_BLOCK_LENGTH) {
	        __m128i a, b, c, d;
	        __m128i w0, w1;
	        /* input points at const data; keep the qualifier on the casts. */
	        a = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)input), shuffle);
	        b = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(input + 16)), shuffle);
	        c = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(input + 32)), shuffle);
	        d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(input + 48)), shuffle);
	        input += SHA256_BLOCK_LENGTH;
	        inputLen -= SHA256_BLOCK_LENGTH;

	        /* w0/w1 are the working state that ROUND updates; h0/h1 keep the
	         * state from before this block for the addition below. */
	        w0 = h0;
	        w1 = h1;

	        ROUND(0, a, b, c, d)
	        ROUND(1, b, c, d, a)
	        ROUND(2, c, d, a, b)
	        ROUND(3, d, a, b, c)
	        ROUND(4, a, b, c, d)
	        ROUND(5, b, c, d, a)
	        ROUND(6, c, d, a, b)
	        ROUND(7, d, a, b, c)
	        ROUND(8, a, b, c, d)
	        ROUND(9, b, c, d, a)
	        ROUND(10, c, d, a, b)
	        ROUND(11, d, a, b, c)
	        ROUND(12, a, b, c, d)
	        ROUND(13, b, c, d, a)
	        ROUND(14, c, d, a, b)
	        ROUND(15, d, a, b, c)

	        h0 = _mm_add_epi32(h0, w0);
	        h1 = _mm_add_epi32(h1, w1);
	    }

	    /* Undo the regrouping: H0145:2367 -> H0123:4567 */
	    th = _mm_shuffle_epi32(h0, 0x1b);
	    h1 = _mm_shuffle_epi32(h1, 0xb1);
	    h0 = _mm_blend_epi16(th, h1, 0xf0);
	    h1 = _mm_alignr_epi8(h1, th, 8);

	    _mm_storeu_si128((__m128i *)ctx->h, h0);
	    _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);

	    /* if data left over, fill it into buffer */
	    if (inputLen) {
	        memcpy(ctx->u.b, input, inputLen);
	    }
	}

	#endif /* USE_HW_SHA2 */