/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifdef USE_HW_SHA1

#ifndef __ARM_FEATURE_CRYPTO
#error "This file requires the ARMv8 Crypto Extensions (e.g. compile with -march=armv8-a+crypto)"
#endif

#ifdef FREEBL_NO_DEPEND
#include "stubs.h"
#endif

#include <arm_neon.h>
#include <string.h> /* memcpy */
#include "blapi.h"
#include "sha_fast.h"

#if !defined(SHA_PUT_W_IN_STACK)
#define H2X 11 /* X[0] is H[11], and H[0] is X[-11] */
#else
#define H2X 0 /* X[0] is H[0] */
#endif
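
/*
 * Example of the indexing: shaCompress() below is called with
 * &ctx->H[H2X], and XH(n) expands to X[n - H2X], so XH(n) always
 * resolves to ctx->H[n] regardless of which H2X value is in effect.
 */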

static void shaCompress(SHA_HW_t *X, const PRUint32 *datain);

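/*
 * SHA: Compress one 64-byte block already held in the context's input buffer.
 */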
void
SHA1_Compress_Native(SHA1Context *ctx)
{
    shaCompress(&ctx->H[H2X], ctx->u.w);
}

/*
 * SHA: Add data to context.
 */
void
SHA1_Update_Native(SHA1Context *ctx, const unsigned char *dataIn, unsigned int len)
{
    unsigned int lenB;
    unsigned int togo;

    if (!len) {
        return;
    }

    /* accumulate the byte count. */
    lenB = (unsigned int)(ctx->size) & 63U;

    ctx->size += len;

    /*
     * Read the data into W and process blocks as they get full
     */
    if (lenB > 0) {
        togo = 64U - lenB;
        if (len < togo) {
            togo = len;
        }
        memcpy(ctx->u.b + lenB, dataIn, togo);
        len -= togo;
        dataIn += togo;
        lenB = (lenB + togo) & 63U;
        if (!lenB) {
            shaCompress(&ctx->H[H2X], ctx->u.w);
        }
    }

    /* Process any remaining whole 64-byte blocks straight from the
     * caller's buffer. */
    while (len >= 64U) {
        len -= 64U;
        shaCompress(&ctx->H[H2X], (const PRUint32 *)dataIn);
        dataIn += 64U;
    }

    if (len) {
        memcpy(ctx->u.b, dataIn, len);
    }
}
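
/*
 * Usage sketch (illustrative only): callers normally do not invoke the
 * native routines directly, but reach them through the generic freebl
 * entry points declared in blapi.h, e.g.:
 *
 *   SHA1Context *cx = SHA1_NewContext();
 *   unsigned char digest[SHA1_LENGTH];
 *   unsigned int digestLen;
 *   SHA1_Begin(cx);
 *   SHA1_Update(cx, data, dataLen);
 *   SHA1_End(cx, digest, &digestLen, SHA1_LENGTH);
 *   SHA1_DestroyContext(cx, PR_TRUE);
 */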

/*
 * SHA: Compression function, unrolled.
 */
static void
shaCompress(SHA_HW_t *X, const PRUint32 *inbuf)
{
/* XH(n) addresses state word n relative to the pointer we were given. */
#define XH(n) X[(n)-H2X]

    /* Round constants K for rounds 0-19, 20-39, 40-59 and 60-79. */
    const uint32x4_t K0 = vdupq_n_u32(0x5a827999);
    const uint32x4_t K1 = vdupq_n_u32(0x6ed9eba1);
    const uint32x4_t K2 = vdupq_n_u32(0x8f1bbcdc);
    const uint32x4_t K3 = vdupq_n_u32(0xca62c1d6);

    /* Load the working state: A..D in one vector, E in a scalar. */
    uint32x4_t abcd = vld1q_u32(&XH(0));
    PRUint32 e = XH(4);

    /* Keep the starting state; it is added back in at the end. */
    const uint32x4_t origABCD = abcd;
    const PRUint32 origE = e;

    /* Load the 16 message words and convert them from big-endian. */
    uint32x4_t w0 = vld1q_u32(inbuf);
    uint32x4_t w1 = vld1q_u32(inbuf + 4);
    uint32x4_t w2 = vld1q_u32(inbuf + 8);
    uint32x4_t w3 = vld1q_u32(inbuf + 12);

    w0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w0)));
    w1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w1)));
    w2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w2)));
    w3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w3)));

    /* W + K values, staged two four-round groups ahead of their use. */
    uint32x4_t t0 = vaddq_u32(w0, K0);
    uint32x4_t t1 = vaddq_u32(w1, K0);

    PRUint32 tmpE;
    /*
     * Use the following ARMv8 Crypto Extension instructions to
     * accelerate SHA-1:
     *
     *   sha1c for rounds 0-19 (Ch function)
     *   sha1p for rounds 20-39 and 60-79 (parity function)
     *   sha1m for rounds 40-59 (Maj function)
     *   sha1su0 and sha1su1 for the message schedule
     *   sha1h for the rotate left by 30
     */
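
    /*
     * Each intrinsic call below retires four rounds at once.  For the
     * message schedule, sha1su0/sha1su1 together compute the standard
     * SHA-1 expansion (scalar sketch):
     *
     *   W[t] = ROTL1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16])   for t >= 16
     *
     * sha1h yields ROTL30 of the current A, which becomes the E input
     * of the following four-round group; `e` and `tmpE` alternate in
     * carrying that value from one group to the next.
     */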

    /* Round 0-3 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K0);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 4-7 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K0);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 8-11 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K0);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 12-15 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K1);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 16-19 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K1);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 20-23 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K1);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 24-27 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K1);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 28-31 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K1);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 32-35 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K2);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 36-39 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K2);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 40-43 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K2);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 44-47 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K2);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 48-51 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K2);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 52-55 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K3);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 56-59 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K3);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 60-63 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K3);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 64-67 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K3);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 68-71 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K3);
    w0 = vsha1su1q_u32(w0, w3);

    /* Round 72-75 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);

    /* Round 76-79 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);

    /* Add the starting state back in and store the result. */
    e += origE;
    abcd = vaddq_u32(origABCD, abcd);

    vst1q_u32(&XH(0), abcd);
    XH(4) = e;
}

#endif /* USE_HW_SHA1 */