; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at
; http://mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com


.MODEL FLAT, C
.XMM

;-------------------------------------------------------------------------------
; Constants used by the routines below.
; The table starts on a 16-byte boundary and every entry is exactly 16 bytes,
; so each label is itself 16-byte aligned.
;-------------------------------------------------------------------------------
.DATA
ALIGN 16
Lone            DQ  1, 0                                    ; adds 1 to dword lane 0 (vpaddd counter step)
Ltwo            DQ  2, 0                                    ; adds 2 to dword lane 0 (vpaddd counter step)
Lbswap_mask     DB  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0  ; vpshufb control: reverse all 16 bytes
Lshuff_mask     DQ  0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh    ; not referenced by any code in this file
Lpoly           DQ  01h, 0c200000000000000h                 ; constant for the vpclmulqdq reduction steps

.CODE


;-------------------------------------------------------------------------------
; GFMUL  DST, SRC1, SRC2, XLO, XMID, XSCR, XHI
;
; DST = carry-less product of SRC1 and SRC2, reduced back to 128 bits with two
; vpclmulqdq folding steps against Lpoly.
;
;   DST         receives the result; it is written only by the last
;               instruction, so it may name the same register as SRC1.
;   SRC1, SRC2  128-bit inputs, not modified (unless DST aliases one of them).
;   XLO..XHI    four scratch xmm registers, all clobbered. They must differ
;               from each other and from SRC1 / SRC2.
;-------------------------------------------------------------------------------
GFMUL MACRO DST, SRC1, SRC2, XLO, XMID, XSCR, XHI
    ; Karatsuba operands: swap the qwords (shuffle 4Eh) and xor with the
    ; original, which leaves (hi64 ^ lo64) in both halves.
    vpshufd     XMID, SRC2, 04Eh
    vpxor       XMID, XMID, SRC2
    vpshufd     XSCR, SRC1, 04Eh
    vpxor       XSCR, XSCR, SRC1

    ; Three 64x64 carry-less products: low, high, and the Karatsuba cross term.
    vpclmulqdq  XLO,  SRC2, SRC1, 000h
    vpclmulqdq  XHI,  SRC2, SRC1, 011h
    vpclmulqdq  XMID, XMID, XSCR, 000h

    ; Middle 128 bits = cross ^ low ^ high; split it across the 256-bit result.
    vpxor       XMID, XMID, XLO
    vpxor       XMID, XMID, XHI
    vpslldq     XSCR, XMID, 8
    vpsrldq     XMID, XMID, 8
    vpxor       XLO,  XLO,  XSCR
    vpxor       XHI,  XHI,  XMID

    ; Reduction, step 1: multiply the low qword of XLO by the upper qword of
    ; Lpoly and add the qword-swapped XLO.
    vpshufd     XSCR, XLO, 04Eh
    vpclmulqdq  XMID, XLO, [Lpoly], 010h
    vpxor       XLO,  XMID, XSCR

    ; Reduction, step 2: same operation again.
    vpshufd     XSCR, XLO, 04Eh
    vpclmulqdq  XMID, XLO, [Lpoly], 010h
    vpxor       XLO,  XMID, XSCR

    ; Combine the folded low half with the high half.
    vpxor       DST, XLO, XHI

    ENDM

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the final GCM tag
; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
;                       unsigned char *Tp,
;                       unsigned int Mlen,
;                       unsigned int Alen,
;                       unsigned char* X0,
;                       unsigned char* TAG);
;
; Stack arguments are read after the single "push ebx", so argument k is at
; [esp + 8 + 4*k].
;
; Steps:  T  = [Tp] xor (Mlen*8 in the low qword, Alen*8 in the high qword)
;         T  = GFMUL(T, Htbl[0])
;         [TAG] = byte-reversed T xor [X0]
;
; Uses eax, ecx, edx, xmm0-xmm5; ebx is saved and restored.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmTAG PROC

Htbl    textequ <eax>
Tp      textequ <ecx>
X0      textequ <edx>
TAG     textequ <ebx>

T       textequ <xmm0>
TMP0    textequ <xmm1>

    push        ebx                                     ; ebx holds the TAG pointer below

    vzeroupper

    ; Length block: Mlen into dword lane 0, Alen into dword lane 2,
    ; then both qwords shifted left by 3 (bytes -> bits).
    vpxor       TMP0, TMP0, TMP0
    vpinsrd     TMP0, TMP0, DWORD PTR[esp + 8 + 2*4], 0
    vpinsrd     TMP0, TMP0, DWORD PTR[esp + 8 + 3*4], 2
    vpsllq      TMP0, TMP0, 3

    ; Fold the length block into the running hash value.
    mov         Tp, [esp + 8 + 1*4]
    vpxor       T, TMP0, XMMWORD PTR[Tp]

    ; One last multiplication by the first Htbl entry.
    mov         Htbl, [esp + 8 + 0*4]
    vmovdqu     TMP0, XMMWORD PTR[Htbl]
    GFMUL       T, T, TMP0, xmm2, xmm3, xmm4, xmm5

    ; Back to memory byte order, mask with [X0], write the tag.
    mov         X0,  [esp + 8 + 4*4]
    mov         TAG, [esp + 8 + 5*4]
    vpshufb     T, T, [Lbswap_mask]
    vpxor       T, T, [X0]
    vmovdqu     XMMWORD PTR[TAG], T
    vzeroupper

    pop         ebx
    ret

intel_aes_gcmTAG ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the H table
; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
;
; Stack arguments are read at entry (nothing pushed), so argument k is at
; [esp + 4 + 4*k].
;
; Table layout written here (16 bytes per entry):
;   Htbl[0..7]   the doubled hash key H' and its successive GFMUL powers
;   Htbl[8..15]  for each of those, (high qword xor low qword) replicated in
;                both halves, i.e. the pre-computed Karatsuba operand
;
; Uses eax, ecx, edx and xmm0-xmm5 only.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmINIT PROC

Htbl    textequ <eax>
KS      textequ <ecx>
NR      textequ <edx>

T       textequ <xmm0>
TMP0    textequ <xmm1>

    mov         NR,   [esp + 4 + 2*4]
    mov         KS,   [esp + 4 + 1*4]
    mov         Htbl, [esp + 4 + 0*4]

    vzeroupper

    ; AES-ENC(0): the state starts as round key 0 (zero block xor KS[0]).
    vmovdqu     T, XMMWORD PTR[KS]
    add         KS, 16
    dec         NR                                      ; NR-1 middle rounds follow
LhashKeyRound:
    vaesenc     T, T, [KS]
    add         KS, 16
    dec         NR                                      ; sets ZF for the jnz; add above does not matter
    jnz         LhashKeyRound

    vaesenclast T, T, [KS]
    vpshufb     T, T, [Lbswap_mask]

    ;Calculate H` = GFMUL(H, 2)
    vpslld      xmm4, T, 1                              ; each dword shifted left by one
    vpsrad      xmm3, T, 31                             ; sign-fill every dword ...
    vpshufd     xmm3, xmm3, 0ffh                        ; ... and broadcast the top one
    vpand       xmm5, xmm3, [Lpoly]                     ; Lpoly if the top bit was set, else 0
    vpsrld      xmm3, T, 31                             ; bit shifted out of each dword ...
    vpslldq     xmm3, xmm3, 4                           ; ... carried into the next dword up
    vpxor       T, xmm4, xmm3
    vpxor       T, T, xmm5

    ; Entry 0 and its Karatsuba companion.
    vmovdqa     TMP0, T                                 ; TMP0 keeps H' as the fixed multiplier
    vmovdqu     XMMWORD PTR[Htbl + 0*16], T
    vpshufd     xmm2, T, 04Eh
    vpxor       xmm2, xmm2, T
    vmovdqu     XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2

    ; Entries 1..7: multiply by H' once more for each entry.
    hpow = 1
    WHILE hpow LT 8
        GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
        vmovdqu XMMWORD PTR[Htbl + hpow*16], T
        vpshufd xmm2, T, 04Eh
        vpxor   xmm2, xmm2, T
        vmovdqu XMMWORD PTR[Htbl + 8*16 + hpow*16], xmm2
        hpow = hpow + 1
        ENDM

    vzeroupper
    ret
intel_aes_gcmINIT ENDP


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Authenticate only
; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
;
; Hashes Alen bytes at AAD into the 16-byte running value at Tp, using the
; table built by intel_aes_gcmINIT.  Returns immediately when Alen is 0.
;
; Alen is processed in whole 16-byte blocks: the (Alen / 16) mod 8 leading
; blocks first, then groups of eight.  Bits 0-3 of Alen are ignored.
;
; Data flow: T holds the low 128 bits and Xhi the high 128 bits of the
; not-yet-reduced product.  The reduction of one group is interleaved with
; the multiplications of the next group and finished at Ldone.
;
; Uses eax, ecx, edx, xmm0-xmm7; ebx and esi are saved and restored.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmAAD PROC

Htbl    textequ <eax>
inp     textequ <ecx>
len     textequ <edx>
Tp      textequ <ebx>
hlp0    textequ <esi>

DATA    textequ <xmm0>
T       textequ <xmm1>
TMP0    textequ <xmm2>
TMP1    textequ <xmm3>
TMP2    textequ <xmm4>
TMP3    textequ <xmm5>
TMP4    textequ <xmm6>
Xhi     textequ <xmm7>

; Accumulate DATA * Htbl[i]:  TMP0 ^= low product, TMP1 ^= high product,
; TMP2 ^= Karatsuba cross product.  Clobbers TMP3.
KARATSUBA_AAD MACRO i
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
    vpxor       TMP2, TMP2, TMP3
ENDM

    ; Nothing is pushed yet: return address at [esp], Alen (argument 2) at
    ; [esp + 4 + 2*4].
    cmp         DWORD PTR[esp + 1*4 + 2*4], 0
    jnz         LbeginAAD
    ret

LbeginAAD:
    push        ebx
    push        esi

    ; Two pushes + return address = 3 dwords below the first argument.
    mov         Htbl, [esp + 4*3 + 0*4]
    mov         inp,  [esp + 4*3 + 1*4]
    mov         len,  [esp + 4*3 + 2*4]
    mov         Tp,   [esp + 4*3 + 3*4]

    vzeroupper

    vpxor       Xhi, Xhi, Xhi

    vmovdqu     T, XMMWORD PTR[Tp]
    ;we hash 8 block each iteration, if the total amount of blocks is not a multiple of 8, we hash the first n%8 blocks first
    mov         hlp0, len
    and         hlp0, 128-16                            ; bytes in the prefix, whole blocks only
    jnz         LprefixAAD

    ; No prefix blocks.  The code from Lmod_loop on computes fold(T) xor Xhi,
    ; and folding an all-zero T yields zero, so hand the incoming value over
    ; as the already-folded half.  It then reaches the first block (or the
    ; store at Ldone) unchanged.
    vmovdqa     Xhi, T
    vpxor       T, T, T
    jmp         Lmod_loop

LprefixAAD:
    and         len, -128                               ; bytes left for the 8-block loop
    sub         hlp0, 16                                ; table offset for the first prefix block

    ; Prefix block: the only one that absorbs the incoming T.
    vmovdqu     DATA, XMMWORD PTR[inp]
    vpshufb     DATA, DATA, [Lbswap_mask]
    vpxor       DATA, DATA, T

    vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + hlp0], 0h
    vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + hlp0], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h

    lea         inp, [inp+16]
    test        hlp0, hlp0
    jz          Lred1                                   ; single prefix block

    ;hash remaining prefix blocks (up to 7 total prefix blocks)
Lpre_loop:

    sub         hlp0, 16                                ; next lower table entry

    vmovdqu     DATA, XMMWORD PTR[inp]
    vpshufb     DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h
    vpxor       TMP2, TMP2, TMP3

    test        hlp0, hlp0
    lea         inp, [inp+16]                           ; lea keeps the flags set by test
    jnz         Lpre_loop

Lred1:

    ; Karatsuba fix-up: cross term -> middle 128 bits, split over Xhi:T.
    vpxor       TMP2, TMP2, TMP0
    vpxor       TMP2, TMP2, TMP1
    vpsrldq     TMP3, TMP2, 8
    vpslldq     TMP2, TMP2, 8

    vpxor       Xhi, TMP1, TMP3
    vpxor       T, TMP0, TMP2

Lmod_loop:

    sub         len, 16*8
    jb          Ldone
    ; The eight blocks are taken from the highest address down, paired with
    ; Htbl[0] .. Htbl[7].  The block at the lowest address comes last because
    ; it needs the fully reduced previous value.

    ; Block #0
    vmovdqu     DATA, XMMWORD PTR[inp + 16*7]
    vpshufb     DATA, DATA, XMMWORD PTR[Lbswap_mask]

    vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + 0*16], 0h
    vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + 0*16], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + 0*16], 0h

    ; Block #1
    vmovdqu     DATA, XMMWORD PTR[inp + 16*6]
    vpshufb     DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 1

    ; Block #2
    vmovdqu     DATA, XMMWORD PTR[inp + 16*5]
    vpshufb     DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP4, T, [Lpoly], 010h                  ;reduction stage 1a
    vpalignr    T, T, T, 8

    KARATSUBA_AAD 2

    vpxor       T, T, TMP4                              ;reduction stage 1b

    ; Block #3
    vmovdqu     DATA, XMMWORD PTR[inp + 16*4]
    vpshufb     DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 3
    ; Block #4
    vmovdqu     DATA, XMMWORD PTR[inp + 16*3]
    vpshufb     DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP4, T, [Lpoly], 010h                  ;reduction stage 2a
    vpalignr    T, T, T, 8

    KARATSUBA_AAD 4

    vpxor       T, T, TMP4                              ;reduction stage 2b
    ; Block #5
    vmovdqu     DATA, XMMWORD PTR[inp + 16*2]
    vpshufb     DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 5

    vpxor       T, T, Xhi                               ;reduction finalize
    ; Block #6
    vmovdqu     DATA, XMMWORD PTR[inp + 16*1]
    vpshufb     DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 6
    ; Block #7
    vmovdqu     DATA, XMMWORD PTR[inp + 16*0]
    vpshufb     DATA, DATA, [Lbswap_mask]
    vpxor       DATA, DATA, T                           ; absorb the reduced previous value
    KARATSUBA_AAD 7
    ; Aggregated 8 blocks, now karatsuba fixup
    vpxor       TMP2, TMP2, TMP0
    vpxor       TMP2, TMP2, TMP1
    vpsrldq     TMP3, TMP2, 8
    vpslldq     TMP2, TMP2, 8

    vpxor       Xhi, TMP1, TMP3
    vpxor       T, TMP0, TMP2

    lea         inp, [inp + 16*8]
    jmp         Lmod_loop

Ldone:
    ; Final reduction of Xhi:T, then write the 128-bit result back.
    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpxor       T, T, Xhi
    vmovdqu     XMMWORD PTR[Tp], T
    vzeroupper

    pop         esi
    pop         ebx
    ret

intel_aes_gcmAAD ENDP


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Encrypt and Authenticate
; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
;
; Reads len bytes at PT, writes len bytes at CT, and folds the ciphertext into
; the running hash kept in Gctx.  Returns immediately when len is 0.
;
; Gctx fields, at the offsets this routine uses:
;   [Gctx + 0]                  Htbl, 16 entries of 16 bytes
;   [Gctx + 16*16 + 1*16]       T, the running hash value
;   [Gctx + 16*16 + 2*16]       counter block; its last dword is the counter
;   [Gctx + 16*16 + 3*16]       pointer to the AES key schedule (KS)
;   [KS + 244]                  number of AES rounds (10, 12, anything else
;                               is treated as 14)
;
; Stack frame (16*16 bytes, 16-byte aligned, addressed from esp):
;   slot 0          scratch for the final partial block
;   slots 1..6      byte-reversed ciphertext blocks waiting to be hashed
;   slots 8..14     counter blocks already xored with round key 0
;
; Processing: groups of 7 blocks (the hash of one group is computed while the
; next group is set up), then single blocks, then one partial block.
;
; On return the counter stored in Gctx is the first value not used by this
; call.
;
; The text aliases and the ROUND / KARATSUBA macros defined here are also used
; by intel_aes_gcmDEC further down in the file.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmENC PROC

PT      textequ <eax>
CT      textequ <ecx>
Htbl    textequ <edx>
Gctx    textequ <edx>
len     textequ <DWORD PTR[ebp + 5*4 + 3*4]>
KS      textequ <esi>
NR      textequ <DWORD PTR[244+KS]>

aluCTR  textequ <ebx>
aluTMP  textequ <edi>

T       textequ <XMMWORD PTR[16*16 + 1*16 + Gctx]>
TMP0    textequ <xmm1>
TMP1    textequ <xmm2>
TMP2    textequ <xmm3>
TMP3    textequ <xmm4>
TMP4    textequ <xmm5>
TMP5    textequ <xmm6>

; CTR1..CTR6 name the same registers as TMP0..TMP5: the hash temporaries are
; dead whenever seven counter blocks are live, and vice versa.
CTR0    textequ <xmm0>
CTR1    textequ <xmm1>
CTR2    textequ <xmm2>
CTR3    textequ <xmm3>
CTR4    textequ <xmm4>
CTR5    textequ <xmm5>
CTR6    textequ <xmm6>

; One AES round with round key i on all seven counter blocks.  Clobbers xmm7.
ROUND MACRO i
    vmovdqu     xmm7, XMMWORD PTR[i*16 + KS]
    vaesenc     CTR0, CTR0, xmm7
    vaesenc     CTR1, CTR1, xmm7
    vaesenc     CTR2, CTR2, xmm7
    vaesenc     CTR3, CTR3, xmm7
    vaesenc     CTR4, CTR4, xmm7
    vaesenc     CTR5, CTR5, xmm7
    vaesenc     CTR6, CTR6, xmm7
ENDM

; Accumulate TMP5 * Htbl[i]:  TMP0 ^= Karatsuba cross product,
; TMP1 ^= high product, TMP2 ^= low product.  Clobbers TMP3 and TMP4.
KARATSUBA MACRO i
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
    vpxor       TMP0, TMP0, TMP3
    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
    vpclmulqdq  TMP3, TMP5, TMP4, 011h
    vpxor       TMP1, TMP1, TMP3
    vpclmulqdq  TMP3, TMP5, TMP4, 000h
    vpxor       TMP2, TMP2, TMP3
ENDM

; Advance the counter and patch it into counter slot i (8*16 + i*16 + esp).
; Only the last dword of the slot changes: byte-swapped counter xor the
; matching dword of round key 0.
NEXTCTR MACRO i
    add         aluCTR, 1
    mov         aluTMP, aluCTR
    bswap       aluTMP
    xor         aluTMP, [3*4 + KS]
    mov         [3*4 + 8*16 + i*16 + esp], aluTMP
ENDM

    ; Nothing pushed yet: len (argument 3) is at [esp + 4 + 3*4].
    cmp         DWORD PTR[1*4 + 3*4 + esp], 0
    jne         LbeginENC
    ret

LbeginENC:

    vzeroupper
    push        ebp
    push        ebx
    push        esi
    push        edi

    mov         ebp, esp                                ; arguments are reached through ebp from here on
    sub         esp, 16*16
    and         esp, -16                                ; the vmovdqa slot accesses need 16-byte alignment

    mov         PT, [ebp + 5*4 + 0*4]
    mov         CT, [ebp + 5*4 + 1*4]
    mov         Gctx, [ebp + 5*4 + 2*4]

    mov         KS, [16*16 + 3*16 + Gctx]

    mov         aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    bswap       aluCTR                                  ; counter as a native integer


    ; Counter slot 0 = counter block xor round key 0.
    vmovdqu     TMP0, XMMWORD PTR[0*16 + KS]
    vpxor       TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu     XMMWORD PTR[8*16 + 0*16 + esp], TMP0

    cmp         len, 16*7
    jb          LEncDataSingles
; Prepare the "top" counters
    ; Slots 1..6 start as copies; NEXTCTR later overwrites their last dword.
    vmovdqu     XMMWORD PTR[8*16 + 1*16 + esp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 2*16 + esp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 3*16 + esp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 4*16 + esp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 5*16 + esp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 6*16 + esp], TMP0

    vmovdqu     CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vpshufb     CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
; Encrypt the initial 7 blocks
    sub         len, 16*7
    vpaddd      CTR1, CTR0, XMMWORD PTR[Lone]
    vpaddd      CTR2, CTR0, XMMWORD PTR[Ltwo]
    vpaddd      CTR3, CTR2, XMMWORD PTR[Lone]
    vpaddd      CTR4, CTR2, XMMWORD PTR[Ltwo]
    vpaddd      CTR5, CTR4, XMMWORD PTR[Lone]
    vpaddd      CTR6, CTR4, XMMWORD PTR[Ltwo]

    vpshufb     CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb     CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb     CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb     CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb     CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb     CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb     CTR6, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqu     xmm7, XMMWORD PTR[0*16 + KS]
    vpxor       CTR0, CTR0, xmm7
    vpxor       CTR1, CTR1, xmm7
    vpxor       CTR2, CTR2, xmm7
    vpxor       CTR3, CTR3, xmm7
    vpxor       CTR4, CTR4, xmm7
    vpxor       CTR5, CTR5, xmm7
    vpxor       CTR6, CTR6, xmm7

    ROUND       1

    ; Counter for slot 0 of the next group: skip the seven values in flight.
    add         aluCTR, 7
    mov         aluTMP, aluCTR
    bswap       aluTMP
    xor         aluTMP, [KS + 3*4]
    mov         [8*16 + 0*16 + 3*4 + esp], aluTMP

    ROUND       2
    NEXTCTR     1
    ROUND       3
    NEXTCTR     2
    ROUND       4
    NEXTCTR     3
    ROUND       5
    NEXTCTR     4
    ROUND       6
    NEXTCTR     5
    ROUND       7
    NEXTCTR     6
    ROUND       8
    ROUND       9
    vmovdqu     xmm7, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          @f

    ROUND       10
    ROUND       11
    vmovdqu     xmm7, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          @f

    ROUND       12
    ROUND       13
    vmovdqu     xmm7, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast CTR0, CTR0, xmm7
    vaesenclast CTR1, CTR1, xmm7
    vaesenclast CTR2, CTR2, xmm7
    vaesenclast CTR3, CTR3, xmm7
    vaesenclast CTR4, CTR4, xmm7
    vaesenclast CTR5, CTR5, xmm7
    vaesenclast CTR6, CTR6, xmm7

    vpxor       CTR0, CTR0, XMMWORD PTR[0*16 + PT]
    vpxor       CTR1, CTR1, XMMWORD PTR[1*16 + PT]
    vpxor       CTR2, CTR2, XMMWORD PTR[2*16 + PT]
    vpxor       CTR3, CTR3, XMMWORD PTR[3*16 + PT]
    vpxor       CTR4, CTR4, XMMWORD PTR[4*16 + PT]
    vpxor       CTR5, CTR5, XMMWORD PTR[5*16 + PT]
    vpxor       CTR6, CTR6, XMMWORD PTR[6*16 + PT]

    vmovdqu     XMMWORD PTR[0*16 + CT], CTR0
    vmovdqu     XMMWORD PTR[1*16 + CT], CTR1
    vmovdqu     XMMWORD PTR[2*16 + CT], CTR2
    vmovdqu     XMMWORD PTR[3*16 + CT], CTR3
    vmovdqu     XMMWORD PTR[4*16 + CT], CTR4
    vmovdqu     XMMWORD PTR[5*16 + CT], CTR5
    vmovdqu     XMMWORD PTR[6*16 + CT], CTR6

    ; Keep the byte-reversed ciphertext for hashing: the newest block stays
    ; in TMP5, the others go to slots 1..6 (oldest block in slot 6).
    vpshufb     CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb     CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb     CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb     CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb     CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb     CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb     TMP5, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqa     XMMWORD PTR[1*16 + esp], CTR5
    vmovdqa     XMMWORD PTR[2*16 + esp], CTR4
    vmovdqa     XMMWORD PTR[3*16 + esp], CTR3
    vmovdqa     XMMWORD PTR[4*16 + esp], CTR2
    vmovdqa     XMMWORD PTR[5*16 + esp], CTR1
    vmovdqa     XMMWORD PTR[6*16 + esp], CTR0

    lea         CT, [7*16 + CT]
    lea         PT, [7*16 + PT]
    ; falls through into the steady-state loop

LEncData7:
    cmp         len, 16*7
    jb          LEndEnc7
    sub         len, 16*7

    ; Hash the previous group: newest block (TMP5) with Htbl[0] ...
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu     TMP5, XMMWORD PTR[1*16 + esp]
    KARATSUBA   1
    vmovdqu     TMP5, XMMWORD PTR[2*16 + esp]
    KARATSUBA   2
    vmovdqu     TMP5, XMMWORD PTR[3*16 + esp]
    KARATSUBA   3
    vmovdqu     TMP5, XMMWORD PTR[4*16 + esp]
    KARATSUBA   4
    vmovdqu     TMP5, XMMWORD PTR[5*16 + esp]
    KARATSUBA   5
    ; ... oldest block absorbs the running hash and pairs with Htbl[6].
    vmovdqu     TMP5, XMMWORD PTR[6*16 + esp]
    vpxor       TMP5, TMP5, T
    KARATSUBA   6

    ; Karatsuba fix-up: TMP4 = high 128 bits, TMP5 = low 128 bits.
    vpxor       TMP0, TMP0, TMP1
    vpxor       TMP0, TMP0, TMP2
    vpsrldq     TMP3, TMP0, 8
    vpxor       TMP4, TMP1, TMP3
    vpslldq     TMP3, TMP0, 8
    vpxor       TMP5, TMP2, TMP3

    ; Two folding steps against Lpoly, then add the high half.
    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpxor       TMP5, TMP5, TMP4
    vmovdqu     T, TMP5

    ; Encrypt the next group from the prepared counter slots while
    ; NEXTCTR refills the slots for the group after it.
    vmovdqa     CTR0, XMMWORD PTR[8*16 + 0*16 + esp]
    vmovdqa     CTR1, XMMWORD PTR[8*16 + 1*16 + esp]
    vmovdqa     CTR2, XMMWORD PTR[8*16 + 2*16 + esp]
    vmovdqa     CTR3, XMMWORD PTR[8*16 + 3*16 + esp]
    vmovdqa     CTR4, XMMWORD PTR[8*16 + 4*16 + esp]
    vmovdqa     CTR5, XMMWORD PTR[8*16 + 5*16 + esp]
    vmovdqa     CTR6, XMMWORD PTR[8*16 + 6*16 + esp]

    ROUND       1
    NEXTCTR     0
    ROUND       2
    NEXTCTR     1
    ROUND       3
    NEXTCTR     2
    ROUND       4
    NEXTCTR     3
    ROUND       5
    NEXTCTR     4
    ROUND       6
    NEXTCTR     5
    ROUND       7
    NEXTCTR     6

    ROUND       8
    ROUND       9

    vmovdqu     xmm7, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          @f

    ROUND       10
    ROUND       11
    vmovdqu     xmm7, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          @f

    ROUND       12
    ROUND       13
    vmovdqu     xmm7, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast CTR0, CTR0, xmm7
    vaesenclast CTR1, CTR1, xmm7
    vaesenclast CTR2, CTR2, xmm7
    vaesenclast CTR3, CTR3, xmm7
    vaesenclast CTR4, CTR4, xmm7
    vaesenclast CTR5, CTR5, xmm7
    vaesenclast CTR6, CTR6, xmm7

    vpxor       CTR0, CTR0, XMMWORD PTR[0*16 + PT]
    vpxor       CTR1, CTR1, XMMWORD PTR[1*16 + PT]
    vpxor       CTR2, CTR2, XMMWORD PTR[2*16 + PT]
    vpxor       CTR3, CTR3, XMMWORD PTR[3*16 + PT]
    vpxor       CTR4, CTR4, XMMWORD PTR[4*16 + PT]
    vpxor       CTR5, CTR5, XMMWORD PTR[5*16 + PT]
    vpxor       CTR6, CTR6, XMMWORD PTR[6*16 + PT]

    vmovdqu     XMMWORD PTR[0*16 + CT], CTR0
    vmovdqu     XMMWORD PTR[1*16 + CT], CTR1
    vmovdqu     XMMWORD PTR[2*16 + CT], CTR2
    vmovdqu     XMMWORD PTR[3*16 + CT], CTR3
    vmovdqu     XMMWORD PTR[4*16 + CT], CTR4
    vmovdqu     XMMWORD PTR[5*16 + CT], CTR5
    vmovdqu     XMMWORD PTR[6*16 + CT], CTR6

    vpshufb     CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb     CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb     CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb     CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb     CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb     CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb     TMP5, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqa     XMMWORD PTR[1*16 + esp], CTR5
    vmovdqa     XMMWORD PTR[2*16 + esp], CTR4
    vmovdqa     XMMWORD PTR[3*16 + esp], CTR3
    vmovdqa     XMMWORD PTR[4*16 + esp], CTR2
    vmovdqa     XMMWORD PTR[5*16 + esp], CTR1
    vmovdqa     XMMWORD PTR[6*16 + esp], CTR0

    lea         CT, [7*16 + CT]
    lea         PT, [7*16 + PT]
    jmp         LEncData7

LEndEnc7:

    ; Hash the last group of seven that is still pending.
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu     TMP5, XMMWORD PTR[1*16 + esp]
    KARATSUBA   1
    vmovdqu     TMP5, XMMWORD PTR[2*16 + esp]
    KARATSUBA   2
    vmovdqu     TMP5, XMMWORD PTR[3*16 + esp]
    KARATSUBA   3
    vmovdqu     TMP5, XMMWORD PTR[4*16 + esp]
    KARATSUBA   4
    vmovdqu     TMP5, XMMWORD PTR[5*16 + esp]
    KARATSUBA   5
    vmovdqu     TMP5, XMMWORD PTR[6*16 + esp]
    vpxor       TMP5, TMP5, T
    KARATSUBA   6

    vpxor       TMP0, TMP0, TMP1
    vpxor       TMP0, TMP0, TMP2
    vpsrldq     TMP3, TMP0, 8
    vpxor       TMP4, TMP1, TMP3
    vpslldq     TMP3, TMP0, 8
    vpxor       TMP5, TMP2, TMP3

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpxor       TMP5, TMP5, TMP4
    vmovdqu     T, TMP5

    ; aluCTR matches counter slot 6 here; step back so that it matches
    ; slot 0, the next counter value to be used.
    sub         aluCTR, 6

LEncDataSingles:

    ; Invariant: counter slot 0 holds the next block to encrypt and aluCTR
    ; is its counter value.
    cmp         len, 16
    jb          LEncDataTail
    sub         len, 16

    vmovdqa     TMP1, XMMWORD PTR[8*16 + 0*16 + esp]
    NEXTCTR     0

    vaesenc     TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
    vpxor       TMP1, TMP1, XMMWORD PTR[PT]
    vmovdqu     XMMWORD PTR[CT], TMP1

    lea         PT, [16+PT]
    lea         CT, [16+CT]

    ; T = GFMUL(T xor byte-reversed ciphertext, Htbl[0])
    vpshufb     TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor       TMP1, TMP1, T

    vmovdqu     TMP0, XMMWORD PTR[Htbl]
    GFMUL       TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu     T, TMP1

    jmp         LEncDataSingles

LEncDataTail:

    cmp         len, 0
    je          LEncDataEnd

    vmovdqa     TMP1, XMMWORD PTR[8*16 + 0*16 + esp]
    inc         aluCTR                                  ; the partial block uses up one counter value

    vaesenc     TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
; zero a temp location
    vpxor       TMP2, TMP2, TMP2
    vmovdqa     XMMWORD PTR[esp], TMP2
; copy as many bytes as needed
    xor         KS, KS                                  ; the key schedule is no longer needed; KS becomes the byte index
    mov         aluTMP, edx                             ; dl is the copy register, so park Gctx in aluTMP
@@:
        cmp     len, KS
        je      @f
        mov     dl, BYTE PTR[PT + KS]
        mov     BYTE PTR[esp + KS], dl
        inc     KS
        jmp     @b
@@:
    vpxor       TMP1, TMP1, XMMWORD PTR[esp]
    vmovdqa     XMMWORD PTR[esp], TMP1
    xor         KS, KS
@@:
        cmp     len, KS
        je      @f
        mov     dl, BYTE PTR[esp + KS]
        mov     BYTE PTR[CT + KS], dl
        inc     KS
        jmp     @b
@@:
        ; Clear the key-stream bytes beyond len so that only real ciphertext
        ; bytes enter the hash.
        cmp     KS, 16
        je      @f
        mov     BYTE PTR[esp + KS], 0
        inc     KS
        jmp     @b
@@:
    mov         edx, aluTMP                             ; Gctx / Htbl back in edx
    vmovdqa     TMP1, XMMWORD PTR[esp]
    vpshufb     TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor       TMP1, TMP1, T

    vmovdqu     TMP0, XMMWORD PTR[Htbl]
    GFMUL       TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu     T, TMP1

LEncDataEnd:
    ; aluCTR is the first unused counter value on every path that gets here.
    bswap       aluCTR
    mov         [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov         esp, ebp
    pop         edi
    pop         esi
    pop         ebx
    pop         ebp


    vzeroupper

    ret
intel_aes_gcmENC ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Decrypt and Authenticate
; void intel_aes_gcmDEC(uint8_t* CT, uint8_t* PT, void *Gctx, unsigned int len);
;
; Argument use in the code below: the first pointer is loaded into CT and
; only read (ciphertext in); the second is loaded into PT and only written
; (plaintext out).  Returns immediately when len is 0.
;
; This routine relies on the text aliases (PT, CT, Gctx, Htbl, len, KS, NR,
; aluCTR, aluTMP, T, TMP0-TMP5, CTR0-CTR6) and on the ROUND and KARATSUBA
; macros defined by intel_aes_gcmENC earlier in the file.
;
; Stack frame (8*16 bytes, 16-byte aligned): slots 0..6 at [i*16 + esp] hold
; counter blocks already xored with round key 0; slot 0 doubles as scratch
; for the final partial block.
;
; Each ciphertext block is hashed before it is decrypted.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


; Redefined for this routine's frame: advance the counter and patch it into
; counter slot i (i*16 + esp).
NEXTCTR MACRO i
    add         aluCTR, 1
    mov         aluTMP, aluCTR
    bswap       aluTMP
    xor         aluTMP, [3*4 + KS]
    mov         [3*4 + i*16 + esp], aluTMP
ENDM

intel_aes_gcmDEC PROC

    ; Nothing pushed yet: len (argument 3) is at [esp + 4 + 3*4].
    cmp         DWORD PTR[1*4 + 3*4 + esp], 0
    jne         LDecStart
    ret

LDecStart:

    vzeroupper
    push        ebp
    push        ebx
    push        esi
    push        edi

    mov         ebp, esp
    sub         esp, 8*16
    and         esp, -16                                ; the vmovdqa slot accesses need 16-byte alignment

    mov         CT, [ebp + 5*4 + 0*4]
    mov         PT, [ebp + 5*4 + 1*4]
    mov         Gctx, [ebp + 5*4 + 2*4]

    mov         KS, [16*16 + 3*16 + Gctx]

    mov         aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    bswap       aluCTR                                  ; counter as a native integer


    ; Slot 0 = counter block xor round key 0.
    vmovdqu     TMP0, XMMWORD PTR[0*16 + KS]
    vpxor       TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu     XMMWORD PTR[0*16 + esp], TMP0

    cmp         len, 16*7
    jb          LDecBlock

    ; Slots 1..6 start as copies of slot 0; NEXTCTR patches their last dword.
    slot = 1
    WHILE slot LT 7
        vmovdqu XMMWORD PTR[slot*16 + esp], TMP0
        slot = slot + 1
        ENDM
    dec         aluCTR                                  ; NEXTCTR 0 below increments before it stores

LDecBulk:
    cmp         len, 16*7
    jb          LDecBulkDone
    sub         len, 16*7

    ; Hash seven ciphertext blocks.  The block at the lowest address absorbs
    ; the running hash and pairs with Htbl[6]; the last one pairs with
    ; Htbl[0].  The counter slots are refreshed in between.
    vmovdqu     TMP5, XMMWORD PTR[0*16 + CT]
    vpshufb     TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    vpxor       TMP5, TMP5, T
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[6*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[6*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    NEXTCTR     0
    vmovdqu     TMP5, XMMWORD PTR[1*16 + CT]
    vpshufb     TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA   5
    NEXTCTR     1
    vmovdqu     TMP5, XMMWORD PTR[2*16 + CT]
    vpshufb     TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA   4
    NEXTCTR     2
    vmovdqu     TMP5, XMMWORD PTR[3*16 + CT]
    vpshufb     TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA   3
    NEXTCTR     3
    vmovdqu     TMP5, XMMWORD PTR[4*16 + CT]
    vpshufb     TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA   2
    NEXTCTR     4
    vmovdqu     TMP5, XMMWORD PTR[5*16 + CT]
    vpshufb     TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA   1
    NEXTCTR     5
    vmovdqu     TMP5, XMMWORD PTR[6*16 + CT]
    vpshufb     TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA   0
    NEXTCTR     6

    ; Karatsuba fix-up: TMP4 = high 128 bits, TMP5 = low 128 bits.
    vpxor       TMP0, TMP0, TMP1
    vpxor       TMP0, TMP0, TMP2
    vpsrldq     TMP3, TMP0, 8
    vpxor       TMP4, TMP1, TMP3
    vpslldq     TMP3, TMP0, 8
    vpxor       TMP5, TMP2, TMP3

    ; Two folding steps against Lpoly, then add the high half.
    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpxor       TMP5, TMP5, TMP4
    vmovdqu     T, TMP5

    ; Key stream for the same seven blocks.
    vmovdqa     CTR0, XMMWORD PTR[0*16 + esp]
    vmovdqa     CTR1, XMMWORD PTR[1*16 + esp]
    vmovdqa     CTR2, XMMWORD PTR[2*16 + esp]
    vmovdqa     CTR3, XMMWORD PTR[3*16 + esp]
    vmovdqa     CTR4, XMMWORD PTR[4*16 + esp]
    vmovdqa     CTR5, XMMWORD PTR[5*16 + esp]
    vmovdqa     CTR6, XMMWORD PTR[6*16 + esp]

    ROUND       1
    ROUND       2
    ROUND       3
    ROUND       4
    ROUND       5
    ROUND       6
    ROUND       7
    ROUND       8
    ROUND       9
    vmovdqu     xmm7, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          @f

    ROUND       10
    ROUND       11
    vmovdqu     xmm7, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          @f

    ROUND       12
    ROUND       13
    vmovdqu     xmm7, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast CTR0, CTR0, xmm7
    vaesenclast CTR1, CTR1, xmm7
    vaesenclast CTR2, CTR2, xmm7
    vaesenclast CTR3, CTR3, xmm7
    vaesenclast CTR4, CTR4, xmm7
    vaesenclast CTR5, CTR5, xmm7
    vaesenclast CTR6, CTR6, xmm7

    vpxor       CTR0, CTR0, XMMWORD PTR[0*16 + CT]
    vpxor       CTR1, CTR1, XMMWORD PTR[1*16 + CT]
    vpxor       CTR2, CTR2, XMMWORD PTR[2*16 + CT]
    vpxor       CTR3, CTR3, XMMWORD PTR[3*16 + CT]
    vpxor       CTR4, CTR4, XMMWORD PTR[4*16 + CT]
    vpxor       CTR5, CTR5, XMMWORD PTR[5*16 + CT]
    vpxor       CTR6, CTR6, XMMWORD PTR[6*16 + CT]

    vmovdqu     XMMWORD PTR[0*16 + PT], CTR0
    vmovdqu     XMMWORD PTR[1*16 + PT], CTR1
    vmovdqu     XMMWORD PTR[2*16 + PT], CTR2
    vmovdqu     XMMWORD PTR[3*16 + PT], CTR3
    vmovdqu     XMMWORD PTR[4*16 + PT], CTR4
    vmovdqu     XMMWORD PTR[5*16 + PT], CTR5
    vmovdqu     XMMWORD PTR[6*16 + PT], CTR6

    add         CT, 7*16                                ; flags are rewritten by the cmp at LDecBulk
    add         PT, 7*16
    jmp         LDecBulk

LDecBulkDone:

    ; Put the next unused counter value into slot 0 for the code below.
    NEXTCTR     0

LDecBlock:

    ; Invariant: slot 0 holds the next counter block and aluCTR is its
    ; counter value.
    cmp         len, 16
    jb          LDecPartial
    sub         len, 16

    ; T = GFMUL(T xor byte-reversed ciphertext, Htbl[0])
    vmovdqu     TMP1, XMMWORD PTR[CT]
    vpshufb     TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor       TMP1, TMP1, T

    vmovdqu     TMP0, XMMWORD PTR[Htbl]
    GFMUL       TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu     T, TMP1

    vmovdqa     TMP1, XMMWORD PTR[0*16 + esp]
    NEXTCTR     0

    rnd = 1
    WHILE rnd LT 10
        vaesenc TMP1, TMP1, XMMWORD PTR[rnd*16 + KS]
        rnd = rnd + 1
        ENDM
    vmovdqu     TMP2, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
    vpxor       TMP1, TMP1, XMMWORD PTR[CT]
    vmovdqu     XMMWORD PTR[PT], TMP1

    add         PT, 16                                  ; flags are rewritten by the cmp at LDecBlock
    add         CT, 16
    jmp         LDecBlock

LDecPartial:

    cmp         len, 0
    je          LDecFinish

    vmovdqa     TMP1, XMMWORD PTR[0*16 + esp]
    inc         aluCTR                                  ; the partial block uses up one counter value
    rnd = 1
    WHILE rnd LT 10
        vaesenc TMP1, TMP1, XMMWORD PTR[rnd*16 + KS]
        rnd = rnd + 1
        ENDM
    vmovdqu     TMP2, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast xmm7, TMP1, TMP2                        ; key stream kept in xmm7, which GFMUL below does not touch

; copy as many bytes as needed
    xor         KS, KS                                  ; the key schedule is no longer needed; KS becomes the byte index
    mov         aluTMP, edx                             ; dl is the copy register, so park Gctx in aluTMP
@@:
        cmp     len, KS
        je      @f
        mov     dl, BYTE PTR[CT + KS]
        mov     BYTE PTR[esp + KS], dl
        inc     KS
        jmp     @b
@@:
        ; Zero-fill the rest of the 16-byte scratch block.
        cmp     KS, 16
        je      @f
        mov     BYTE PTR[esp + KS], 0
        inc     KS
        jmp     @b
@@:
    mov         edx, aluTMP                             ; Gctx / Htbl back in edx
    vmovdqa     TMP1, XMMWORD PTR[esp]
    vpshufb     TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor       TMP1, TMP1, T

    vmovdqu     TMP0, XMMWORD PTR[Htbl]
    GFMUL       TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu     T, TMP1

    ; Decrypt in the scratch block, then copy out only len bytes.
    vpxor       xmm7, xmm7, XMMWORD PTR[esp]
    vmovdqa     XMMWORD PTR[esp], xmm7
    xor         KS, KS
    mov         aluTMP, edx
@@:
        cmp     len, KS
        je      @f
        mov     dl, BYTE PTR[esp + KS]
        mov     BYTE PTR[PT + KS], dl
        inc     KS
        jmp     @b
@@:
    mov         edx, aluTMP

LDecFinish:

    ; Write the counter value back into the last dword of the counter block.
    bswap       aluCTR
    mov         [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov         esp, ebp
    pop         edi
    pop         esi
    pop         ebx
    pop         ebp

    vzeroupper

    ret
intel_aes_gcmDEC ENDP


END