| /* cast5-amd64.S - AMD64 assembly implementation of CAST5 cipher |
| * |
| * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
| * |
| * This file is part of Libgcrypt. |
| * |
| * Libgcrypt is free software; you can redistribute it and/or modify |
| * it under the terms of the GNU Lesser General Public License as |
| * published by the Free Software Foundation; either version 2.1 of |
| * the License, or (at your option) any later version. |
| * |
| * Libgcrypt is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| * GNU Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public |
| * License along with this program; if not, see <http://www.gnu.org/licenses/>. |
| */ |
| |
| #ifdef __x86_64 |
| #include <config.h> |
| #if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && defined(USE_CAST5) |
| |
| #ifdef __PIC__ |
| # define RIP %rip |
| # define GET_EXTERN_POINTER(name, reg) movq name@GOTPCREL(%rip), reg |
| #else |
| # define RIP |
| # define GET_EXTERN_POINTER(name, reg) leaq name, reg |
| #endif |
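| /* Note: under PIC the absolute address of _gcry_cast5_s1to4 is not |
|  * link-time constant, so it is fetched through the GOT; otherwise a |
|  * plain leaq suffices.  Either way RTAB ends up holding the table |
|  * base used by the sN() offsets below. */ |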
| |
| .text |
| |
| .extern _gcry_cast5_s1to4; |
| |
| #define s1 0 |
| #define s2 (s1 + (4 * 256)) |
| #define s3 (s2 + (4 * 256)) |
| #define s4 (s3 + (4 * 256)) |
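| /* _gcry_cast5_s1to4 packs the four 256-entry 32-bit S-boxes back to |
|  * back, so sN(RTAB, idx, 4) addresses entry idx of box N from the |
|  * common base register. */ |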
| |
| /* structure of CAST5_context: */ |
| #define Km 0 |
| #define Kr (Km + (16 * 4)) |
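| /* This mirrors the C-side CAST5_context: 16 32-bit masking keys Km |
|  * at offset 0, then the 16 rotation keys Kr stored one byte each. |
|  * Keeping Kr byte-packed lets a single 8-byte load (see |
|  * get_round_kr_*) fetch the rotation amounts for eight rounds at |
|  * once. */ |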
| |
| /* register macros */ |
| #define CTX %rdi |
| #define RIO %rsi |
| #define RTAB %r8 |
| |
| #define RLR0 %r9 |
| #define RLR1 %r10 |
| #define RLR2 %r11 |
| #define RLR3 %r12 |
| |
| #define RLR0d %r9d |
| #define RLR1d %r10d |
| #define RLR2d %r11d |
| #define RLR3d %r12d |
| |
| #define RX0 %rax |
| #define RX1 %rbx |
| #define RX2 %rdx |
| |
| #define RX0d %eax |
| #define RX1d %ebx |
| #define RX2d %edx |
| |
| #define RX0bl %al |
| #define RX1bl %bl |
| #define RX2bl %dl |
| |
| #define RX0bh %ah |
| #define RX1bh %bh |
| #define RX2bh %dh |
| |
| #define RKR %rcx |
| #define RKRd %ecx |
| #define RKRbl %cl |
| |
| #define RT0 %rbp |
| #define RT1 %rsi |
| |
| #define RT0d %ebp |
| #define RT1d %esi |
| |
| #define RKM0d %r13d |
| #define RKM1d %r14d |
| |
| /*********************************************************************** |
| * 1-way cast5 |
| ***********************************************************************/ |
| #define dummy(x) |
| |
| #define shr_kr(none) \ |
| shrq $8, RKR; |
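| /* shr_kr discards the spent rotation-key byte so the next round's |
|  * Kr value lands in RKRbl. */ |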
| |
| #define F(km, load_next_kr, op0, op1, op2, op3) \ |
| op0 ## l RLR0d, km ## d; \ |
| roll RKRbl, km ## d; \ |
| rorq $32, RLR0; \ |
| movzbl km ## bh, RT0d; \ |
| movzbl km ## bl, RT1d; \ |
| roll $16, km ## d; \ |
| movl s1(RTAB,RT0,4), RT0d; \ |
| op1 ## l s2(RTAB,RT1,4), RT0d; \ |
| load_next_kr(kr_next); \ |
| movzbl km ## bh, RT1d; \ |
| movzbl km ## bl, km ## d; \ |
| op2 ## l s3(RTAB,RT1,4), RT0d; \ |
| op3 ## l s4(RTAB,km,4), RT0d; \ |
| xorq RT0, RLR0; |
| |
| #define F1(km, load_next_kr) \ |
| F(km, load_next_kr, add, xor, sub, add) |
| #define F2(km, load_next_kr) \ |
| F(km, load_next_kr, xor, sub, add, xor) |
| #define F3(km, load_next_kr) \ |
| F(km, load_next_kr, sub, add, xor, sub) |
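| /* One CAST5 round (RFC 2144, section 2.2).  In C-like pseudocode, |
|  * with D the incoming 32-bit half and op0..op3 the per-type ops: |
|  * |
|  *   I = rol32(Km op0 D, Kr); |
|  *   f = ((s1[I >> 24] op1 s2[(I >> 16) & 0xff]) |
|  *        op2 s3[(I >> 8) & 0xff]) op3 s4[I & 0xff]; |
|  *   other_half ^= f; |
|  * |
|  * F1/F2/F3 are the type 1/2/3 rounds.  Kr is pre-biased by 16 (see |
|  * get_round_kr_*), so after the key-dependent roll the two high |
|  * bytes of I already sit in the addressable bh/bl byte registers, |
|  * and one roll $16 exposes the low two.  The rorq $32 swaps the |
|  * Feistel halves packed in RLR0 so the final xorq lands on the |
|  * other half. */ |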
| |
| #define get_round_km(n, km) \ |
| movl Km+4*(n)(CTX), km; |
| |
| #define get_round_kr_enc(n) \ |
| movq $0x1010101010101010, RKR; \ |
| \ |
| /* merge roll rk and roll $16 (see note below) */ \ |
| xorq Kr+(n)(CTX), RKR; |
| |
| #define get_round_kr_dec(n) \ |
| movq $0x1010101010101010, RKR; \ |
| \ |
| /* merge roll rk and roll $16 (see note below) */ \ |
| xorq Kr+(n - 7)(CTX), RKR; \ |
| bswapq RKR; |
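| /* The 0x10 XOR biases every Kr byte by 16: rotation counts are taken |
|  * mod 32 and (Kr ^ 16) == (Kr + 16) mod 32 for 5-bit Kr, which folds |
|  * the fixed roll $16 that F would otherwise need into the |
|  * key-dependent roll.  For decryption the bswapq reverses the eight |
|  * loaded bytes so shr_kr consumes them in descending round order, |
|  * and the (n - 7) offset rebases n = 15 to Kr+8 and n = 7 to Kr+0. */ |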
| |
| #define round_enc(n, FA, FB, fn1, fn2) \ |
| get_round_km(n + 1, RX2d); \ |
| FA(RX0, fn1); \ |
| get_round_km(n + 2, RX0d); \ |
| FB(RX2, fn2); |
| |
| #define round_enc_last(n, FXA, FXB) \ |
| get_round_km(n + 1, RX2d); \ |
| \ |
| FXA(RX0, shr_kr); \ |
| FXB(RX2, dummy); |
| |
| #define round_enc_1(n, FA, FB) \ |
| round_enc(n, FA, FB, shr_kr, shr_kr) |
| |
| #define round_enc_2(n, FA, FB) \ |
| round_enc(n, FA, FB, shr_kr, dummy) |
| |
| #define round_dec(n, FA, FB, fn1, fn2) \ |
| get_round_km(n - 1, RX2d); \ |
| FA(RX0, fn1); \ |
| get_round_km(n - 2, RX0d); \ |
| FB(RX2, fn2); |
| |
| #define round_dec_last(n, FXA, FXB) \ |
| get_round_km(n - 1, RX2d); \ |
| FXA(RX0, shr_kr); \ |
| FXB(RX2, dummy); |
| |
| #define round_dec_1(n, FA, FB) \ |
| round_dec(n, FA, FB, shr_kr, shr_kr) |
| |
| #define round_dec_2(n, FA, FB) \ |
| round_dec(n, FA, FB, shr_kr, dummy) |
| |
| #define read_block() \ |
| movq (RIO), RLR0; \ |
| bswapq RLR0; |
| |
| #define write_block() \ |
| bswapq RLR0; \ |
| rorq $32, RLR0; \ |
| movq RLR0, (RIO); |
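| /* Blocks are big-endian: the bswapq after the 64-bit load leaves the |
|  * first block half in the high dword of RLR0 and the second in the |
|  * low dword.  write_block also exchanges the dwords, since the |
|  * halves come out of the last round in swapped order. */ |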
| |
| .align 8 |
| .globl _gcry_cast5_amd64_encrypt_block |
| .type _gcry_cast5_amd64_encrypt_block,@function; |
| |
| _gcry_cast5_amd64_encrypt_block: |
| /* input: |
| * %rdi: ctx, CTX |
| * %rsi: dst |
| * %rdx: src |
| */ |
| pushq %rbp; |
| pushq %rbx; |
| |
| movq %rsi, %r10; |
| |
| GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); |
| |
| movq %rdx, RIO; |
| read_block(); |
| |
| get_round_km(0, RX0d); |
| get_round_kr_enc(0); |
| round_enc_1(0, F1, F2); |
| round_enc_1(2, F3, F1); |
| round_enc_1(4, F2, F3); |
| round_enc_2(6, F1, F2); |
| get_round_kr_enc(8); |
| round_enc_1(8, F3, F1); |
| round_enc_1(10, F2, F3); |
| round_enc_1(12, F1, F2); |
| round_enc_last(14, F3, F1); |
| |
| movq %r10, RIO; |
| write_block(); |
| |
| popq %rbx; |
| popq %rbp; |
| ret; |
| .size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block; |
| |
| .align 8 |
| .globl _gcry_cast5_amd64_decrypt_block |
| .type _gcry_cast5_amd64_decrypt_block,@function; |
| |
| _gcry_cast5_amd64_decrypt_block: |
| /* input: |
| * %rdi: ctx, CTX |
| * %rsi: dst |
| * %rdx: src |
| */ |
| pushq %rbp; |
| pushq %rbx; |
| |
| movq %rsi, %r10; |
| |
| GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); |
| |
| movq %rdx, RIO; |
| read_block(); |
| |
| get_round_km(15, RX0d); |
| get_round_kr_dec(15); |
| round_dec_1(15, F1, F3); |
| round_dec_1(13, F2, F1); |
| round_dec_1(11, F3, F2); |
| round_dec_2(9, F1, F3); |
| get_round_kr_dec(7); |
| round_dec_1(7, F2, F1); |
| round_dec_1(5, F3, F2); |
| round_dec_1(3, F1, F3); |
| round_dec_last(1, F2, F1); |
| |
| movq %r10, RIO; |
| write_block(); |
| |
| popq %rbx; |
| popq %rbp; |
| ret; |
| .size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block; |
| |
| /********************************************************************** |
| 4-way cast5, four blocks in parallel |
| **********************************************************************/ |
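| /* Running four independent blocks through each round hides the |
|  * latency of the dependent S-box loads: while one block's lookups |
|  * are in flight, ALU work for the others proceeds.  Only three |
|  * scratch registers (RX0..RX2) are free, so F4 interleaves the |
|  * first three blocks and staggers the fourth through RX0 |
|  * afterwards. */ |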
| #define F_tail(rlr, rx, op1, op2, op3) \ |
| movzbl rx ## bh, RT0d; \ |
| movzbl rx ## bl, RT1d; \ |
| roll $16, rx ## d; \ |
| movl s1(RTAB,RT0,4), RT0d; \ |
| op1 ## l s2(RTAB,RT1,4), RT0d; \ |
| movzbl rx ## bh, RT1d; \ |
| movzbl rx ## bl, rx ## d; \ |
| op2 ## l s3(RTAB,RT1,4), RT0d; \ |
| op3 ## l s4(RTAB,rx,4), RT0d; \ |
| xorq RT0, rlr; |
| |
| #define F4(km, load_next_kr, op0, op1, op2, op3) \ |
| movl km, RX0d; \ |
| op0 ## l RLR0d, RX0d; \ |
| roll RKRbl, RX0d; \ |
| rorq $32, RLR0; \ |
| \ |
| movl km, RX1d; \ |
| op0 ## l RLR1d, RX1d; \ |
| roll RKRbl, RX1d; \ |
| rorq $32, RLR1; \ |
| \ |
| movl km, RX2d; \ |
| op0 ## l RLR2d, RX2d; \ |
| roll RKRbl, RX2d; \ |
| rorq $32, RLR2; \ |
| \ |
| F_tail(RLR0, RX0, op1, op2, op3); \ |
| F_tail(RLR1, RX1, op1, op2, op3); \ |
| F_tail(RLR2, RX2, op1, op2, op3); \ |
| \ |
| movl km, RX0d; \ |
| op0 ## l RLR3d, RX0d; \ |
| roll RKRbl, RX0d; \ |
| load_next_kr(); \ |
| rorq $32, RLR3; \ |
| \ |
| F_tail(RLR3, RX0, op1, op2, op3); |
| |
| #define F4_1(km, load_next_kr) \ |
| F4(km, load_next_kr, add, xor, sub, add) |
| #define F4_2(km, load_next_kr) \ |
| F4(km, load_next_kr, xor, sub, add, xor) |
| #define F4_3(km, load_next_kr) \ |
| F4(km, load_next_kr, sub, add, xor, sub) |
| |
| #define round_enc4(n, FA, FB, fn1, fn2) \ |
| get_round_km(n + 1, RKM1d); \ |
| FA(RKM0d, fn1); \ |
| get_round_km(n + 2, RKM0d); \ |
| FB(RKM1d, fn2); |
| |
| #define round_enc_last4(n, FXA, FXB) \ |
| get_round_km(n + 1, RKM1d); \ |
| FXA(RKM0d, shr_kr); \ |
| FXB(RKM1d, dummy); |
| |
| #define round_enc4_1(n, FA, FB) \ |
| round_enc4(n, FA, FB, shr_kr, shr_kr); |
| |
| #define round_enc4_2(n, FA, FB) \ |
| round_enc4(n, FA, FB, shr_kr, dummy); |
| |
| #define round_dec4(n, FA, FB, fn1, fn2) \ |
| get_round_km(n - 1, RKM1d); \ |
| FA(RKM0d, fn1); \ |
| get_round_km(n - 2, RKM0d); \ |
| FB(RKM1d, fn2); |
| |
| #define round_dec_last4(n, FXA, FXB) \ |
| get_round_km(n - 1, RKM1d); \ |
| FXA(RKM0d, shr_kr); \ |
| FXB(RKM1d, dummy); |
| |
| #define round_dec4_1(n, FA, FB) \ |
| round_dec4(n, FA, FB, shr_kr, shr_kr); |
| |
| #define round_dec4_2(n, FA, FB) \ |
| round_dec4(n, FA, FB, shr_kr, dummy); |
| |
| #define inbswap_block4(a, b, c, d) \ |
| bswapq a; \ |
| bswapq b; \ |
| bswapq c; \ |
| bswapq d; |
| |
| #define outbswap_block4(a, b, c, d) \ |
| bswapq a; \ |
| bswapq b; \ |
| bswapq c; \ |
| bswapq d; \ |
| rorq $32, a; \ |
| rorq $32, b; \ |
| rorq $32, c; \ |
| rorq $32, d; |
| |
| .align 8 |
| .type __cast5_enc_blk4,@function; |
| |
| __cast5_enc_blk4: |
| /* input: |
| * %rdi: ctx, CTX |
| * RLR0,RLR1,RLR2,RLR3: four input plaintext blocks |
| * output: |
| * RLR0,RLR1,RLR2,RLR3: four output ciphertext blocks |
| */ |
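| /* Callers pass the four blocks already byte-swapped to native order |
|  * (CTR feeds raw counter values, CFB swaps the ciphertext first), |
|  * which is why only the output is swapped here. */ |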
| GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); |
| |
| get_round_km(0, RKM0d); |
| get_round_kr_enc(0); |
| round_enc4_1(0, F4_1, F4_2); |
| round_enc4_1(2, F4_3, F4_1); |
| round_enc4_1(4, F4_2, F4_3); |
| round_enc4_2(6, F4_1, F4_2); |
| get_round_kr_enc(8); |
| round_enc4_1(8, F4_3, F4_1); |
| round_enc4_1(10, F4_2, F4_3); |
| round_enc4_1(12, F4_1, F4_2); |
| round_enc_last4(14, F4_3, F4_1); |
| |
| outbswap_block4(RLR0, RLR1, RLR2, RLR3); |
| ret; |
| .size __cast5_enc_blk4,.-__cast5_enc_blk4; |
| |
| .align 8 |
| .type __cast5_dec_blk4,@function; |
| |
| __cast5_dec_blk4: |
| /* input: |
| * %rdi: ctx, CTX |
| * RLR0,RLR1,RLR2,RLR3: four input ciphertext blocks |
| * output: |
| * RLR0,RLR1,RLR2,RLR3: four output plaintext blocks |
| */ |
| GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); |
| |
| inbswap_block4(RLR0, RLR1, RLR2, RLR3); |
| |
| get_round_km(15, RKM0d); |
| get_round_kr_dec(15); |
| round_dec4_1(15, F4_1, F4_3); |
| round_dec4_1(13, F4_2, F4_1); |
| round_dec4_1(11, F4_3, F4_2); |
| round_dec4_2(9, F4_1, F4_3); |
| get_round_kr_dec(7); |
| round_dec4_1(7, F4_2, F4_1); |
| round_dec4_1(5, F4_3, F4_2); |
| round_dec4_1(3, F4_1, F4_3); |
| round_dec_last4(1, F4_2, F4_1); |
| |
| outbswap_block4(RLR0, RLR1, RLR2, RLR3); |
| ret; |
| .size __cast5_dec_blk4,.-__cast5_dec_blk4; |
| |
| .align 8 |
| .globl _gcry_cast5_amd64_ctr_enc |
| .type _gcry_cast5_amd64_ctr_enc,@function; |
| _gcry_cast5_amd64_ctr_enc: |
| /* input: |
| * %rdi: ctx, CTX |
| * %rsi: dst (4 blocks) |
| * %rdx: src (4 blocks) |
| * %rcx: iv (big endian, 64bit) |
| */ |
| |
| pushq %rbp; |
| pushq %rbx; |
| pushq %r12; |
| pushq %r13; |
| pushq %r14; |
| |
| pushq %rsi; |
| pushq %rdx; |
| |
| /* load IV and byteswap */ |
| movq (%rcx), RX0; |
| bswapq RX0; |
| movq RX0, RLR0; |
| |
| /* construct IVs */ |
| leaq 1(RX0), RLR1; |
| leaq 2(RX0), RLR2; |
| leaq 3(RX0), RLR3; |
| leaq 4(RX0), RX0; |
| bswapq RX0; |
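| /* The leaqs derive IV+1..IV+3 for the remaining blocks and IV+4 for |
|  * the stored-back counter, all without touching flags; wraparound is |
|  * mod 2^64, matching the 64-bit counter width. */ |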
| |
| /* store new IV */ |
| movq RX0, (%rcx); |
| |
| call __cast5_enc_blk4; |
| |
| popq %r14; /*src*/ |
| popq %r13; /*dst*/ |
| |
| /* XOR key-stream with plaintext */ |
| xorq 0 * 8(%r14), RLR0; |
| xorq 1 * 8(%r14), RLR1; |
| xorq 2 * 8(%r14), RLR2; |
| xorq 3 * 8(%r14), RLR3; |
| movq RLR0, 0 * 8(%r13); |
| movq RLR1, 1 * 8(%r13); |
| movq RLR2, 2 * 8(%r13); |
| movq RLR3, 3 * 8(%r13); |
| |
| popq %r14; |
| popq %r13; |
| popq %r12; |
| popq %rbx; |
| popq %rbp; |
| ret; |
| .size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc; |
| |
| .align 8 |
| .globl _gcry_cast5_amd64_cbc_dec |
| .type _gcry_cast5_amd64_cbc_dec,@function; |
| _gcry_cast5_amd64_cbc_dec: |
| /* input: |
| * %rdi: ctx, CTX |
| * %rsi: dst (4 blocks) |
| * %rdx: src (4 blocks) |
| * %rcx: iv (64bit) |
| */ |
| |
| pushq %rbp; |
| pushq %rbx; |
| pushq %r12; |
| pushq %r13; |
| pushq %r14; |
| |
| pushq %rcx; |
| pushq %rsi; |
| pushq %rdx; |
| |
| /* load input */ |
| movq 0 * 8(%rdx), RLR0; |
| movq 1 * 8(%rdx), RLR1; |
| movq 2 * 8(%rdx), RLR2; |
| movq 3 * 8(%rdx), RLR3; |
| |
| call __cast5_dec_blk4; |
| |
| popq RX0; /*src*/ |
| popq RX1; /*dst*/ |
| popq RX2; /*iv*/ |
| |
| movq 3 * 8(RX0), %r14; |
| xorq (RX2), RLR0; |
| xorq 0 * 8(RX0), RLR1; |
| xorq 1 * 8(RX0), RLR2; |
| xorq 2 * 8(RX0), RLR3; |
| movq %r14, (RX2); /* store new IV */ |
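| /* Each plaintext is D(C_i) ^ C_{i-1} with C_{-1} = IV; the last |
|  * ciphertext quad was saved to %r14 and all of src is read before |
|  * dst is written, so dst == src (in-place) is safe. */ |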
| |
| movq RLR0, 0 * 8(RX1); |
| movq RLR1, 1 * 8(RX1); |
| movq RLR2, 2 * 8(RX1); |
| movq RLR3, 3 * 8(RX1); |
| |
| popq %r14; |
| popq %r13; |
| popq %r12; |
| popq %rbx; |
| popq %rbp; |
| ret; |
| |
| .size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec; |
| |
| .align 8 |
| .globl _gcry_cast5_amd64_cfb_dec |
| .type _gcry_cast5_amd64_cfb_dec,@function; |
| _gcry_cast5_amd64_cfb_dec: |
| /* input: |
| * %rdi: ctx, CTX |
| * %rsi: dst (4 blocks) |
| * %rdx: src (4 blocks) |
| * %rcx: iv (64bit) |
| */ |
| |
| pushq %rbp; |
| pushq %rbx; |
| pushq %r12; |
| pushq %r13; |
| pushq %r14; |
| |
| pushq %rsi; |
| pushq %rdx; |
| |
| /* Load input */ |
| movq (%rcx), RLR0; |
| movq 0 * 8(%rdx), RLR1; |
| movq 1 * 8(%rdx), RLR2; |
| movq 2 * 8(%rdx), RLR3; |
| |
| inbswap_block4(RLR0, RLR1, RLR2, RLR3); |
| |
| /* Update IV */ |
| movq 3 * 8(%rdx), %rdx; |
| movq %rdx, (%rcx); |
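| /* CFB decryption encrypts [IV, C0, C1, C2] to get the keystream; C3 |
|  * becomes the next IV before %rdx is reused, and src is fully read |
|  * before dst is written, so in-place operation is safe. */ |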
| |
| call __cast5_enc_blk4; |
| |
| popq %rdx; /*src*/ |
| popq %rcx; /*dst*/ |
| |
| xorq 0 * 8(%rdx), RLR0; |
| xorq 1 * 8(%rdx), RLR1; |
| xorq 2 * 8(%rdx), RLR2; |
| xorq 3 * 8(%rdx), RLR3; |
| movq RLR0, 0 * 8(%rcx); |
| movq RLR1, 1 * 8(%rcx); |
| movq RLR2, 2 * 8(%rcx); |
| movq RLR3, 3 * 8(%rcx); |
| |
| popq %r14; |
| popq %r13; |
| popq %r12; |
| popq %rbx; |
| popq %rbp; |
| ret; |
| |
| .size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec; |
| |
| #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && defined(USE_CAST5)*/ |
| #endif /*__x86_64*/ |