| /* cast5-amd64.S - AMD64 assembly implementation of CAST5 cipher |
| * |
| * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
| * |
| * This file is part of Libgcrypt. |
| * |
| * Libgcrypt is free software; you can redistribute it and/or modify |
| * it under the terms of the GNU Lesser General Public License as |
| * published by the Free Software Foundation; either version 2.1 of |
| * the License, or (at your option) any later version. |
| * |
| * Libgcrypt is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| * GNU Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public |
| * License along with this program; if not, see <http://www.gnu.org/licenses/>. |
| */ |
| |
| #ifdef __x86_64 |
| #include <config.h> |
| #if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && defined(USE_CAST5) |
| |
| #ifdef __PIC__ |
| # define RIP %rip |
| # define GET_EXTERN_POINTER(name, reg) movq name@GOTPCREL(%rip), reg |
| #else |
| # define RIP |
| # define GET_EXTERN_POINTER(name, reg) leaq name, reg |
| #endif |
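| /* Note: under PIC the absolute address of _gcry_cast5_s1to4 is not |
|  * link-time constant, so it is fetched through the GOT; otherwise a |
|  * plain leaq suffices.  Either way RTAB ends up holding the table |
|  * base used by the sN() offsets below. */ |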
| |
| .text |
| |
| .extern _gcry_cast5_s1to4; |
| |
| #define s1 0 |
| #define s2 (s1 + (4 * 256)) |
| #define s3 (s2 + (4 * 256)) |
| #define s4 (s3 + (4 * 256)) |
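| /* _gcry_cast5_s1to4 packs the four 256-entry 32-bit S-boxes back to |
|  * back, so sN(RTAB, idx, 4) addresses entry idx of box N from the |
|  * common base register. */ |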
| |
| /* structure of CAST5_context: */ |
| #define Km 0 |
| #define Kr (Km + (16 * 4)) |
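| /* This mirrors the C-side CAST5_context: 16 32-bit masking keys Km |
|  * at offset 0, then the 16 rotation keys Kr stored one byte each. |
|  * Keeping Kr byte-packed lets a single 8-byte load (see |
|  * get_round_kr_*) fetch the rotation amounts for eight rounds at |
|  * once. */ |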
| |
| /* register macros */ |
| #define CTX %rdi |
| #define RIO %rsi |
| #define RTAB %r8 |
| |
| #define RLR0 %r9 |
| #define RLR1 %r10 |
| #define RLR2 %r11 |
| #define RLR3 %r12 |
| |
| #define RLR0d %r9d |
| #define RLR1d %r10d |
| #define RLR2d %r11d |
| #define RLR3d %r12d |
| |
| #define RX0 %rax |
| #define RX1 %rbx |
| #define RX2 %rdx |
| |
| #define RX0d %eax |
| #define RX1d %ebx |
| #define RX2d %edx |
| |
| #define RX0bl %al |
| #define RX1bl %bl |
| #define RX2bl %dl |
| |
| #define RX0bh %ah |
| #define RX1bh %bh |
| #define RX2bh %dh |
| |
| #define RKR %rcx |
| #define RKRd %ecx |
| #define RKRbl %cl |
| |
| #define RT0 %rbp |
| #define RT1 %rsi |
| |
| #define RT0d %ebp |
| #define RT1d %esi |
| |
| #define RKM0d %r13d |
| #define RKM1d %r14d |
| |
| /*********************************************************************** |
| * 1-way cast5 |
| ***********************************************************************/ |
| #define dummy(x) |
| |
| #define shr_kr(none) \ |
| shrq $8, RKR; |
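| /* shr_kr discards the spent rotation-key byte so the next round's |
|  * Kr value lands in RKRbl. */ |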
| |
| #define F(km, load_next_kr, op0, op1, op2, op3) \ |
| op0 ## l RLR0d, km ## d; \ |
| roll RKRbl, km ## d; \ |
| rorq $32, RLR0; \ |
| movzbl km ## bh, RT0d; \ |
| movzbl km ## bl, RT1d; \ |
| roll $16, km ## d; \ |
| movl s1(RTAB,RT0,4), RT0d; \ |
| op1 ## l s2(RTAB,RT1,4), RT0d; \ |
| load_next_kr(kr_next); \ |
| movzbl km ## bh, RT1d; \ |
| movzbl km ## bl, km ## d; \ |
| op2 ## l s3(RTAB,RT1,4), RT0d; \ |
| op3 ## l s4(RTAB,km,4), RT0d; \ |
| xorq RT0, RLR0; |
| |
| #define F1(km, load_next_kr) \ |
| F(km, load_next_kr, add, xor, sub, add) |
| #define F2(km, load_next_kr) \ |
| F(km, load_next_kr, xor, sub, add, xor) |
| #define F3(km, load_next_kr) \ |
| F(km, load_next_kr, sub, add, xor, sub) |
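| /* One CAST5 round (RFC 2144, section 2.2).  In C-like pseudocode, |
|  * with D the incoming 32-bit half and op0..op3 the per-type ops: |
|  * |
|  *   I = rol32(Km op0 D, Kr); |
|  *   f = ((s1[I >> 24] op1 s2[(I >> 16) & 0xff]) |
|  *        op2 s3[(I >> 8) & 0xff]) op3 s4[I & 0xff]; |
|  *   other_half ^= f; |
|  * |
|  * F1/F2/F3 are the type 1/2/3 rounds.  Kr is pre-biased by 16 (see |
|  * get_round_kr_*), so after the key-dependent roll the two high |
|  * bytes of I already sit in the addressable bh/bl byte registers, |
|  * and one roll $16 exposes the low two.  The rorq $32 swaps the |
|  * Feistel halves packed in RLR0 so the final xorq lands on the |
|  * other half. */ |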
| |
| #define get_round_km(n, km) \ |
| movl Km+4*(n)(CTX), km; |
| |
| #define get_round_kr_enc(n) \ |
| movq $0x1010101010101010, RKR; \ |
| \ |
| /* merge roll rk and roll $16 (see note below) */ \ |
| xorq Kr+(n)(CTX), RKR; |
| |
| #define get_round_kr_dec(n) \ |
| movq $0x1010101010101010, RKR; \ |
| \ |
| /* merge roll rk and roll $16 (see note below) */ \ |
| xorq Kr+(n - 7)(CTX), RKR; \ |
| bswapq RKR; |
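| /* The 0x10 XOR biases every Kr byte by 16: rotation counts are taken |
|  * mod 32 and (Kr ^ 16) == (Kr + 16) mod 32 for 5-bit Kr, which folds |
|  * the fixed roll $16 that F would otherwise need into the |
|  * key-dependent roll.  For decryption the bswapq reverses the eight |
|  * loaded bytes so shr_kr consumes them in descending round order, |
|  * and the (n - 7) offset rebases n = 15 to Kr+8 and n = 7 to Kr+0. */ |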
| |
| #define round_enc(n, FA, FB, fn1, fn2) \ |
| get_round_km(n + 1, RX2d); \ |
| FA(RX0, fn1); \ |
| get_round_km(n + 2, RX0d); \ |
| FB(RX2, fn2); |
| |
| #define round_enc_last(n, FXA, FXB) \ |
| get_round_km(n + 1, RX2d); \ |
| \ |
| FXA(RX0, shr_kr); \ |
| FXB(RX2, dummy); |
| |
| #define round_enc_1(n, FA, FB) \ |
| round_enc(n, FA, FB, shr_kr, shr_kr) |
| |
| #define round_enc_2(n, FA, FB) \ |
| round_enc(n, FA, FB, shr_kr, dummy) |
| |
| #define round_dec(n, FA, FB, fn1, fn2) \ |
| get_round_km(n - 1, RX2d); \ |
| FA(RX0, fn1); \ |
| get_round_km(n - 2, RX0d); \ |
| FB(RX2, fn2); |
| |
| #define round_dec_last(n, FXA, FXB) \ |
| get_round_km(n - 1, RX2d); \ |
| FXA(RX0, shr_kr); \ |
| FXB(RX2, dummy); |
| |
| #define round_dec_1(n, FA, FB) \ |
| round_dec(n, FA, FB, shr_kr, shr_kr) |
| |
| #define round_dec_2(n, FA, FB) \ |
| round_dec(n, FA, FB, shr_kr, dummy) |
| |
| #define read_block() \ |
| movq (RIO), RLR0; \ |
| bswapq RLR0; |
| |
| #define write_block() \ |
| bswapq RLR0; \ |
| rorq $32, RLR0; \ |
| movq RLR0, (RIO); |
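| /* Blocks are big-endian: the bswapq after the 64-bit load leaves the |
|  * first block half in the high dword of RLR0 and the second in the |
|  * low dword.  write_block also exchanges the dwords, since the |
|  * halves come out of the last round in swapped order. */ |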
| |
| .align 8 |
| .globl _gcry_cast5_amd64_encrypt_block |
| .type _gcry_cast5_amd64_encrypt_block,@function; |
| |
| _gcry_cast5_amd64_encrypt_block: |
| /* input: |
| * %rdi: ctx, CTX |
| * %rsi: dst |
| * %rdx: src |
| */ |
| pushq %rbp; |
| pushq %rbx; |
| |
| movq %rsi, %r10; |
| |
| GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); |
| |
| movq %rdx, RIO; |
| read_block(); |
| |
| get_round_km(0, RX0d); |
| get_round_kr_enc(0); |
| round_enc_1(0, F1, F2); |
| round_enc_1(2, F3, F1); |
| round_enc_1(4, F2, F3); |
| round_enc_2(6, F1, F2); |
| get_round_kr_enc(8); |
| round_enc_1(8, F3, F1); |
| round_enc_1(10, F2, F3); |
| round_enc_1(12, F1, F2); |
| round_enc_last(14, F3, F1); |
| |
| movq %r10, RIO; |
| write_block(); |
| |
| popq %rbx; |
| popq %rbp; |
| ret; |
| .size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block; |
| |
| .align 8 |
| .globl _gcry_cast5_amd64_decrypt_block |
| .type _gcry_cast5_amd64_decrypt_block,@function; |
| |
| _gcry_cast5_amd64_decrypt_block: |
| /* input: |
| * %rdi: ctx, CTX |
| * %rsi: dst |
| * %rdx: src |
| */ |
| pushq %rbp; |
| pushq %rbx; |
| |
| movq %rsi, %r10; |
| |
| GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); |
| |
| movq %rdx, RIO; |
| read_block(); |
| |
| get_round_km(15, RX0d); |
| get_round_kr_dec(15); |
| round_dec_1(15, F1, F3); |
| round_dec_1(13, F2, F1); |
| round_dec_1(11, F3, F2); |
| round_dec_2(9, F1, F3); |
| get_round_kr_dec(7); |
| round_dec_1(7, F2, F1); |
| round_dec_1(5, F3, F2); |
| round_dec_1(3, F1, F3); |
| round_dec_last(1, F2, F1); |
| |
| movq %r10, RIO; |
| write_block(); |
| |
| popq %rbx; |
| popq %rbp; |
| ret; |
| .size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block; |
| |
| /********************************************************************** |
| 4-way cast5, four blocks in parallel |
| **********************************************************************/ |
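| /* Running four independent blocks through each round hides the |
|  * latency of the dependent S-box loads: while one block's lookups |
|  * are in flight, ALU work for the others proceeds.  Only three |
|  * scratch registers (RX0..RX2) are free, so F4 interleaves the |
|  * first three blocks and staggers the fourth through RX0 |
|  * afterwards. */ |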
| #define F_tail(rlr, rx, op1, op2, op3) \ |
| movzbl rx ## bh, RT0d; \ |
| movzbl rx ## bl, RT1d; \ |
| roll $16, rx ## d; \ |
| movl s1(RTAB,RT0,4), RT0d; \ |
| op1 ## l s2(RTAB,RT1,4), RT0d; \ |
| movzbl rx ## bh, RT1d; \ |
| movzbl rx ## bl, rx ## d; \ |
| op2 ## l s3(RTAB,RT1,4), RT0d; \ |
| op3 ## l s4(RTAB,rx,4), RT0d; \ |
| xorq RT0, rlr; |
| |
| #define F4(km, load_next_kr, op0, op1, op2, op3) \ |
| movl km, RX0d; \ |
| op0 ## l RLR0d, RX0d; \ |
| roll RKRbl, RX0d; \ |
| rorq $32, RLR0; \ |
| \ |
| movl km, RX1d; \ |
| op0 ## l RLR1d, RX1d; \ |
| roll RKRbl, RX1d; \ |
| rorq $32, RLR1; \ |
| \ |
| movl km, RX2d; \ |
| op0 ## l RLR2d, RX2d; \ |
| roll RKRbl, RX2d; \ |
| rorq $32, RLR2; \ |
| \ |
| F_tail(RLR0, RX0, op1, op2, op3); \ |
| F_tail(RLR1, RX1, op1, op2, op3); \ |
| F_tail(RLR2, RX2, op1, op2, op3); \ |
| \ |
| movl km, RX0d; \ |
| op0 ## l RLR3d, RX0d; \ |
| roll RKRbl, RX0d; \ |
| load_next_kr(); \ |
| rorq $32, RLR3; \ |
| \ |
| F_tail(RLR3, RX0, op1, op2, op3); |
| |
| #define F4_1(km, load_next_kr) \ |
| F4(km, load_next_kr, add, xor, sub, add) |
| #define F4_2(km, load_next_kr) \ |
| F4(km, load_next_kr, xor, sub, add, xor) |
| #define F4_3(km, load_next_kr) \ |
| F4(km, load_next_kr, sub, add, xor, sub) |
| |
| #define round_enc4(n, FA, FB, fn1, fn2) \ |
| get_round_km(n + 1, RKM1d); \ |
| FA(RKM0d, fn1); \ |
| get_round_km(n + 2, RKM0d); \ |
| FB(RKM1d, fn2); |
| |
| #define round_enc_last4(n, FXA, FXB) \ |
| get_round_km(n + 1, RKM1d); \ |
| FXA(RKM0d, shr_kr); \ |
| FXB(RKM1d, dummy); |
| |
| #define round_enc4_1(n, FA, FB) \ |
| round_enc4(n, FA, FB, shr_kr, shr_kr); |
| |
| #define round_enc4_2(n, FA, FB) \ |
| round_enc4(n, FA, FB, shr_kr, dummy); |
| |
| #define round_dec4(n, FA, FB, fn1, fn2) \ |
| get_round_km(n - 1, RKM1d); \ |
| FA(RKM0d, fn1); \ |
| get_round_km(n - 2, RKM0d); \ |
| FB(RKM1d, fn2); |
| |
| #define round_dec_last4(n, FXA, FXB) \ |
| get_round_km(n - 1, RKM1d); \ |
| FXA(RKM0d, shr_kr); \ |
| FXB(RKM1d, dummy); |
| |
| #define round_dec4_1(n, FA, FB) \ |
| round_dec4(n, FA, FB, shr_kr, shr_kr); |
| |
| #define round_dec4_2(n, FA, FB) \ |
| round_dec4(n, FA, FB, shr_kr, dummy); |
| |
| #define inbswap_block4(a, b, c, d) \ |
| bswapq a; \ |
| bswapq b; \ |
| bswapq c; \ |
| bswapq d; |
| |
| #define outbswap_block4(a, b, c, d) \ |
| bswapq a; \ |
| bswapq b; \ |
| bswapq c; \ |
| bswapq d; \ |
| rorq $32, a; \ |
| rorq $32, b; \ |
| rorq $32, c; \ |
| rorq $32, d; |
| |
| .align 8 |
| .type __cast5_enc_blk4,@function; |
| |
| __cast5_enc_blk4: |
| /* input: |
| * %rdi: ctx, CTX |
| * RLR0,RLR1,RLR2,RLR3: four input plaintext blocks |
| * output: |
| * RLR0,RLR1,RLR2,RLR3: four output ciphertext blocks |
| */ |
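| /* Callers pass the four blocks already byte-swapped to native order |
|  * (CTR feeds raw counter values, CFB swaps the ciphertext first), |
|  * which is why only the output is swapped here. */ |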
| GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); |
| |
| get_round_km(0, RKM0d); |
| get_round_kr_enc(0); |
| round_enc4_1(0, F4_1, F4_2); |
| round_enc4_1(2, F4_3, F4_1); |
| round_enc4_1(4, F4_2, F4_3); |
| round_enc4_2(6, F4_1, F4_2); |
| get_round_kr_enc(8); |
| round_enc4_1(8, F4_3, F4_1); |
| round_enc4_1(10, F4_2, F4_3); |
| round_enc4_1(12, F4_1, F4_2); |
| round_enc_last4(14, F4_3, F4_1); |
| |
| outbswap_block4(RLR0, RLR1, RLR2, RLR3); |
| ret; |
| .size __cast5_enc_blk4,.-__cast5_enc_blk4; |
| |
| .align 8 |
| .type __cast5_dec_blk4,@function; |
| |
| __cast5_dec_blk4: |
| /* input: |
| * %rdi: ctx, CTX |
| * RLR0,RLR1,RLR2,RLR3: four input ciphertext blocks |
| * output: |
| * RLR0,RLR1,RLR2,RLR3: four output plaintext blocks |
| */ |
| GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); |
| |
| inbswap_block4(RLR0, RLR1, RLR2, RLR3); |
| |
| get_round_km(15, RKM0d); |
| get_round_kr_dec(15); |
| round_dec4_1(15, F4_1, F4_3); |
| round_dec4_1(13, F4_2, F4_1); |
| round_dec4_1(11, F4_3, F4_2); |
| round_dec4_2(9, F4_1, F4_3); |
| get_round_kr_dec(7); |
| round_dec4_1(7, F4_2, F4_1); |
| round_dec4_1(5, F4_3, F4_2); |
| round_dec4_1(3, F4_1, F4_3); |
| round_dec_last4(1, F4_2, F4_1); |
| |
| outbswap_block4(RLR0, RLR1, RLR2, RLR3); |
| ret; |
| .size __cast5_dec_blk4,.-__cast5_dec_blk4; |
| |
| .align 8 |
| .globl _gcry_cast5_amd64_ctr_enc |
| .type _gcry_cast5_amd64_ctr_enc,@function; |
| _gcry_cast5_amd64_ctr_enc: |
| /* input: |
| * %rdi: ctx, CTX |
| * %rsi: dst (4 blocks) |
| * %rdx: src (4 blocks) |
| * %rcx: iv (big endian, 64bit) |
| */ |
| |
| pushq %rbp; |
| pushq %rbx; |
| pushq %r12; |
| pushq %r13; |
| pushq %r14; |
| |
| pushq %rsi; |
| pushq %rdx; |
| |
| /* load IV and byteswap */ |
| movq (%rcx), RX0; |
| bswapq RX0; |
| movq RX0, RLR0; |
| |
| /* construct IVs */ |
| leaq 1(RX0), RLR1; |
| leaq 2(RX0), RLR2; |
| leaq 3(RX0), RLR3; |
| leaq 4(RX0), RX0; |
| bswapq RX0; |
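| /* The leaqs derive IV+1..IV+3 for the remaining blocks and IV+4 for |
|  * the stored-back counter, all without touching flags; wraparound is |
|  * mod 2^64, matching the 64-bit counter width. */ |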
| |
| /* store new IV */ |
| movq RX0, (%rcx); |
| |
| call __cast5_enc_blk4; |
| |
| popq %r14; /*src*/ |
| popq %r13; /*dst*/ |
| |
| /* XOR key-stream with plaintext */ |
| xorq 0 * 8(%r14), RLR0; |
| xorq 1 * 8(%r14), RLR1; |
| xorq 2 * 8(%r14), RLR2; |
| xorq 3 * 8(%r14), RLR3; |
| movq RLR0, 0 * 8(%r13); |
| movq RLR1, 1 * 8(%r13); |
| movq RLR2, 2 * 8(%r13); |
| movq RLR3, 3 * 8(%r13); |
| |
| popq %r14; |
| popq %r13; |
| popq %r12; |
| popq %rbx; |
| popq %rbp; |
| ret; |
| .size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc; |
| |
| .align 8 |
| .globl _gcry_cast5_amd64_cbc_dec |
| .type _gcry_cast5_amd64_cbc_dec,@function; |
| _gcry_cast5_amd64_cbc_dec: |
| /* input: |
| * %rdi: ctx, CTX |
| * %rsi: dst (4 blocks) |
| * %rdx: src (4 blocks) |
| * %rcx: iv (64bit) |
| */ |
| |
| pushq %rbp; |
| pushq %rbx; |
| pushq %r12; |
| pushq %r13; |
| pushq %r14; |
| |
| pushq %rcx; |
| pushq %rsi; |
| pushq %rdx; |
| |
| /* load input */ |
| movq 0 * 8(%rdx), RLR0; |
| movq 1 * 8(%rdx), RLR1; |
| movq 2 * 8(%rdx), RLR2; |
| movq 3 * 8(%rdx), RLR3; |
| |
| call __cast5_dec_blk4; |
| |
| popq RX0; /*src*/ |
| popq RX1; /*dst*/ |
| popq RX2; /*iv*/ |
| |
| movq 3 * 8(RX0), %r14; |
| xorq (RX2), RLR0; |
| xorq 0 * 8(RX0), RLR1; |
| xorq 1 * 8(RX0), RLR2; |
| xorq 2 * 8(RX0), RLR3; |
| movq %r14, (RX2); /* store new IV */ |
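| /* Each plaintext is D(C_i) ^ C_{i-1} with C_{-1} = IV; the last |
|  * ciphertext quad was saved to %r14 and all of src is read before |
|  * dst is written, so dst == src (in-place) is safe. */ |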
| |
| movq RLR0, 0 * 8(RX1); |
| movq RLR1, 1 * 8(RX1); |
| movq RLR2, 2 * 8(RX1); |
| movq RLR3, 3 * 8(RX1); |
| |
| popq %r14; |
| popq %r13; |
| popq %r12; |
| popq %rbx; |
| popq %rbp; |
| ret; |
| |
| .size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec; |
| |
| .align 8 |
| .globl _gcry_cast5_amd64_cfb_dec |
| .type _gcry_cast5_amd64_cfb_dec,@function; |
| _gcry_cast5_amd64_cfb_dec: |
| /* input: |
| * %rdi: ctx, CTX |
| * %rsi: dst (4 blocks) |
| * %rdx: src (4 blocks) |
| * %rcx: iv (64bit) |
| */ |
| |
| pushq %rbp; |
| pushq %rbx; |
| pushq %r12; |
| pushq %r13; |
| pushq %r14; |
| |
| pushq %rsi; |
| pushq %rdx; |
| |
| /* Load input */ |
| movq (%rcx), RLR0; |
| movq 0 * 8(%rdx), RLR1; |
| movq 1 * 8(%rdx), RLR2; |
| movq 2 * 8(%rdx), RLR3; |
| |
| inbswap_block4(RLR0, RLR1, RLR2, RLR3); |
| |
| /* Update IV */ |
| movq 3 * 8(%rdx), %rdx; |
| movq %rdx, (%rcx); |
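| /* CFB decryption encrypts [IV, C0, C1, C2] to get the keystream; C3 |
|  * becomes the next IV before %rdx is reused, and src is fully read |
|  * before dst is written, so in-place operation is safe. */ |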
| |
| call __cast5_enc_blk4; |
| |
| popq %rdx; /*src*/ |
| popq %rcx; /*dst*/ |
| |
| xorq 0 * 8(%rdx), RLR0; |
| xorq 1 * 8(%rdx), RLR1; |
| xorq 2 * 8(%rdx), RLR2; |
| xorq 3 * 8(%rdx), RLR3; |
| movq RLR0, 0 * 8(%rcx); |
| movq RLR1, 1 * 8(%rcx); |
| movq RLR2, 2 * 8(%rcx); |
| movq RLR3, 3 * 8(%rcx); |
| |
| popq %r14; |
| popq %r13; |
| popq %r12; |
| popq %rbx; |
| popq %rbp; |
| ret; |
| |
| .size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec; |
| |
| #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && defined(USE_CAST5)*/ |
| #endif /*__x86_64*/ |