Diffstat (limited to 'arch/x86/crypto')
 arch/x86/crypto/Makefile                    |  1
 arch/x86/crypto/blowfish-x86_64-asm_64.S    | 48
 arch/x86/crypto/camellia-x86_64-asm_64.S    | 26
 arch/x86/crypto/cast5-avx-x86_64-asm_64.S   | 47
 arch/x86/crypto/cast6-avx-x86_64-asm_64.S   | 50
 arch/x86/crypto/chacha20-avx2-x86_64.S      |  4
 arch/x86/crypto/chacha20-ssse3-x86_64.S     |  4
 arch/x86/crypto/des3_ede-asm_64.S           | 15
 arch/x86/crypto/salsa20-x86_64-asm_64.S     |  1
 arch/x86/crypto/sha1-mb/Makefile            |  1
 arch/x86/crypto/sha1_avx2_x86_64_asm.S      |  4
 arch/x86/crypto/sha1_ssse3_asm.S            | 11
 arch/x86/crypto/sha256-avx-asm.S            | 15
 arch/x86/crypto/sha256-avx2-asm.S           | 22
 arch/x86/crypto/sha256-mb/Makefile          |  1
 arch/x86/crypto/sha256-ssse3-asm.S          | 15
 arch/x86/crypto/sha512-avx2-asm.S           | 75
 arch/x86/crypto/sha512-mb/Makefile          |  1
 arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 12
 19 files changed, 193 insertions(+), 160 deletions(-)
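
Note: the changes below fall into two groups. SPDX-License-Identifier lines are added to the crypto Makefiles and to salsa20-x86_64-asm_64.S, and the remaining hand-written assembly is reworked so that %rbp is no longer used as a scratch or context register (clobbering %rbp breaks frame-pointer based unwinding). The recurring pattern is to move whatever used to live in %rbp into another callee-saved register such as %r12, %r13 or %r15, or to spill it to the stack, and to save/restore that register around the routine. A minimal sketch of the pattern, using a hypothetical routine name rather than one taken from the patch:

	ENTRY(example_enc_blk)
		pushq	%r12		/* %r12 is callee-saved: preserve the caller's value */
		movq	%rdi, %r12	/* keep the ctx pointer in %r12 instead of %rbp */
		/* ... rounds address the key schedule as offset(%r12) ... */
		popq	%r12
		ret
	ENDPROC(example_enc_blk)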
| diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 9e32d40d71bd..5f07333bb224 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  #  # Arch-specific CryptoAPI modules.  # diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S index 246c67006ed0..8c1fcb6bad21 100644 --- a/arch/x86/crypto/blowfish-x86_64-asm_64.S +++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S @@ -33,7 +33,7 @@  #define s3	((16 + 2 + (3 * 256)) * 4)  /* register macros */ -#define CTX %rdi +#define CTX %r12  #define RIO %rsi  #define RX0 %rax @@ -56,12 +56,12 @@  #define RX2bh %ch  #define RX3bh %dh -#define RT0 %rbp +#define RT0 %rdi  #define RT1 %rsi  #define RT2 %r8  #define RT3 %r9 -#define RT0d %ebp +#define RT0d %edi  #define RT1d %esi  #define RT2d %r8d  #define RT3d %r9d @@ -120,13 +120,14 @@  ENTRY(__blowfish_enc_blk)  	/* input: -	 *	%rdi: ctx, CTX +	 *	%rdi: ctx  	 *	%rsi: dst  	 *	%rdx: src  	 *	%rcx: bool, if true: xor output  	 */ -	movq %rbp, %r11; +	movq %r12, %r11; +	movq %rdi, CTX;  	movq %rsi, %r10;  	movq %rdx, RIO; @@ -142,7 +143,7 @@ ENTRY(__blowfish_enc_blk)  	round_enc(14);  	add_roundkey_enc(16); -	movq %r11, %rbp; +	movq %r11, %r12;  	movq %r10, RIO;  	test %cl, %cl; @@ -157,12 +158,13 @@ ENDPROC(__blowfish_enc_blk)  ENTRY(blowfish_dec_blk)  	/* input: -	 *	%rdi: ctx, CTX +	 *	%rdi: ctx  	 *	%rsi: dst  	 *	%rdx: src  	 */ -	movq %rbp, %r11; +	movq %r12, %r11; +	movq %rdi, CTX;  	movq %rsi, %r10;  	movq %rdx, RIO; @@ -181,7 +183,7 @@ ENTRY(blowfish_dec_blk)  	movq %r10, RIO;  	write_block(); -	movq %r11, %rbp; +	movq %r11, %r12;  	ret;  ENDPROC(blowfish_dec_blk) @@ -298,20 +300,21 @@ ENDPROC(blowfish_dec_blk)  ENTRY(__blowfish_enc_blk_4way)  	/* input: -	 *	%rdi: ctx, CTX +	 *	%rdi: ctx  	 *	%rsi: dst  	 *	%rdx: src  	 *	%rcx: bool, if true: xor output  	 */ -	pushq %rbp; +	pushq %r12;  	pushq %rbx;  	pushq %rcx; -	preload_roundkey_enc(0); - +	movq %rdi, CTX  	movq %rsi, %r11;  	movq %rdx, RIO; +	preload_roundkey_enc(0); +  	read_block4();  	round_enc4(0); @@ -324,39 +327,40 @@ ENTRY(__blowfish_enc_blk_4way)  	round_enc4(14);  	add_preloaded_roundkey4(); -	popq %rbp; +	popq %r12;  	movq %r11, RIO; -	test %bpl, %bpl; +	test %r12b, %r12b;  	jnz .L__enc_xor4;  	write_block4();  	popq %rbx; -	popq %rbp; +	popq %r12;  	ret;  .L__enc_xor4:  	xor_block4();  	popq %rbx; -	popq %rbp; +	popq %r12;  	ret;  ENDPROC(__blowfish_enc_blk_4way)  ENTRY(blowfish_dec_blk_4way)  	/* input: -	 *	%rdi: ctx, CTX +	 *	%rdi: ctx  	 *	%rsi: dst  	 *	%rdx: src  	 */ -	pushq %rbp; +	pushq %r12;  	pushq %rbx; -	preload_roundkey_dec(17); -	movq %rsi, %r11; +	movq %rdi, CTX; +	movq %rsi, %r11  	movq %rdx, RIO; +	preload_roundkey_dec(17);  	read_block4();  	round_dec4(17); @@ -373,7 +377,7 @@ ENTRY(blowfish_dec_blk_4way)  	write_block4();  	popq %rbx; -	popq %rbp; +	popq %r12;  	ret;  ENDPROC(blowfish_dec_blk_4way) diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S index 310319c601ed..95ba6956a7f6 100644 --- a/arch/x86/crypto/camellia-x86_64-asm_64.S +++ b/arch/x86/crypto/camellia-x86_64-asm_64.S @@ -75,17 +75,17 @@  #define RCD1bh %dh  #define RT0 %rsi -#define RT1 %rbp +#define RT1 %r12  #define RT2 %r8  #define RT0d %esi -#define RT1d %ebp +#define RT1d %r12d  #define RT2d %r8d  #define RT2bl %r8b  #define RXOR %r9 -#define RRBP %r10 +#define RR12 %r10  #define RDST %r11  #define RXORd %r9d @@ -197,7 +197,7 @@ ENTRY(__camellia_enc_blk)  	 *	%rdx: src  	 
*	%rcx: bool xor  	 */ -	movq %rbp, RRBP; +	movq %r12, RR12;  	movq %rcx, RXOR;  	movq %rsi, RDST; @@ -227,13 +227,13 @@ ENTRY(__camellia_enc_blk)  	enc_outunpack(mov, RT1); -	movq RRBP, %rbp; +	movq RR12, %r12;  	ret;  .L__enc_xor:  	enc_outunpack(xor, RT1); -	movq RRBP, %rbp; +	movq RR12, %r12;  	ret;  ENDPROC(__camellia_enc_blk) @@ -248,7 +248,7 @@ ENTRY(camellia_dec_blk)  	movl $24, RXORd;  	cmovel RXORd, RT2d; /* max */ -	movq %rbp, RRBP; +	movq %r12, RR12;  	movq %rsi, RDST;  	movq %rdx, RIO; @@ -271,7 +271,7 @@ ENTRY(camellia_dec_blk)  	dec_outunpack(); -	movq RRBP, %rbp; +	movq RR12, %r12;  	ret;  ENDPROC(camellia_dec_blk) @@ -433,7 +433,7 @@ ENTRY(__camellia_enc_blk_2way)  	 */  	pushq %rbx; -	movq %rbp, RRBP; +	movq %r12, RR12;  	movq %rcx, RXOR;  	movq %rsi, RDST;  	movq %rdx, RIO; @@ -461,14 +461,14 @@ ENTRY(__camellia_enc_blk_2way)  	enc_outunpack2(mov, RT2); -	movq RRBP, %rbp; +	movq RR12, %r12;  	popq %rbx;  	ret;  .L__enc2_xor:  	enc_outunpack2(xor, RT2); -	movq RRBP, %rbp; +	movq RR12, %r12;  	popq %rbx;  	ret;  ENDPROC(__camellia_enc_blk_2way) @@ -485,7 +485,7 @@ ENTRY(camellia_dec_blk_2way)  	cmovel RXORd, RT2d; /* max */  	movq %rbx, RXOR; -	movq %rbp, RRBP; +	movq %r12, RR12;  	movq %rsi, RDST;  	movq %rdx, RIO; @@ -508,7 +508,7 @@ ENTRY(camellia_dec_blk_2way)  	dec_outunpack2(); -	movq RRBP, %rbp; +	movq RR12, %r12;  	movq RXOR, %rbx;  	ret;  ENDPROC(camellia_dec_blk_2way) diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index b4a8806234ea..86107c961bb4 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -47,7 +47,7 @@  /**********************************************************************    16-way AVX cast5   **********************************************************************/ -#define CTX %rdi +#define CTX %r15  #define RL1 %xmm0  #define RR1 %xmm1 @@ -70,8 +70,8 @@  #define RTMP %xmm15 -#define RID1  %rbp -#define RID1d %ebp +#define RID1  %rdi +#define RID1d %edi  #define RID2  %rsi  #define RID2d %esi @@ -226,7 +226,7 @@  .align 16  __cast5_enc_blk16:  	/* input: -	 *	%rdi: ctx, CTX +	 *	%rdi: ctx  	 *	RL1: blocks 1 and 2  	 *	RR1: blocks 3 and 4  	 *	RL2: blocks 5 and 6 @@ -246,9 +246,11 @@ __cast5_enc_blk16:  	 *	RR4: encrypted blocks 15 and 16  	 */ -	pushq %rbp; +	pushq %r15;  	pushq %rbx; +	movq %rdi, CTX; +  	vmovdqa .Lbswap_mask, RKM;  	vmovd .Lfirst_mask, R1ST;  	vmovd .L32_mask, R32; @@ -283,7 +285,7 @@ __cast5_enc_blk16:  .L__skip_enc:  	popq %rbx; -	popq %rbp; +	popq %r15;  	vmovdqa .Lbswap_mask, RKM; @@ -298,7 +300,7 @@ ENDPROC(__cast5_enc_blk16)  .align 16  __cast5_dec_blk16:  	/* input: -	 *	%rdi: ctx, CTX +	 *	%rdi: ctx  	 *	RL1: encrypted blocks 1 and 2  	 *	RR1: encrypted blocks 3 and 4  	 *	RL2: encrypted blocks 5 and 6 @@ -318,9 +320,11 @@ __cast5_dec_blk16:  	 *	RR4: decrypted blocks 15 and 16  	 */ -	pushq %rbp; +	pushq %r15;  	pushq %rbx; +	movq %rdi, CTX; +  	vmovdqa .Lbswap_mask, RKM;  	vmovd .Lfirst_mask, R1ST;  	vmovd .L32_mask, R32; @@ -356,7 +360,7 @@ __cast5_dec_blk16:  	vmovdqa .Lbswap_mask, RKM;  	popq %rbx; -	popq %rbp; +	popq %r15;  	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);  	outunpack_blocks(RR2, RL2, RTMP, RX, RKM); @@ -372,12 +376,14 @@ ENDPROC(__cast5_dec_blk16)  ENTRY(cast5_ecb_enc_16way)  	/* input: -	 *	%rdi: ctx, CTX +	 *	%rdi: ctx  	 *	%rsi: dst  	 *	%rdx: src  	 */  	FRAME_BEGIN +	pushq %r15; +	movq %rdi, CTX;  	movq %rsi, %r11;  	vmovdqu (0*4*4)(%rdx), RL1; @@ -400,18 +406,22 @@ ENTRY(cast5_ecb_enc_16way)  	
vmovdqu RR4, (6*4*4)(%r11);  	vmovdqu RL4, (7*4*4)(%r11); +	popq %r15;  	FRAME_END  	ret;  ENDPROC(cast5_ecb_enc_16way)  ENTRY(cast5_ecb_dec_16way)  	/* input: -	 *	%rdi: ctx, CTX +	 *	%rdi: ctx  	 *	%rsi: dst  	 *	%rdx: src  	 */  	FRAME_BEGIN +	pushq %r15; + +	movq %rdi, CTX;  	movq %rsi, %r11;  	vmovdqu (0*4*4)(%rdx), RL1; @@ -434,20 +444,22 @@ ENTRY(cast5_ecb_dec_16way)  	vmovdqu RR4, (6*4*4)(%r11);  	vmovdqu RL4, (7*4*4)(%r11); +	popq %r15;  	FRAME_END  	ret;  ENDPROC(cast5_ecb_dec_16way)  ENTRY(cast5_cbc_dec_16way)  	/* input: -	 *	%rdi: ctx, CTX +	 *	%rdi: ctx  	 *	%rsi: dst  	 *	%rdx: src  	 */  	FRAME_BEGIN -  	pushq %r12; +	pushq %r15; +	movq %rdi, CTX;  	movq %rsi, %r11;  	movq %rdx, %r12; @@ -483,23 +495,24 @@ ENTRY(cast5_cbc_dec_16way)  	vmovdqu RR4, (6*16)(%r11);  	vmovdqu RL4, (7*16)(%r11); +	popq %r15;  	popq %r12; -  	FRAME_END  	ret;  ENDPROC(cast5_cbc_dec_16way)  ENTRY(cast5_ctr_16way)  	/* input: -	 *	%rdi: ctx, CTX +	 *	%rdi: ctx  	 *	%rsi: dst  	 *	%rdx: src  	 *	%rcx: iv (big endian, 64bit)  	 */  	FRAME_BEGIN -  	pushq %r12; +	pushq %r15; +	movq %rdi, CTX;  	movq %rsi, %r11;  	movq %rdx, %r12; @@ -558,8 +571,8 @@ ENTRY(cast5_ctr_16way)  	vmovdqu RR4, (6*16)(%r11);  	vmovdqu RL4, (7*16)(%r11); +	popq %r15;  	popq %r12; -  	FRAME_END  	ret;  ENDPROC(cast5_ctr_16way) diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index 952d3156a933..7f30b6f0d72c 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -47,7 +47,7 @@  /**********************************************************************    8-way AVX cast6   **********************************************************************/ -#define CTX %rdi +#define CTX %r15  #define RA1 %xmm0  #define RB1 %xmm1 @@ -70,8 +70,8 @@  #define RTMP %xmm15 -#define RID1  %rbp -#define RID1d %ebp +#define RID1  %rdi +#define RID1d %edi  #define RID2  %rsi  #define RID2d %esi @@ -264,15 +264,17 @@  .align 8  __cast6_enc_blk8:  	/* input: -	 *	%rdi: ctx, CTX +	 *	%rdi: ctx  	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks  	 * output:  	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks  	 */ -	pushq %rbp; +	pushq %r15;  	pushq %rbx; +	movq %rdi, CTX; +  	vmovdqa .Lbswap_mask, RKM;  	vmovd .Lfirst_mask, R1ST;  	vmovd .L32_mask, R32; @@ -297,7 +299,7 @@ __cast6_enc_blk8:  	QBAR(11);  	popq %rbx; -	popq %rbp; +	popq %r15;  	vmovdqa .Lbswap_mask, RKM; @@ -310,15 +312,17 @@ ENDPROC(__cast6_enc_blk8)  .align 8  __cast6_dec_blk8:  	/* input: -	 *	%rdi: ctx, CTX +	 *	%rdi: ctx  	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks  	 * output:  	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks  	 */ -	pushq %rbp; +	pushq %r15;  	pushq %rbx; +	movq %rdi, CTX; +  	vmovdqa .Lbswap_mask, RKM;  	vmovd .Lfirst_mask, R1ST;  	vmovd .L32_mask, R32; @@ -343,7 +347,7 @@ __cast6_dec_blk8:  	QBAR(0);  	popq %rbx; -	popq %rbp; +	popq %r15;  	vmovdqa .Lbswap_mask, RKM;  	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); @@ -354,12 +358,14 @@ ENDPROC(__cast6_dec_blk8)  ENTRY(cast6_ecb_enc_8way)  	/* input: -	 *	%rdi: ctx, CTX +	 *	%rdi: ctx  	 *	%rsi: dst  	 *	%rdx: src  	 */  	FRAME_BEGIN +	pushq %r15; +	movq %rdi, CTX;  	movq %rsi, %r11;  	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); @@ -368,18 +374,21 @@ ENTRY(cast6_ecb_enc_8way)  	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); +	popq %r15;  	FRAME_END  	ret;  ENDPROC(cast6_ecb_enc_8way)  ENTRY(cast6_ecb_dec_8way)  	/* input: -	 *	%rdi: ctx, CTX +	 
*	%rdi: ctx  	 *	%rsi: dst  	 *	%rdx: src  	 */  	FRAME_BEGIN +	pushq %r15; +	movq %rdi, CTX;  	movq %rsi, %r11;  	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); @@ -388,20 +397,22 @@ ENTRY(cast6_ecb_dec_8way)  	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); +	popq %r15;  	FRAME_END  	ret;  ENDPROC(cast6_ecb_dec_8way)  ENTRY(cast6_cbc_dec_8way)  	/* input: -	 *	%rdi: ctx, CTX +	 *	%rdi: ctx  	 *	%rsi: dst  	 *	%rdx: src  	 */  	FRAME_BEGIN -  	pushq %r12; +	pushq %r15; +	movq %rdi, CTX;  	movq %rsi, %r11;  	movq %rdx, %r12; @@ -411,8 +422,8 @@ ENTRY(cast6_cbc_dec_8way)  	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); +	popq %r15;  	popq %r12; -  	FRAME_END  	ret;  ENDPROC(cast6_cbc_dec_8way) @@ -425,9 +436,10 @@ ENTRY(cast6_ctr_8way)  	 *	%rcx: iv (little endian, 128bit)  	 */  	FRAME_BEGIN -  	pushq %r12; +	pushq %r15 +	movq %rdi, CTX;  	movq %rsi, %r11;  	movq %rdx, %r12; @@ -438,8 +450,8 @@ ENTRY(cast6_ctr_8way)  	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); +	popq %r15;  	popq %r12; -  	FRAME_END  	ret;  ENDPROC(cast6_ctr_8way) @@ -452,7 +464,9 @@ ENTRY(cast6_xts_enc_8way)  	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))  	 */  	FRAME_BEGIN +	pushq %r15; +	movq %rdi, CTX  	movq %rsi, %r11;  	/* regs <= src, dst <= IVs, regs <= regs xor IVs */ @@ -464,6 +478,7 @@ ENTRY(cast6_xts_enc_8way)  	/* dst <= regs xor IVs(in dst) */  	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); +	popq %r15;  	FRAME_END  	ret;  ENDPROC(cast6_xts_enc_8way) @@ -476,7 +491,9 @@ ENTRY(cast6_xts_dec_8way)  	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))  	 */  	FRAME_BEGIN +	pushq %r15; +	movq %rdi, CTX  	movq %rsi, %r11;  	/* regs <= src, dst <= IVs, regs <= regs xor IVs */ @@ -488,6 +505,7 @@ ENTRY(cast6_xts_dec_8way)  	/* dst <= regs xor IVs(in dst) */  	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); +	popq %r15;  	FRAME_END  	ret;  ENDPROC(cast6_xts_dec_8way) diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S index 3a2dc3dc6cac..f3cd26f48332 100644 --- a/arch/x86/crypto/chacha20-avx2-x86_64.S +++ b/arch/x86/crypto/chacha20-avx2-x86_64.S @@ -45,7 +45,7 @@ ENTRY(chacha20_8block_xor_avx2)  	vzeroupper  	# 4 * 32 byte stack, 32-byte aligned -	mov		%rsp, %r8 +	lea		8(%rsp),%r10  	and		$~31, %rsp  	sub		$0x80, %rsp @@ -443,6 +443,6 @@ ENTRY(chacha20_8block_xor_avx2)  	vmovdqu		%ymm15,0x01e0(%rsi)  	vzeroupper -	mov		%r8,%rsp +	lea		-8(%r10),%rsp  	ret  ENDPROC(chacha20_8block_xor_avx2) diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S index 3f511a7d73b8..512a2b500fd1 100644 --- a/arch/x86/crypto/chacha20-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S @@ -160,7 +160,7 @@ ENTRY(chacha20_4block_xor_ssse3)  	# done with the slightly better performing SSSE3 byte shuffling,  	# 7/12-bit word rotation uses traditional shift+OR. 
-	mov		%rsp,%r11 +	lea		8(%rsp),%r10  	sub		$0x80,%rsp  	and		$~63,%rsp @@ -625,6 +625,6 @@ ENTRY(chacha20_4block_xor_ssse3)  	pxor		%xmm1,%xmm15  	movdqu		%xmm15,0xf0(%rsi) -	mov		%r11,%rsp +	lea		-8(%r10),%rsp  	ret  ENDPROC(chacha20_4block_xor_ssse3) diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S index f3e91647ca27..8e49ce117494 100644 --- a/arch/x86/crypto/des3_ede-asm_64.S +++ b/arch/x86/crypto/des3_ede-asm_64.S @@ -64,12 +64,12 @@  #define RW2bh %ch  #define RT0 %r15 -#define RT1 %rbp +#define RT1 %rsi  #define RT2 %r14  #define RT3 %rdx  #define RT0d %r15d -#define RT1d %ebp +#define RT1d %esi  #define RT2d %r14d  #define RT3d %edx @@ -177,13 +177,14 @@ ENTRY(des3_ede_x86_64_crypt_blk)  	 *	%rsi: dst  	 *	%rdx: src  	 */ -	pushq %rbp;  	pushq %rbx;  	pushq %r12;  	pushq %r13;  	pushq %r14;  	pushq %r15; +	pushq %rsi; /* dst */ +  	read_block(%rdx, RL0, RR0);  	initial_permutation(RL0, RR0); @@ -241,6 +242,8 @@ ENTRY(des3_ede_x86_64_crypt_blk)  	round1(32+15, RL0, RR0, dummy2);  	final_permutation(RR0, RL0); + +	popq %rsi /* dst */  	write_block(%rsi, RR0, RL0);  	popq %r15; @@ -248,7 +251,6 @@ ENTRY(des3_ede_x86_64_crypt_blk)  	popq %r13;  	popq %r12;  	popq %rbx; -	popq %rbp;  	ret;  ENDPROC(des3_ede_x86_64_crypt_blk) @@ -432,13 +434,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)  	 *	%rdx: src (3 blocks)  	 */ -	pushq %rbp;  	pushq %rbx;  	pushq %r12;  	pushq %r13;  	pushq %r14;  	pushq %r15; +	pushq %rsi /* dst */ +  	/* load input */  	movl 0 * 4(%rdx), RL0d;  	movl 1 * 4(%rdx), RR0d; @@ -520,6 +523,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)  	bswapl RR2d;  	bswapl RL2d; +	popq %rsi /* dst */  	movl RR0d, 0 * 4(%rsi);  	movl RL0d, 1 * 4(%rsi);  	movl RR1d, 2 * 4(%rsi); @@ -532,7 +536,6 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)  	popq %r13;  	popq %r12;  	popq %rbx; -	popq %rbp;  	ret;  ENDPROC(des3_ede_x86_64_crypt_blk_3way) diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S index 9279e0b2d60e..10db30d58006 100644 --- a/arch/x86/crypto/salsa20-x86_64-asm_64.S +++ b/arch/x86/crypto/salsa20-x86_64-asm_64.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #include <linux/linkage.h>  # enter salsa20_encrypt_bytes diff --git a/arch/x86/crypto/sha1-mb/Makefile b/arch/x86/crypto/sha1-mb/Makefile index 2e14acc3da25..815ded3ba90e 100644 --- a/arch/x86/crypto/sha1-mb/Makefile +++ b/arch/x86/crypto/sha1-mb/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  #  # Arch-specific CryptoAPI modules.  
# diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S index 1eab79c9ac48..9f712a7dfd79 100644 --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S @@ -89,7 +89,7 @@  #define	REG_RE	%rdx  #define	REG_RTA	%r12  #define	REG_RTB	%rbx -#define	REG_T1	%ebp +#define	REG_T1	%r11d  #define	xmm_mov	vmovups  #define	avx2_zeroupper	vzeroupper  #define	RND_F1	1 @@ -637,7 +637,6 @@ _loop3:  	ENTRY(\name)  	push	%rbx -	push	%rbp  	push	%r12  	push	%r13  	push	%r14 @@ -673,7 +672,6 @@ _loop3:  	pop	%r14  	pop	%r13  	pop	%r12 -	pop	%rbp  	pop	%rbx  	ret diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S index a4109506a5e8..6204bd53528c 100644 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -37,7 +37,7 @@  #define REG_A	%ecx  #define REG_B	%esi  #define REG_C	%edi -#define REG_D	%ebp +#define REG_D	%r12d  #define REG_E	%edx  #define REG_T1	%eax @@ -74,10 +74,10 @@  	ENTRY(\name)  	push	%rbx -	push	%rbp  	push	%r12 +	push	%rbp +	mov	%rsp, %rbp -	mov	%rsp, %r12  	sub	$64, %rsp		# allocate workspace  	and	$~15, %rsp		# align stack @@ -99,10 +99,9 @@  	xor	%rax, %rax  	rep stosq -	mov	%r12, %rsp		# deallocate workspace - -	pop	%r12 +	mov	%rbp, %rsp		# deallocate workspace  	pop	%rbp +	pop	%r12  	pop	%rbx  	ret diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index e08888a1a5f2..001bbcf93c79 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -103,7 +103,7 @@ SRND = %rsi       # clobbers INP  c = %ecx  d = %r8d  e = %edx -TBL = %rbp +TBL = %r12  a = %eax  b = %ebx @@ -350,13 +350,13 @@ a = TMP_  ENTRY(sha256_transform_avx)  .align 32  	pushq   %rbx -	pushq   %rbp +	pushq   %r12  	pushq   %r13  	pushq   %r14  	pushq   %r15 -	pushq   %r12 +	pushq	%rbp +	movq	%rsp, %rbp -	mov	%rsp, %r12  	subq    $STACK_SIZE, %rsp	# allocate stack space  	and	$~15, %rsp		# align stack pointer @@ -452,13 +452,12 @@ loop2:  done_hash: -	mov	%r12, %rsp - -	popq	%r12 +	mov	%rbp, %rsp +	popq	%rbp  	popq    %r15  	popq    %r14  	popq    %r13 -	popq    %rbp +	popq	%r12  	popq    %rbx  	ret  ENDPROC(sha256_transform_avx) diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 89c8f09787d2..1420db15dcdd 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -98,8 +98,6 @@ d	= %r8d  e       = %edx	# clobbers NUM_BLKS  y3	= %esi	# clobbers INP - -TBL	= %rbp  SRND	= CTX	# SRND is same register as CTX  a = %eax @@ -531,7 +529,6 @@ STACK_SIZE	= _RSP      + _RSP_SIZE  ENTRY(sha256_transform_rorx)  .align 32  	pushq	%rbx -	pushq	%rbp  	pushq	%r12  	pushq	%r13  	pushq	%r14 @@ -568,8 +565,6 @@ ENTRY(sha256_transform_rorx)  	mov	CTX, _CTX(%rsp)  loop0: -	lea     K256(%rip), TBL -  	## Load first 16 dwords from two blocks  	VMOVDQ	0*32(INP),XTMP0  	VMOVDQ	1*32(INP),XTMP1 @@ -597,19 +592,19 @@ last_block_enter:  .align 16  loop1: -	vpaddd	0*32(TBL, SRND), X0, XFER +	vpaddd	K256+0*32(SRND), X0, XFER  	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)  	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32 -	vpaddd	1*32(TBL, SRND), X0, XFER +	vpaddd	K256+1*32(SRND), X0, XFER  	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)  	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32 -	vpaddd	2*32(TBL, SRND), X0, XFER +	vpaddd	K256+2*32(SRND), X0, XFER  	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)  	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32 -	vpaddd	3*32(TBL, SRND), X0, XFER +	vpaddd	K256+3*32(SRND), X0, XFER  	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)  	
FOUR_ROUNDS_AND_SCHED	_XFER + 3*32 @@ -619,10 +614,11 @@ loop1:  loop2:  	## Do last 16 rounds with no scheduling -	vpaddd	0*32(TBL, SRND), X0, XFER +	vpaddd	K256+0*32(SRND), X0, XFER  	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)  	DO_4ROUNDS	_XFER + 0*32 -	vpaddd	1*32(TBL, SRND), X1, XFER + +	vpaddd	K256+1*32(SRND), X1, XFER  	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)  	DO_4ROUNDS	_XFER + 1*32  	add	$2*32, SRND @@ -676,9 +672,6 @@ loop3:  	ja	done_hash  do_last_block: -	#### do last block -	lea	K256(%rip), TBL -  	VMOVDQ	0*16(INP),XWORD0  	VMOVDQ	1*16(INP),XWORD1  	VMOVDQ	2*16(INP),XWORD2 @@ -718,7 +711,6 @@ done_hash:  	popq	%r14  	popq	%r13  	popq	%r12 -	popq	%rbp  	popq	%rbx  	ret  ENDPROC(sha256_transform_rorx) diff --git a/arch/x86/crypto/sha256-mb/Makefile b/arch/x86/crypto/sha256-mb/Makefile index 45b4fca6c4a8..53ad6e7db747 100644 --- a/arch/x86/crypto/sha256-mb/Makefile +++ b/arch/x86/crypto/sha256-mb/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  #  # Arch-specific CryptoAPI modules.  # diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index 39b83c93e7fd..c6c05ed2c16a 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -95,7 +95,7 @@ SRND = %rsi       # clobbers INP  c = %ecx  d = %r8d  e = %edx -TBL = %rbp +TBL = %r12  a = %eax  b = %ebx @@ -356,13 +356,13 @@ a = TMP_  ENTRY(sha256_transform_ssse3)  .align 32  	pushq   %rbx -	pushq   %rbp +	pushq   %r12  	pushq   %r13  	pushq   %r14  	pushq   %r15 -	pushq   %r12 +	pushq   %rbp +	mov	%rsp, %rbp -	mov	%rsp, %r12  	subq    $STACK_SIZE, %rsp  	and	$~15, %rsp @@ -462,13 +462,12 @@ loop2:  done_hash: -	mov	%r12, %rsp - -	popq    %r12 +	mov	%rbp, %rsp +	popq	%rbp  	popq    %r15  	popq    %r14  	popq    %r13 -	popq    %rbp +	popq    %r12  	popq    %rbx  	ret diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index 7f5f6c6ec72e..b16d56005162 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -69,8 +69,9 @@ XFER  = YTMP0  BYTE_FLIP_MASK  = %ymm9 -# 1st arg -CTX         = %rdi +# 1st arg is %rdi, which is saved to the stack and accessed later via %r12 +CTX1        = %rdi +CTX2        = %r12  # 2nd arg  INP         = %rsi  # 3rd arg @@ -81,7 +82,7 @@ d           = %r8  e           = %rdx  y3          = %rsi -TBL   = %rbp +TBL   = %rdi # clobbers CTX1  a     = %rax  b     = %rbx @@ -91,26 +92,26 @@ g     = %r10  h     = %r11  old_h = %r11 -T1    = %r12 +T1    = %r12 # clobbers CTX2  y0    = %r13  y1    = %r14  y2    = %r15 -y4    = %r12 -  # Local variables (stack frame)  XFER_SIZE = 4*8  SRND_SIZE = 1*8  INP_SIZE = 1*8  INPEND_SIZE = 1*8 +CTX_SIZE = 1*8  RSPSAVE_SIZE = 1*8 -GPRSAVE_SIZE = 6*8 +GPRSAVE_SIZE = 5*8  frame_XFER = 0  frame_SRND = frame_XFER + XFER_SIZE  frame_INP = frame_SRND + SRND_SIZE  frame_INPEND = frame_INP + INP_SIZE -frame_RSPSAVE = frame_INPEND + INPEND_SIZE +frame_CTX = frame_INPEND + INPEND_SIZE +frame_RSPSAVE = frame_CTX + CTX_SIZE  frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE  frame_size = frame_GPRSAVE + GPRSAVE_SIZE @@ -576,12 +577,11 @@ ENTRY(sha512_transform_rorx)  	mov	%rax, frame_RSPSAVE(%rsp)  	# Save GPRs -	mov	%rbp, frame_GPRSAVE(%rsp) -	mov	%rbx, 8*1+frame_GPRSAVE(%rsp) -	mov	%r12, 8*2+frame_GPRSAVE(%rsp) -	mov	%r13, 8*3+frame_GPRSAVE(%rsp) -	mov	%r14, 8*4+frame_GPRSAVE(%rsp) -	mov	%r15, 8*5+frame_GPRSAVE(%rsp) +	mov	%rbx, 8*0+frame_GPRSAVE(%rsp) +	mov	%r12, 8*1+frame_GPRSAVE(%rsp) +	mov	%r13, 8*2+frame_GPRSAVE(%rsp) +	mov	%r14, 8*3+frame_GPRSAVE(%rsp) +	mov	
%r15, 8*4+frame_GPRSAVE(%rsp)  	shl	$7, NUM_BLKS	# convert to bytes  	jz	done_hash @@ -589,14 +589,17 @@ ENTRY(sha512_transform_rorx)  	mov	NUM_BLKS, frame_INPEND(%rsp)  	## load initial digest -	mov	8*0(CTX),a -	mov	8*1(CTX),b -	mov	8*2(CTX),c -	mov	8*3(CTX),d -	mov	8*4(CTX),e -	mov	8*5(CTX),f -	mov	8*6(CTX),g -	mov	8*7(CTX),h +	mov	8*0(CTX1), a +	mov	8*1(CTX1), b +	mov	8*2(CTX1), c +	mov	8*3(CTX1), d +	mov	8*4(CTX1), e +	mov	8*5(CTX1), f +	mov	8*6(CTX1), g +	mov	8*7(CTX1), h + +	# save %rdi (CTX) before it gets clobbered +	mov	%rdi, frame_CTX(%rsp)  	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK @@ -652,14 +655,15 @@ loop2:  	subq	$1, frame_SRND(%rsp)  	jne	loop2 -	addm	8*0(CTX),a -	addm	8*1(CTX),b -	addm	8*2(CTX),c -	addm	8*3(CTX),d -	addm	8*4(CTX),e -	addm	8*5(CTX),f -	addm	8*6(CTX),g -	addm	8*7(CTX),h +	mov	frame_CTX(%rsp), CTX2 +	addm	8*0(CTX2), a +	addm	8*1(CTX2), b +	addm	8*2(CTX2), c +	addm	8*3(CTX2), d +	addm	8*4(CTX2), e +	addm	8*5(CTX2), f +	addm	8*6(CTX2), g +	addm	8*7(CTX2), h  	mov	frame_INP(%rsp), INP  	add	$128, INP @@ -669,12 +673,11 @@ loop2:  done_hash:  # Restore GPRs -	mov	frame_GPRSAVE(%rsp)     ,%rbp -	mov	8*1+frame_GPRSAVE(%rsp) ,%rbx -	mov	8*2+frame_GPRSAVE(%rsp) ,%r12 -	mov	8*3+frame_GPRSAVE(%rsp) ,%r13 -	mov	8*4+frame_GPRSAVE(%rsp) ,%r14 -	mov	8*5+frame_GPRSAVE(%rsp) ,%r15 +	mov	8*0+frame_GPRSAVE(%rsp), %rbx +	mov	8*1+frame_GPRSAVE(%rsp), %r12 +	mov	8*2+frame_GPRSAVE(%rsp), %r13 +	mov	8*3+frame_GPRSAVE(%rsp), %r14 +	mov	8*4+frame_GPRSAVE(%rsp), %r15  	# Restore Stack Pointer  	mov	frame_RSPSAVE(%rsp), %rsp diff --git a/arch/x86/crypto/sha512-mb/Makefile b/arch/x86/crypto/sha512-mb/Makefile index 0a57e2103980..90f1ef69152e 100644 --- a/arch/x86/crypto/sha512-mb/Makefile +++ b/arch/x86/crypto/sha512-mb/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  #  # Arch-specific CryptoAPI modules.  # diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index b3f49d286348..73b471da3622 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -76,8 +76,8 @@  #define RT %xmm14  #define RR %xmm15 -#define RID1  %rbp -#define RID1d %ebp +#define RID1  %r13 +#define RID1d %r13d  #define RID2  %rsi  #define RID2d %esi @@ -259,7 +259,7 @@ __twofish_enc_blk8:  	vmovdqu w(CTX), RK1; -	pushq %rbp; +	pushq %r13;  	pushq %rbx;  	pushq %rcx; @@ -282,7 +282,7 @@ __twofish_enc_blk8:  	popq %rcx;  	popq %rbx; -	popq %rbp; +	popq %r13;  	outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);  	outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); @@ -301,7 +301,7 @@ __twofish_dec_blk8:  	vmovdqu (w+4*4)(CTX), RK1; -	pushq %rbp; +	pushq %r13;  	pushq %rbx;  	inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); @@ -322,7 +322,7 @@ __twofish_dec_blk8:  	vmovdqu (w)(CTX), RK1;  	popq %rbx; -	popq %rbp; +	popq %r13;  	outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);  	outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); |
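
Note on the chacha20 hunks: both variants realign the stack for their vector working area, and the saved stack pointer is now expressed relative to the caller's frame instead of being kept as a raw copy of %rsp, presumably so stack-validation tooling can follow the frame. The two forms are functionally equivalent; a sketch of the new idiom as it appears in the AVX2 version:

	lea	8(%rsp), %r10	# %r10 = caller's %rsp (entry %rsp plus the 8-byte return address)
	and	$~31, %rsp	# 32-byte align for the ymm working area
	sub	$0x80, %rsp	# carve out the 4 * 32 byte buffer
	# ... transform ...
	lea	-8(%r10), %rsp	# point %rsp back at the return address
	ret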
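In the sha1_ssse3, sha256-avx and sha256-ssse3 transforms, %rbp changes roles: the round state or table pointer it used to hold moves to %r12, and %rbp is instead pushed last and used as a conventional frame pointer for the aligned workspace. The prologue/epilogue now follow the shape below (constants as in sha1_ssse3_asm.S):

	pushq	%rbx
	pushq	%r12		# callee-saved scratch used in the rounds
	pushq	%rbp
	mov	%rsp, %rbp	# %rbp frames the function, as the unwinder expects
	sub	$64, %rsp	# allocate workspace
	and	$~15, %rsp	# 16-byte align it for aligned SSE stores
	# ... rounds ...
	mov	%rbp, %rsp	# drop the aligned workspace
	pop	%rbp
	pop	%r12
	pop	%rbx
	ret

The AVX2 variant (sha256-avx2-asm.S) goes further and drops the table register entirely: the K256 round constants are addressed as an absolute displacement plus the round counter, e.g. vpaddd K256+1*32(SRND), X0, XFER, so no register has to carry the table address across the rounds.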
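Where no spare callee-saved register is available, the patch spills to the stack instead: des3_ede pushes the dst pointer at entry and pops it back just before the output is written, and sha512-avx2 saves the ctx pointer into a new frame_CTX stack slot, because %rdi is reused as the table pointer once the initial digest has been loaded; the ctx is later reloaded into %r12 when the new state is added back. Roughly, with register and macro names as defined in sha512-avx2-asm.S:

	mov	8*0(CTX1), a		# initial digest read while ctx is still in %rdi
	# ... remaining digest words ...
	mov	%rdi, frame_CTX(%rsp)	# stash ctx: %rdi doubles as TBL from here on
	# ... message schedule and rounds ...
	mov	frame_CTX(%rsp), CTX2	# reload ctx into %r12 once the rounds are done
	addm	8*0(CTX2), a		# fold the new state back into the context

The GPR save area in that file correspondingly shrinks from six to five slots, since %rbp itself no longer needs to be preserved.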