Diffstat (limited to 'arch/x86')
31 files changed, 310 insertions, 243 deletions
diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
index 246c67006ed0..8c1fcb6bad21 100644
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -33,7 +33,7 @@
#define s3 ((16 + 2 + (3 * 256)) * 4)
/* register macros */
-#define CTX %rdi
+#define CTX %r12
#define RIO %rsi
#define RX0 %rax
@@ -56,12 +56,12 @@
#define RX2bh %ch
#define RX3bh %dh
-#define RT0 %rbp
+#define RT0 %rdi
#define RT1 %rsi
#define RT2 %r8
#define RT3 %r9
-#define RT0d %ebp
+#define RT0d %edi
#define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d
@@ -120,13 +120,14 @@
ENTRY(__blowfish_enc_blk)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
* %rcx: bool, if true: xor output
*/
- movq %rbp, %r11;
+ movq %r12, %r11;
+ movq %rdi, CTX;
movq %rsi, %r10;
movq %rdx, RIO;
@@ -142,7 +143,7 @@ ENTRY(__blowfish_enc_blk)
round_enc(14);
add_roundkey_enc(16);
- movq %r11, %rbp;
+ movq %r11, %r12;
movq %r10, RIO;
test %cl, %cl;
@@ -157,12 +158,13 @@ ENDPROC(__blowfish_enc_blk)
ENTRY(blowfish_dec_blk)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
- movq %rbp, %r11;
+ movq %r12, %r11;
+ movq %rdi, CTX;
movq %rsi, %r10;
movq %rdx, RIO;
@@ -181,7 +183,7 @@ ENTRY(blowfish_dec_blk)
movq %r10, RIO;
write_block();
- movq %r11, %rbp;
+ movq %r11, %r12;
ret;
ENDPROC(blowfish_dec_blk)
@@ -298,20 +300,21 @@ ENDPROC(blowfish_dec_blk)
ENTRY(__blowfish_enc_blk_4way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
* %rcx: bool, if true: xor output
*/
- pushq %rbp;
+ pushq %r12;
pushq %rbx;
pushq %rcx;
- preload_roundkey_enc(0);
-
+ movq %rdi, CTX
movq %rsi, %r11;
movq %rdx, RIO;
+ preload_roundkey_enc(0);
+
read_block4();
round_enc4(0);
@@ -324,39 +327,40 @@ ENTRY(__blowfish_enc_blk_4way)
round_enc4(14);
add_preloaded_roundkey4();
- popq %rbp;
+ popq %r12;
movq %r11, RIO;
- test %bpl, %bpl;
+ test %r12b, %r12b;
jnz .L__enc_xor4;
write_block4();
popq %rbx;
- popq %rbp;
+ popq %r12;
ret;
.L__enc_xor4:
xor_block4();
popq %rbx;
- popq %rbp;
+ popq %r12;
ret;
ENDPROC(__blowfish_enc_blk_4way)
ENTRY(blowfish_dec_blk_4way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
- pushq %rbp;
+ pushq %r12;
pushq %rbx;
- preload_roundkey_dec(17);
- movq %rsi, %r11;
+ movq %rdi, CTX;
+ movq %rsi, %r11
movq %rdx, RIO;
+ preload_roundkey_dec(17);
read_block4();
round_dec4(17);
@@ -373,7 +377,7 @@ ENTRY(blowfish_dec_blk_4way)
write_block4();
popq %rbx;
- popq %rbp;
+ popq %r12;
ret;
ENDPROC(blowfish_dec_blk_4way)
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
index 310319c601ed..95ba6956a7f6 100644
--- a/arch/x86/crypto/camellia-x86_64-asm_64.S
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -75,17 +75,17 @@
#define RCD1bh %dh
#define RT0 %rsi
-#define RT1 %rbp
+#define RT1 %r12
#define RT2 %r8
#define RT0d %esi
-#define RT1d %ebp
+#define RT1d %r12d
#define RT2d %r8d
#define RT2bl %r8b
#define RXOR %r9
-#define RRBP %r10
+#define RR12 %r10
#define RDST %r11
#define RXORd %r9d
@@ -197,7 +197,7 @@ ENTRY(__camellia_enc_blk)
* %rdx: src
* %rcx: bool xor
*/
- movq %rbp, RRBP;
+ movq %r12, RR12;
movq %rcx, RXOR;
movq %rsi, RDST;
@@ -227,13 +227,13 @@ ENTRY(__camellia_enc_blk)
enc_outunpack(mov, RT1);
- movq RRBP, %rbp;
+ movq RR12, %r12;
ret;
.L__enc_xor:
enc_outunpack(xor, RT1);
- movq RRBP, %rbp;
+ movq RR12, %r12;
ret;
ENDPROC(__camellia_enc_blk)
@@ -248,7 +248,7 @@ ENTRY(camellia_dec_blk)
movl $24, RXORd;
cmovel RXORd, RT2d; /* max */
- movq %rbp, RRBP;
+ movq %r12, RR12;
movq %rsi, RDST;
movq %rdx, RIO;
@@ -271,7 +271,7 @@ ENTRY(camellia_dec_blk)
dec_outunpack();
- movq RRBP, %rbp;
+ movq RR12, %r12;
ret;
ENDPROC(camellia_dec_blk)
@@ -433,7 +433,7 @@ ENTRY(__camellia_enc_blk_2way)
*/
pushq %rbx;
- movq %rbp, RRBP;
+ movq %r12, RR12;
movq %rcx, RXOR;
movq %rsi, RDST;
movq %rdx, RIO;
@@ -461,14 +461,14 @@ ENTRY(__camellia_enc_blk_2way)
enc_outunpack2(mov, RT2);
- movq RRBP, %rbp;
+ movq RR12, %r12;
popq %rbx;
ret;
.L__enc2_xor:
enc_outunpack2(xor, RT2);
- movq RRBP, %rbp;
+ movq RR12, %r12;
popq %rbx;
ret;
ENDPROC(__camellia_enc_blk_2way)
@@ -485,7 +485,7 @@ ENTRY(camellia_dec_blk_2way)
cmovel RXORd, RT2d; /* max */
movq %rbx, RXOR;
- movq %rbp, RRBP;
+ movq %r12, RR12;
movq %rsi, RDST;
movq %rdx, RIO;
@@ -508,7 +508,7 @@ ENTRY(camellia_dec_blk_2way)
dec_outunpack2();
- movq RRBP, %rbp;
+ movq RR12, %r12;
movq RXOR, %rbx;
ret;
ENDPROC(camellia_dec_blk_2way)
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index b4a8806234ea..86107c961bb4 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -47,7 +47,7 @@
/**********************************************************************
16-way AVX cast5
**********************************************************************/
-#define CTX %rdi
+#define CTX %r15
#define RL1 %xmm0
#define RR1 %xmm1
@@ -70,8 +70,8 @@
#define RTMP %xmm15
-#define RID1 %rbp
-#define RID1d %ebp
+#define RID1 %rdi
+#define RID1d %edi
#define RID2 %rsi
#define RID2d %esi
@@ -226,7 +226,7 @@
.align 16
__cast5_enc_blk16:
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* RL1: blocks 1 and 2
* RR1: blocks 3 and 4
* RL2: blocks 5 and 6
@@ -246,9 +246,11 @@ __cast5_enc_blk16:
* RR4: encrypted blocks 15 and 16
*/
- pushq %rbp;
+ pushq %r15;
pushq %rbx;
+ movq %rdi, CTX;
+
vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;
@@ -283,7 +285,7 @@ __cast5_enc_blk16:
.L__skip_enc:
popq %rbx;
- popq %rbp;
+ popq %r15;
vmovdqa .Lbswap_mask, RKM;
@@ -298,7 +300,7 @@ ENDPROC(__cast5_enc_blk16)
.align 16
__cast5_dec_blk16:
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* RL1: encrypted blocks 1 and 2
* RR1: encrypted blocks 3 and 4
* RL2: encrypted blocks 5 and 6
@@ -318,9 +320,11 @@ __cast5_dec_blk16:
* RR4: decrypted blocks 15 and 16
*/
- pushq %rbp;
+ pushq %r15;
pushq %rbx;
+ movq %rdi, CTX;
+
vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;
@@ -356,7 +360,7 @@ __cast5_dec_blk16:
vmovdqa .Lbswap_mask, RKM;
popq %rbx;
- popq %rbp;
+ popq %r15;
outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
@@ -372,12 +376,14 @@ ENDPROC(__cast5_dec_blk16)
ENTRY(cast5_ecb_enc_16way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
FRAME_BEGIN
+ pushq %r15;
+ movq %rdi, CTX;
movq %rsi, %r11;
vmovdqu (0*4*4)(%rdx), RL1;
@@ -400,18 +406,22 @@ ENTRY(cast5_ecb_enc_16way)
vmovdqu RR4, (6*4*4)(%r11);
vmovdqu RL4, (7*4*4)(%r11);
+ popq %r15;
FRAME_END
ret;
ENDPROC(cast5_ecb_enc_16way)
ENTRY(cast5_ecb_dec_16way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
FRAME_BEGIN
+ pushq %r15;
+
+ movq %rdi, CTX;
movq %rsi, %r11;
vmovdqu (0*4*4)(%rdx), RL1;
@@ -434,20 +444,22 @@ ENTRY(cast5_ecb_dec_16way)
vmovdqu RR4, (6*4*4)(%r11);
vmovdqu RL4, (7*4*4)(%r11);
+ popq %r15;
FRAME_END
ret;
ENDPROC(cast5_ecb_dec_16way)
ENTRY(cast5_cbc_dec_16way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
FRAME_BEGIN
-
pushq %r12;
+ pushq %r15;
+ movq %rdi, CTX;
movq %rsi, %r11;
movq %rdx, %r12;
@@ -483,23 +495,24 @@ ENTRY(cast5_cbc_dec_16way)
vmovdqu RR4, (6*16)(%r11);
vmovdqu RL4, (7*16)(%r11);
+ popq %r15;
popq %r12;
-
FRAME_END
ret;
ENDPROC(cast5_cbc_dec_16way)
ENTRY(cast5_ctr_16way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
* %rcx: iv (big endian, 64bit)
*/
FRAME_BEGIN
-
pushq %r12;
+ pushq %r15;
+ movq %rdi, CTX;
movq %rsi, %r11;
movq %rdx, %r12;
@@ -558,8 +571,8 @@ ENTRY(cast5_ctr_16way)
vmovdqu RR4, (6*16)(%r11);
vmovdqu RL4, (7*16)(%r11);
+ popq %r15;
popq %r12;
-
FRAME_END
ret;
ENDPROC(cast5_ctr_16way)
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 952d3156a933..7f30b6f0d72c 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -47,7 +47,7 @@
/**********************************************************************
8-way AVX cast6
**********************************************************************/
-#define CTX %rdi
+#define CTX %r15
#define RA1 %xmm0
#define RB1 %xmm1
@@ -70,8 +70,8 @@
#define RTMP %xmm15
-#define RID1 %rbp
-#define RID1d %ebp
+#define RID1 %rdi
+#define RID1d %edi
#define RID2 %rsi
#define RID2d %esi
@@ -264,15 +264,17 @@
.align 8
__cast6_enc_blk8:
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
* output:
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
*/
- pushq %rbp;
+ pushq %r15;
pushq %rbx;
+ movq %rdi, CTX;
+
vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;
@@ -297,7 +299,7 @@ __cast6_enc_blk8:
QBAR(11);
popq %rbx;
- popq %rbp;
+ popq %r15;
vmovdqa .Lbswap_mask, RKM;
@@ -310,15 +312,17 @@ ENDPROC(__cast6_enc_blk8)
.align 8
__cast6_dec_blk8:
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
* output:
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
*/
- pushq %rbp;
+ pushq %r15;
pushq %rbx;
+ movq %rdi, CTX;
+
vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;
@@ -343,7 +347,7 @@ __cast6_dec_blk8:
QBAR(0);
popq %rbx;
- popq %rbp;
+ popq %r15;
vmovdqa .Lbswap_mask, RKM;
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
@@ -354,12 +358,14 @@ ENDPROC(__cast6_dec_blk8)
ENTRY(cast6_ecb_enc_8way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
FRAME_BEGIN
+ pushq %r15;
+ movq %rdi, CTX;
movq %rsi, %r11;
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
@@ -368,18 +374,21 @@ ENTRY(cast6_ecb_enc_8way)
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ popq %r15;
FRAME_END
ret;
ENDPROC(cast6_ecb_enc_8way)
ENTRY(cast6_ecb_dec_8way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
FRAME_BEGIN
+ pushq %r15;
+ movq %rdi, CTX;
movq %rsi, %r11;
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
@@ -388,20 +397,22 @@ ENTRY(cast6_ecb_dec_8way)
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ popq %r15;
FRAME_END
ret;
ENDPROC(cast6_ecb_dec_8way)
ENTRY(cast6_cbc_dec_8way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
FRAME_BEGIN
-
pushq %r12;
+ pushq %r15;
+ movq %rdi, CTX;
movq %rsi, %r11;
movq %rdx, %r12;
@@ -411,8 +422,8 @@ ENTRY(cast6_cbc_dec_8way)
store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ popq %r15;
popq %r12;
-
FRAME_END
ret;
ENDPROC(cast6_cbc_dec_8way)
@@ -425,9 +436,10 @@ ENTRY(cast6_ctr_8way)
* %rcx: iv (little endian, 128bit)
*/
FRAME_BEGIN
-
pushq %r12;
+ pushq %r15
+ movq %rdi, CTX;
movq %rsi, %r11;
movq %rdx, %r12;
@@ -438,8 +450,8 @@ ENTRY(cast6_ctr_8way)
store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ popq %r15;
popq %r12;
-
FRAME_END
ret;
ENDPROC(cast6_ctr_8way)
@@ -452,7 +464,9 @@ ENTRY(cast6_xts_enc_8way)
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
FRAME_BEGIN
+ pushq %r15;
+ movq %rdi, CTX
movq %rsi, %r11;
/* regs <= src, dst <= IVs, regs <= regs xor IVs */
@@ -464,6 +478,7 @@ ENTRY(cast6_xts_enc_8way)
/* dst <= regs xor IVs(in dst) */
store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ popq %r15;
FRAME_END
ret;
ENDPROC(cast6_xts_enc_8way)
@@ -476,7 +491,9 @@ ENTRY(cast6_xts_dec_8way)
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
FRAME_BEGIN
+ pushq %r15;
+ movq %rdi, CTX
movq %rsi, %r11;
/* regs <= src, dst <= IVs, regs <= regs xor IVs */
@@ -488,6 +505,7 @@ ENTRY(cast6_xts_dec_8way)
/* dst <= regs xor IVs(in dst) */
store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ popq %r15;
FRAME_END
ret;
ENDPROC(cast6_xts_dec_8way)
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
index f3e91647ca27..8e49ce117494 100644
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -64,12 +64,12 @@
#define RW2bh %ch
#define RT0 %r15
-#define RT1 %rbp
+#define RT1 %rsi
#define RT2 %r14
#define RT3 %rdx
#define RT0d %r15d
-#define RT1d %ebp
+#define RT1d %esi
#define RT2d %r14d
#define RT3d %edx
@@ -177,13 +177,14 @@ ENTRY(des3_ede_x86_64_crypt_blk)
* %rsi: dst
* %rdx: src
*/
- pushq %rbp;
pushq %rbx;
pushq %r12;
pushq %r13;
pushq %r14;
pushq %r15;
+ pushq %rsi; /* dst */
+
read_block(%rdx, RL0, RR0);
initial_permutation(RL0, RR0);
@@ -241,6 +242,8 @@ ENTRY(des3_ede_x86_64_crypt_blk)
round1(32+15, RL0, RR0, dummy2);
final_permutation(RR0, RL0);
+
+ popq %rsi /* dst */
write_block(%rsi, RR0, RL0);
popq %r15;
@@ -248,7 +251,6 @@ ENTRY(des3_ede_x86_64_crypt_blk)
popq %r13;
popq %r12;
popq %rbx;
- popq %rbp;
ret;
ENDPROC(des3_ede_x86_64_crypt_blk)
@@ -432,13 +434,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
* %rdx: src (3 blocks)
*/
- pushq %rbp;
pushq %rbx;
pushq %r12;
pushq %r13;
pushq %r14;
pushq %r15;
+ pushq %rsi /* dst */
+
/* load input */
movl 0 * 4(%rdx), RL0d;
movl 1 * 4(%rdx), RR0d;
@@ -520,6 +523,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
bswapl RR2d;
bswapl RL2d;
+ popq %rsi /* dst */
movl RR0d, 0 * 4(%rsi);
movl RL0d, 1 * 4(%rsi);
movl RR1d, 2 * 4(%rsi);
@@ -532,7 +536,6 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
popq %r13;
popq %r12;
popq %rbx;
- popq %rbp;
ret;
ENDPROC(des3_ede_x86_64_crypt_blk_3way)
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 1eab79c9ac48..9f712a7dfd79 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -89,7 +89,7 @@
#define REG_RE %rdx
#define REG_RTA %r12
#define REG_RTB %rbx
-#define REG_T1 %ebp
+#define REG_T1 %r11d
#define xmm_mov vmovups
#define avx2_zeroupper vzeroupper
#define RND_F1 1
@@ -637,7 +637,6 @@ _loop3:
ENTRY(\name)
push %rbx
- push %rbp
push %r12
push %r13
push %r14
@@ -673,7 +672,6 @@ _loop3:
pop %r14
pop %r13
pop %r12
- pop %rbp
pop %rbx
ret
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index a4109506a5e8..6204bd53528c 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -37,7 +37,7 @@
#define REG_A %ecx
#define REG_B %esi
#define REG_C %edi
-#define REG_D %ebp
+#define REG_D %r12d
#define REG_E %edx
#define REG_T1 %eax
@@ -74,10 +74,10 @@
ENTRY(\name)
push %rbx
- push %rbp
push %r12
+ push %rbp
+ mov %rsp, %rbp
- mov %rsp, %r12
sub $64, %rsp # allocate workspace
and $~15, %rsp # align stack
@@ -99,10 +99,9 @@
xor %rax, %rax
rep stosq
- mov %r12, %rsp # deallocate workspace
-
- pop %r12
+ mov %rbp, %rsp # deallocate workspace
pop %rbp
+ pop %r12
pop %rbx
ret
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index e08888a1a5f2..001bbcf93c79 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -103,7 +103,7 @@
SRND = %rsi # clobbers INP
c = %ecx
d = %r8d
e = %edx
-TBL = %rbp
+TBL = %r12
a = %eax
b = %ebx
@@ -350,13 +350,13 @@ a = TMP_
ENTRY(sha256_transform_avx)
.align 32
pushq %rbx
- pushq %rbp
+ pushq %r12
pushq %r13
pushq %r14
pushq %r15
- pushq %r12
+ pushq %rbp
+ movq %rsp, %rbp
- mov %rsp, %r12
subq $STACK_SIZE, %rsp # allocate stack space
and $~15, %rsp # align stack pointer
@@ -452,13 +452,12 @@ loop2:
done_hash:
- mov %r12, %rsp
-
- popq %r12
+ mov %rbp, %rsp
+ popq %rbp
popq %r15
popq %r14
popq %r13
- popq %rbp
+ popq %r12
popq %rbx
ret
ENDPROC(sha256_transform_avx)
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 89c8f09787d2..1420db15dcdd 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -98,8 +98,6 @@
d = %r8d
e = %edx # clobbers NUM_BLKS
y3 = %esi # clobbers INP
-
-TBL = %rbp
SRND = CTX # SRND is same register as CTX
a = %eax
@@ -531,7 +529,6 @@
STACK_SIZE = _RSP + _RSP_SIZE
ENTRY(sha256_transform_rorx)
.align 32
pushq %rbx
- pushq %rbp
pushq %r12
pushq %r13
pushq %r14
@@ -568,8 +565,6 @@ ENTRY(sha256_transform_rorx)
mov CTX, _CTX(%rsp)
loop0:
- lea K256(%rip), TBL
-
## Load first 16 dwords from two blocks
VMOVDQ 0*32(INP),XTMP0
VMOVDQ 1*32(INP),XTMP1
@@ -597,19 +592,19 @@ last_block_enter:
.align 16
loop1:
- vpaddd 0*32(TBL, SRND), X0, XFER
+ vpaddd K256+0*32(SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 0*32
- vpaddd 1*32(TBL, SRND), X0, XFER
+ vpaddd K256+1*32(SRND), X0, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 1*32
- vpaddd 2*32(TBL, SRND), X0, XFER
+ vpaddd K256+2*32(SRND), X0, XFER
vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 2*32
- vpaddd 3*32(TBL, SRND), X0, XFER
+ vpaddd K256+3*32(SRND), X0, XFER
vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 3*32
@@ -619,10 +614,11 @@ loop2:
## Do last 16 rounds with no scheduling
- vpaddd 0*32(TBL, SRND), X0, XFER
+ vpaddd K256+0*32(SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 0*32
- vpaddd 1*32(TBL, SRND), X1, XFER
+
+ vpaddd K256+1*32(SRND), X1, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 1*32
add $2*32, SRND
@@ -676,9 +672,6 @@ loop3:
ja done_hash
do_last_block:
- #### do last block
- lea K256(%rip), TBL
-
VMOVDQ 0*16(INP),XWORD0
VMOVDQ 1*16(INP),XWORD1
VMOVDQ 2*16(INP),XWORD2
@@ -718,7 +711,6 @@ done_hash:
popq %r14
popq %r13
popq %r12
- popq %rbp
popq %rbx
ret
ENDPROC(sha256_transform_rorx)
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index 39b83c93e7fd..c6c05ed2c16a 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -95,7 +95,7 @@
SRND = %rsi # clobbers INP
c = %ecx
d = %r8d
e = %edx
-TBL = %rbp
+TBL = %r12
a = %eax
b = %ebx
@@ -356,13 +356,13 @@ a = TMP_
ENTRY(sha256_transform_ssse3)
.align 32
pushq %rbx
- pushq %rbp
+ pushq %r12
pushq %r13
pushq %r14
pushq %r15
- pushq %r12
+ pushq %rbp
+ mov %rsp, %rbp
- mov %rsp, %r12
subq $STACK_SIZE, %rsp
and $~15, %rsp
@@ -462,13 +462,12 @@ loop2:
done_hash:
- mov %r12, %rsp
-
- popq %r12
+ mov %rbp, %rsp
+ popq %rbp
popq %r15
popq %r14
popq %r13
- popq %rbp
+ popq %r12
popq %rbx
ret
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 7f5f6c6ec72e..b16d56005162 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -69,8 +69,9 @@
XFER = YTMP0
BYTE_FLIP_MASK = %ymm9
-# 1st arg
-CTX = %rdi
+# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
+CTX1 = %rdi
+CTX2 = %r12
# 2nd arg
INP = %rsi
# 3rd arg
@@ -81,7 +82,7 @@
d = %r8
e = %rdx
y3 = %rsi
-TBL = %rbp
+TBL = %rdi # clobbers CTX1
a = %rax
b = %rbx
@@ -91,26 +92,26 @@
g = %r10
h = %r11
old_h = %r11
-T1 = %r12
+T1 = %r12 # clobbers CTX2
y0 = %r13
y1 = %r14
y2 = %r15
-y4 = %r12
-
# Local variables (stack frame)
XFER_SIZE = 4*8
SRND_SIZE = 1*8
INP_SIZE = 1*8
INPEND_SIZE = 1*8
+CTX_SIZE = 1*8
RSPSAVE_SIZE = 1*8
-GPRSAVE_SIZE = 6*8
+GPRSAVE_SIZE = 5*8
frame_XFER = 0
frame_SRND = frame_XFER + XFER_SIZE
frame_INP = frame_SRND + SRND_SIZE
frame_INPEND = frame_INP + INP_SIZE
-frame_RSPSAVE = frame_INPEND + INPEND_SIZE
+frame_CTX = frame_INPEND + INPEND_SIZE
+frame_RSPSAVE = frame_CTX + CTX_SIZE
frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
frame_size = frame_GPRSAVE + GPRSAVE_SIZE
@@ -576,12 +577,11 @@ ENTRY(sha512_transform_rorx)
mov %rax, frame_RSPSAVE(%rsp)
# Save GPRs
- mov %rbp, frame_GPRSAVE(%rsp)
- mov %rbx, 8*1+frame_GPRSAVE(%rsp)
- mov %r12, 8*2+frame_GPRSAVE(%rsp)
- mov %r13, 8*3+frame_GPRSAVE(%rsp)
- mov %r14, 8*4+frame_GPRSAVE(%rsp)
- mov %r15, 8*5+frame_GPRSAVE(%rsp)
+ mov %rbx, 8*0+frame_GPRSAVE(%rsp)
+ mov %r12, 8*1+frame_GPRSAVE(%rsp)
+ mov %r13, 8*2+frame_GPRSAVE(%rsp)
+ mov %r14, 8*3+frame_GPRSAVE(%rsp)
+ mov %r15, 8*4+frame_GPRSAVE(%rsp)
shl $7, NUM_BLKS # convert to bytes
jz done_hash
@@ -589,14 +589,17 @@
mov NUM_BLKS, frame_INPEND(%rsp)
## load initial digest
- mov 8*0(CTX),a
- mov 8*1(CTX),b
- mov 8*2(CTX),c
- mov 8*3(CTX),d
- mov 8*4(CTX),e
- mov 8*5(CTX),f
- mov 8*6(CTX),g
- mov 8*7(CTX),h
+ mov 8*0(CTX1), a
+ mov 8*1(CTX1), b
+ mov 8*2(CTX1), c
+ mov 8*3(CTX1), d
+ mov 8*4(CTX1), e
+ mov 8*5(CTX1), f
+ mov 8*6(CTX1), g
+ mov 8*7(CTX1), h
+
+ # save %rdi (CTX) before it gets clobbered
+ mov %rdi, frame_CTX(%rsp)
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
@@ -652,14 +655,15 @@ loop2:
subq $1, frame_SRND(%rsp)
jne loop2
- addm 8*0(CTX),a
- addm 8*1(CTX),b
- addm 8*2(CTX),c
- addm 8*3(CTX),d
- addm 8*4(CTX),e
- addm 8*5(CTX),f
- addm 8*6(CTX),g
- addm 8*7(CTX),h
+ mov frame_CTX(%rsp), CTX2
+ addm 8*0(CTX2), a
+ addm 8*1(CTX2), b
+ addm 8*2(CTX2), c
+ addm 8*3(CTX2), d
+ addm 8*4(CTX2), e
+ addm 8*5(CTX2), f
+ addm 8*6(CTX2), g
+ addm 8*7(CTX2), h
mov frame_INP(%rsp), INP
add $128, INP
@@ -669,12 +673,11 @@ done_hash:
# Restore GPRs
- mov frame_GPRSAVE(%rsp) ,%rbp
- mov 8*1+frame_GPRSAVE(%rsp) ,%rbx
- mov 8*2+frame_GPRSAVE(%rsp) ,%r12
- mov 8*3+frame_GPRSAVE(%rsp) ,%r13
- mov 8*4+frame_GPRSAVE(%rsp) ,%r14
- mov 8*5+frame_GPRSAVE(%rsp) ,%r15
+ mov 8*0+frame_GPRSAVE(%rsp), %rbx
+ mov 8*1+frame_GPRSAVE(%rsp), %r12
+ mov 8*2+frame_GPRSAVE(%rsp), %r13
+ mov 8*3+frame_GPRSAVE(%rsp), %r14
+ mov 8*4+frame_GPRSAVE(%rsp), %r15
# Restore Stack Pointer
mov frame_RSPSAVE(%rsp), %rsp
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index b3f49d286348..73b471da3622 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -76,8 +76,8 @@
#define RT %xmm14
#define RR %xmm15
-#define RID1 %rbp
-#define RID1d %ebp
+#define RID1 %r13
+#define RID1d %r13d
#define RID2 %rsi
#define RID2d %esi
@@ -259,7 +259,7 @@ __twofish_enc_blk8:
vmovdqu w(CTX), RK1;
- pushq %rbp;
+ pushq %r13;
pushq %rbx;
pushq %rcx;
@@ -282,7 +282,7 @@ __twofish_enc_blk8:
popq %rcx;
popq %rbx;
- popq %rbp;
+ popq %r13;
outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
@@ -301,7 +301,7 @@ __twofish_dec_blk8:
vmovdqu (w+4*4)(CTX), RK1;
- pushq %rbp;
+ pushq %r13;
pushq %rbx;
inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
@@ -322,7 +322,7 @@ __twofish_dec_blk8:
vmovdqu (w)(CTX), RK1;
popq %rbx;
- popq %rbp;
+ popq %r13;
outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 1b020381ab38..c096624137ae 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -218,10 +218,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \
output, input...) \
{ \
- register void *__sp asm(_ASM_SP); \
asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\
"call %P[new2]", feature2) \
- : output, "+r" (__sp) \
+ : output, ASM_CALL_CONSTRAINT \
: [old] "i" (oldfunc), [new1] "i" (newfunc1), \
[new2] "i" (newfunc2), ## input); \
}
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 676ee5807d86..c1eadbaf1115 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -132,4 +132,15 @@
/* For C file, we already have NOKPROBE_SYMBOL macro */
#endif
+#ifndef __ASSEMBLY__
+/*
+ * This output constraint should be used for any inline asm which has a "call"
+ * instruction. Otherwise the asm may be inserted before the frame pointer
+ * gets set up by the containing function. If you forget to do this, objtool
+ * may print a "call without frame pointer save/setup" warning.
+ */
+register unsigned int __asm_call_sp asm("esp");
+#define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp)
+#endif
+
#endif /* _ASM_X86_ASM_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 7ae318c340d9..c120b5db178a 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -286,6 +286,32 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return __pkru_allows_pkey(vma_pkey(vma), write);
}
+/*
+ * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID
+ * bits. This serves two purposes. It prevents a nasty situation in
+ * which PCID-unaware code saves CR3, loads some other value (with PCID
+ * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if
+ * the saved ASID was nonzero. It also means that any bugs involving
+ * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger
+ * deterministically.
+ */
+
+static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid)
+{
+ if (static_cpu_has(X86_FEATURE_PCID)) {
+ VM_WARN_ON_ONCE(asid > 4094);
+ return __sme_pa(mm->pgd) | (asid + 1);
+ } else {
+ VM_WARN_ON_ONCE(asid != 0);
+ return __sme_pa(mm->pgd);
+ }
+}
+
+static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
+{
+ VM_WARN_ON_ONCE(asid > 4094);
+ return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH;
+}
/*
* This can be used from process context to figure out what the value of
@@ -296,10 +322,8 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
*/
static inline unsigned long __get_current_cr3_fast(void)
{
- unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
-
- if (static_cpu_has(X86_FEATURE_PCID))
- cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm),
+ this_cpu_read(cpu_tlbstate.loaded_mm_asid));
/* For now, be very restrictive about when this can be called. */
VM_WARN_ON(in_nmi() || preemptible());
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 63cc96f064dc..738503e1f80c 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -179,7 +179,6 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
u64 input_address = input ? virt_to_phys(input) : 0;
u64 output_address = output ? virt_to_phys(output) : 0;
u64 hv_status;
- register void *__sp asm(_ASM_SP);
#ifdef CONFIG_X86_64
if (!hv_hypercall_pg)
@@ -187,7 +186,7 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
__asm__ __volatile__("mov %4, %%r8\n"
"call *%5"
- : "=a" (hv_status), "+r" (__sp),
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
"+c" (control), "+d" (input_address)
: "r" (output_address), "m" (hv_hypercall_pg)
: "cc", "memory", "r8", "r9", "r10", "r11");
@@ -202,7 +201,7 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
__asm__ __volatile__("call *%7"
: "=A" (hv_status),
- "+c" (input_address_lo), "+r" (__sp)
+ "+c" (input_address_lo), ASM_CALL_CONSTRAINT
: "A" (control),
"b" (input_address_hi),
"D"(output_address_hi), "S"(output_address_lo),
@@ -224,12 +223,11 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
{
u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
- register void *__sp asm(_ASM_SP);
#ifdef CONFIG_X86_64
{
__asm__ __volatile__("call *%4"
- : "=a" (hv_status), "+r" (__sp),
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
"+c" (control), "+d" (input1)
: "m" (hv_hypercall_pg)
: "cc", "r8", "r9", "r10", "r11");
@@ -242,7 +240,7 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
__asm__ __volatile__ ("call *%5"
: "=A"(hv_status),
"+c"(input1_lo),
- "+r"(__sp)
+ ASM_CALL_CONSTRAINT
: "A" (control),
"b" (input1_hi),
"m" (hv_hypercall_pg)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 42873edd9f9d..280d94c36dad 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -459,8 +459,8 @@ int paravirt_disable_iospace(void);
*/
#ifdef CONFIG_X86_32
#define PVOP_VCALL_ARGS \
- unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx; \
- register void *__sp asm("esp")
+ unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx;
+
#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x))
@@ -480,8 +480,8 @@ int paravirt_disable_iospace(void);
/* [re]ax isn't an arg, but the return val */
#define PVOP_VCALL_ARGS \
unsigned long __edi = __edi, __esi = __esi, \
- __edx = __edx, __ecx = __ecx, __eax = __eax; \
- register void *__sp asm("rsp")
+ __edx = __edx, __ecx = __ecx, __eax = __eax;
+
#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
@@ -532,7 +532,7 @@ int paravirt_disable_iospace(void);
asm volatile(pre \
paravirt_alt(PARAVIRT_CALL) \
post \
- : call_clbr, "+r" (__sp) \
+ : call_clbr, ASM_CALL_CONSTRAINT \
: paravirt_type(op), \
paravirt_clobber(clbr), \
##__VA_ARGS__ \
@@ -542,7 +542,7 @@ int paravirt_disable_iospace(void);
asm volatile(pre \
paravirt_alt(PARAVIRT_CALL) \
post \
- : call_clbr, "+r" (__sp) \
+ : call_clbr, ASM_CALL_CONSTRAINT \
: paravirt_type(op), \
paravirt_clobber(clbr), \
##__VA_ARGS__ \
@@ -569,7 +569,7 @@ int paravirt_disable_iospace(void);
asm volatile(pre \
paravirt_alt(PARAVIRT_CALL) \
post \
- : call_clbr, "+r" (__sp) \
+ : call_clbr, ASM_CALL_CONSTRAINT \
: paravirt_type(op), \
paravirt_clobber(clbr), \
##__VA_ARGS__ \
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index ec1f3c651150..4f44505dbf87 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -100,19 +100,14 @@ static __always_inline bool should_resched(int preempt_offset)
#ifdef CONFIG_PREEMPT
extern asmlinkage void ___preempt_schedule(void);
-# define __preempt_schedule() \
-({ \
- register void *__sp asm(_ASM_SP); \
- asm volatile ("call ___preempt_schedule" : "+r"(__sp)); \
-})
+# define __preempt_schedule() \
+ asm volatile ("call ___preempt_schedule" : ASM_CALL_CONSTRAINT)
extern asmlinkage void preempt_schedule(void);
extern asmlinkage void ___preempt_schedule_notrace(void);
-# define __preempt_schedule_notrace() \
-({ \
- register void *__sp asm(_ASM_SP); \
- asm volatile ("call ___preempt_schedule_notrace" : "+r"(__sp)); \
-})
+# define __preempt_schedule_notrace() \
+ asm volatile ("call ___preempt_schedule_notrace" : ASM_CALL_CONSTRAINT)
+
extern asmlinkage void preempt_schedule_notrace(void);
#endif
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3fa26a61eabc..b390ff76e58f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -677,8 +677,6 @@ static inline void sync_core(void)
* Like all of Linux's memory ordering operations, this is a
* compiler barrier as well.
*/
- register void *__sp asm(_ASM_SP);
-
#ifdef CONFIG_X86_32
asm volatile (
"pushfl\n\t"
"pushl %%cs\n\t"
"pushl $1f\n\t"
"iret\n\t"
"1:"
- : "+r" (__sp) : : "memory");
+ : ASM_CALL_CONSTRAINT : : "memory");
#else
unsigned int tmp;
@@ -703,7 +701,7 @@ static inline void sync_core(void)
"iretq\n\t"
UNWIND_HINT_RESTORE
"1:"
- : "=&r" (tmp), "+r" (__sp) : : "cc", "memory");
+ : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory");
#endif
}
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index a34e0d4b957d..7116b7931c7b 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -103,7 +103,6 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem)
({ \
long tmp; \
struct rw_semaphore* ret; \
- register void *__sp asm(_ASM_SP); \
\
asm volatile("# beginning down_write\n\t" \
LOCK_PREFIX " xadd %1,(%4)\n\t" \
@@ -114,7 +113,8 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem)
" call " slow_path "\n" \
"1:\n" \
"# ending down_write" \
- : "+m" (sem->count), "=d" (tmp), "=a" (ret), "+r" (__sp) \
+ : "+m" (sem->count), "=d" (tmp), \
+ "=a" (ret), ASM_CALL_CONSTRAINT \
: "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \
: "memory", "cc"); \
ret; \
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 184eb9894dae..78e8fcc87d4c 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -166,11 +166,11 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
({ \
int __ret_gu; \
register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \
- register void *__sp asm(_ASM_SP); \
__chk_user_ptr(ptr); \
might_fault(); \
asm volatile("call __get_user_%P4" \
- : "=a" (__ret_gu), "=r" (__val_gu), "+r" (__sp) \
+ : "=a" (__ret_gu), "=r" (__val_gu), \
+ ASM_CALL_CONSTRAINT \
: "0" (ptr), "i" (sizeof(*(ptr)))); \
(x) = (__force __typeof__(*(ptr))) __val_gu; \
__builtin_expect(__ret_gu, 0); \
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 9606688caa4b..128a1a0b1450 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -113,10 +113,9 @@ extern struct { char _entry[32]; } hypercall_page[];
register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \
register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \
register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \
- register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; \
- register void *__sp asm(_ASM_SP);
+ register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5;
-#define __HYPERCALL_0PARAM "=r" (__res), "+r" (__sp)
+#define __HYPERCALL_0PARAM "=r" (__res), ASM_CALL_CONSTRAINT
#define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1)
#define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2)
#define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 9862e2cd6d93..d58184b7cd44 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -763,6 +763,16 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
}
}
+static void init_amd_zn(struct cpuinfo_x86 *c)
+{
+ /*
+ * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects
+ * all up to and including B1.
+ */
+ if (c->x86_model <= 1 && c->x86_mask <= 1)
+ set_cpu_cap(c, X86_FEATURE_CPB);
+}
+
static void init_amd(struct cpuinfo_x86 *c)
{
early_init_amd(c);
@@ -791,6 +801,7 @@ static void init_amd(struct cpuinfo_x86 *c)
case 0x10: init_amd_gh(c); break;
case 0x12: init_amd_ln(c); break;
case 0x15: init_amd_bd(c); break;
+ case 0x17: init_amd_zn(c); break;
}
/* Enable workaround for FXSAVE leak */
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index db684880d74a..0af86d9242da 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -21,14 +21,6 @@
void __init check_bugs(void)
{
-#ifdef CONFIG_X86_32
- /*
- * Regardless of whether PCID is enumerated, the SDM says
- * that it can't be enabled in 32-bit mode.
- */
- setup_clear_cpu_cap(X86_FEATURE_PCID);
-#endif
-
identify_boot_cpu();
if (!IS_ENABLED(CONFIG_SMP)) {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 775f10100d7f..c9176bae7fd8 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -904,6 +904,14 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
fpu__init_system(c);
+
+#ifdef CONFIG_X86_32
+ /*
+ * Regardless of whether PCID is enumerated, the SDM says
+ * that it can't be enabled in 32-bit mode.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+#endif
}
void __init early_cpu_init(void)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 0854ff169274..ad59edd84de7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -232,12 +232,6 @@ static void notrace start_secondary(void *unused)
*/
if (boot_cpu_has(X86_FEATURE_PCID))
__write_cr4(__read_cr4() | X86_CR4_PCIDE);
- cpu_init();
- x86_cpuinit.early_percpu_clock_init();
- preempt_disable();
- smp_callin();
-
- enable_start_cpu0 = 0;
#ifdef CONFIG_X86_32
/* switch away from the initial page table */
@@ -245,6 +239,13 @@ static void notrace start_secondary(void *unused)
__flush_tlb_all();
#endif
+ cpu_init();
+ x86_cpuinit.early_percpu_clock_init();
+ preempt_disable();
+ smp_callin();
+
+ enable_start_cpu0 = 0;
+
/* otherwise gcc will move up smp_processor_id before the cpu_init */
barrier();
/*
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 16bf6655aa85..a36254cbf776 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4102,10 +4102,12 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
if (efer & EFER_LMA) {
u64 maxphyaddr;
- u32 eax = 0x80000008;
+ u32 eax, ebx, ecx, edx;
- if (ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL,
- NULL, false))
+ eax = 0x80000008;
+ ecx = 0;
+ if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx,
+ &edx, false))
maxphyaddr = eax & 0xff;
else
maxphyaddr = 36;
@@ -5296,7 +5298,6 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
{
- register void *__sp asm(_ASM_SP);
ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
if (!(ctxt->d & ByteOp))
@@ -5304,7 +5305,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
: "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
- [fastop]"+S"(fop), "+r"(__sp)
+ [fastop]"+S"(fop), ASM_CALL_CONSTRAINT
: "c"(ctxt->src2.val));
ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 06c0c6d0541e..6970249c09fc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5077,21 +5077,30 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
if (vcpu->mode == IN_GUEST_MODE) {
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
/*
- * Currently, we don't support urgent interrupt,
- * all interrupts are recognized as non-urgent
- * interrupt, so we cannot post interrupts when
- * 'SN' is set.
+ * The vector of interrupt to be delivered to vcpu had
+ * been set in PIR before this function.
+ *
+ * Following cases will be reached in this block, and
+ * we always send a notification event in all cases as
+ * explained below.
+ *
+ * Case 1: vcpu keeps in non-root mode. Sending a
+ * notification event posts the interrupt to vcpu.
*
- * If the vcpu is in guest mode, it means it is
- * running instead of being scheduled out and
- * waiting in the run queue, and that's the only
- * case when 'SN' is set currently, warning if
- * 'SN' is set.
+ * Case 2: vcpu exits to root mode and is still
+ * runnable. PIR will be synced to vIRR before the
+ * next vcpu entry. Sending a notification event in
+ * this case has no effect, as vcpu is not in root
+ * mode.
+ *
+ * Case 3: vcpu exits to root mode and is blocked.
+ * vcpu_block() has already synced PIR to vIRR and
+ * never blocks vcpu if vIRR is not cleared. Therefore,
+ * a blocked vcpu here does not wait for any requested
+ * interrupts in PIR, and sending a notification event
+ * which has no effect is safe here.
*/
- WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc));
apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
return true;
@@ -9036,7 +9045,6 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
{
u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
- register void *__sp asm(_ASM_SP);
if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
== (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
@@ -9065,7 +9073,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
#ifdef CONFIG_X86_64
[sp]"=&r"(tmp),
#endif
- "+r"(__sp)
+ ASM_CALL_CONSTRAINT
: [entry]"r"(entry),
[ss]"i"(__KERNEL_DS),
@@ -11911,12 +11919,8 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
if (set)
ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
- else {
- /* suppress notification event before unposting */
- pi_set_sn(vcpu_to_pi_desc(vcpu));
+ else
ret = irq_set_vcpu_affinity(host_irq, NULL);
- pi_clear_sn(vcpu_to_pi_desc(vcpu));
- }
if (ret < 0) {
printk(KERN_INFO "%s: failed to update PI IRTE\n",
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b836a7274e12..39567b5c33da 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -806,7 +806,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
if (is_vmalloc_addr((void *)address) &&
(((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
- register void *__sp asm("rsp");
unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
/*
* We're likely to be running with very little stack space
@@ -821,7 +820,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
asm volatile ("movq %[stack], %%rsp\n\t"
"call handle_stack_overflow\n\t"
"1: jmp 1b"
- : "+r" (__sp)
+ : ASM_CALL_CONSTRAINT
: "D" ("kernel stack overflow (page fault)"),
"S" (regs), "d" (address),
[stack] "rm" (stack));
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 1ab3821f9e26..93fe97cce581 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -126,8 +126,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* isn't free.
*/
#ifdef CONFIG_DEBUG_VM
- if (WARN_ON_ONCE(__read_cr3() !=
- (__sme_pa(real_prev->pgd) | prev_asid))) {
+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
/*
* If we were to BUG here, we'd be very likely to kill
* the system so hard that we don't see the call trace.
@@ -172,7 +171,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
*/
this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
next_tlb_gen);
- write_cr3(__sme_pa(next->pgd) | prev_asid);
+ write_cr3(build_cr3(next, prev_asid));
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
TLB_FLUSH_ALL);
}
@@ -216,12 +215,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- write_cr3(__sme_pa(next->pgd) | new_asid);
+ write_cr3(build_cr3(next, new_asid));
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
- write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH);
+ write_cr3(build_cr3_noflush(next, new_asid));
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
@@ -265,7 +264,7 @@ void initialize_tlbstate_and_flush(void)
!(cr4_read_shadow() & X86_CR4_PCIDE));
/* Force ASID 0 and force a TLB flush. */
- write_cr3(cr3 & ~CR3_PCID_MASK);
+ write_cr3(build_cr3(mm, 0));
/* Reinitialize tlbstate. */
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 509f560bd0c6..7330cb3b2283 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2220,7 +2220,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
* not the first page table in the page table pool.
* Iterate through the initial page tables to find the real page table base.
*/
-static phys_addr_t xen_find_pt_base(pmd_t *pmd)
+static phys_addr_t __init xen_find_pt_base(pmd_t *pmd)
{
phys_addr_t pt_base, paddr;
unsigned pmdidx;
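
The common thread in the inline-asm hunks above is the new ASM_CALL_CONSTRAINT output constraint introduced in the asm.h hunk. A minimal usage sketch, assuming a hypothetical out-of-line assembly routine my_asm_helper (the constraint itself is real; the helper and wrapper below are illustration only):

    /* Sketch only: my_asm_helper is hypothetical, not part of this patch. */
    #include <asm/asm.h>

    static inline void call_my_asm_helper(void)
    {
        /*
         * Listing the stack pointer as an in/out operand keeps the
         * compiler from scheduling the call before the containing
         * function sets up its frame pointer, which is what objtool's
         * "call without frame pointer save/setup" warning flags.
         */
        asm volatile("call my_asm_helper" : ASM_CALL_CONSTRAINT);
    }

This mirrors the __preempt_schedule() conversion in the preempt.h hunk; every inline asm containing a call gets the same treatment, replacing the old per-site "register void *__sp asm(_ASM_SP)" pattern.

The mmu_context.h hunk packs the ASID into CR3's PCID bits. A standalone arithmetic sketch of what build_cr3() and build_cr3_noflush() compute, assuming (as the hunk's comment states) that kernel ASIDs 0..4094 map to hardware PCIDs 1..4095 and that CR3 bit 63 is the no-flush bit; pgd_pa stands in for __sme_pa(mm->pgd):

    #include <stdint.h>

    #define CR3_NOFLUSH (1ULL << 63)  /* don't flush the TLB on this CR3 write */

    static uint64_t sketch_build_cr3(uint64_t pgd_pa, uint16_t asid)
    {
        return pgd_pa | (uint64_t)(asid + 1);  /* ASID+1 becomes the PCID */
    }

    static uint64_t sketch_build_cr3_noflush(uint64_t pgd_pa, uint16_t asid)
    {
        return sketch_build_cr3(pgd_pa, asid) | CR3_NOFLUSH;
    }

Reserving PCID 0 this way means PCID-unaware code that saves and restores a CR3 with PCID == 0 can no longer corrupt the TLB entries of a live ASID, as the comment in the hunk explains.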