From b2d3d79780fa8e579aecfab6bb99c6693f3ef9ee Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 24 Apr 2024 11:14:58 -0700 Subject: crypto: x86/twofish - Switch to new Intel CPU model defines New CPU #defines encode vendor and family as well as model. Signed-off-by: Tony Luck Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish_glue_3way.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 90454cf18e0d..1a1ecfa7f72a 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -5,6 +5,7 @@ * Copyright (c) 2011 Jussi Kivilinna */ +#include #include #include #include @@ -107,10 +108,10 @@ static bool is_blacklisted_cpu(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return false; - if (boot_cpu_data.x86 == 0x06 && - (boot_cpu_data.x86_model == 0x1c || - boot_cpu_data.x86_model == 0x26 || - boot_cpu_data.x86_model == 0x36)) { + switch (boot_cpu_data.x86_vfm) { + case INTEL_ATOM_BONNELL: + case INTEL_ATOM_BONNELL_MID: + case INTEL_ATOM_SALTWELL: /* * On Atom, twofish-3way is slower than original assembler * implementation. Twofish-3way trades off some performance in -- cgit From adc5167be5553ff2f68cd6a36de0106f1f73c1ea Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 24 Apr 2024 11:14:58 -0700 Subject: crypto: x86/poly1305 - Switch to new Intel CPU model defines New CPU #defines encode vendor and family as well as model. Signed-off-by: Tony Luck Signed-off-by: Herbert Xu --- arch/x86/crypto/poly1305_glue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 1dfb8af48a3c..08ff4b489f7e 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include asmlinkage void poly1305_init_x86_64(void *ctx, @@ -269,7 +269,7 @@ static int __init poly1305_simd_mod_init(void) boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ - boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X) + boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X) static_branch_enable(&poly1305_use_avx512); return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0; } -- cgit From 2dcdf3be65e968b4a67209a825a69b098e73cb25 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 11 May 2024 15:50:17 +0100 Subject: crypto: axis - Remove unused struct 'dbgfs_u32' 'dbgfs_u32' appears unused. Remove it. (pdma_stat_descr is also unused, but I'm assuming it's some useful layout description of firmware/hardware so best left in) Signed-off-by: Dr. 
David Alan Gilbert Acked-by: Jesper Nilsson Signed-off-by: Herbert Xu --- drivers/crypto/axis/artpec6_crypto.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/crypto/axis/artpec6_crypto.c b/drivers/crypto/axis/artpec6_crypto.c index dbc1d483f2af..75440ea6206e 100644 --- a/drivers/crypto/axis/artpec6_crypto.c +++ b/drivers/crypto/axis/artpec6_crypto.c @@ -2811,13 +2811,6 @@ static struct aead_alg aead_algos[] = { #ifdef CONFIG_DEBUG_FS -struct dbgfs_u32 { - char *name; - mode_t mode; - u32 *flag; - char *desc; -}; - static struct dentry *dbgfs_root; static void artpec6_crypto_init_debugfs(void) -- cgit From e793f6c4dabe56cb8b93db26bbfcfba2a65f7215 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 11 May 2024 15:52:00 +0100 Subject: crypto: ccree - Remove unused struct 'tdes_keys' 'tdes_keys' appears unused. Remove it. Signed-off-by: Dr. David Alan Gilbert Acked-by: Gilad Ben-Yossef Signed-off-by: Herbert Xu --- drivers/crypto/ccree/cc_cipher.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/crypto/ccree/cc_cipher.c b/drivers/crypto/ccree/cc_cipher.c index cd66a580e8b6..3fb667a17bbb 100644 --- a/drivers/crypto/ccree/cc_cipher.c +++ b/drivers/crypto/ccree/cc_cipher.c @@ -261,12 +261,6 @@ static void cc_cipher_exit(struct crypto_tfm *tfm) kfree_sensitive(ctx_p->user.key); } -struct tdes_keys { - u8 key1[DES_KEY_SIZE]; - u8 key2[DES_KEY_SIZE]; - u8 key3[DES_KEY_SIZE]; -}; - static enum cc_hw_crypto_key cc_slot_to_hw_key(u8 slot_num) { switch (slot_num) { -- cgit From eaa857782fdbba36544c74006d8037f4d101b90a Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 11 May 2024 15:54:26 +0100 Subject: crypto: hifn_795x - Remove unused hifn_*_command structs Remove 'hifn_mac_command' and 'hifn_comp_command' which are unused. They're the same structure as 'hifn_crypt_command' which is used. (I was tempted to remove hifn_base_result hifn_comp_result hifn_mac_result and hifn_crypt_result which are also unused, but they vary, and perhaps they're telling someone in the future what to look at.) Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Herbert Xu --- drivers/crypto/hifn_795x.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/drivers/crypto/hifn_795x.c b/drivers/crypto/hifn_795x.c index b4a4ec35bce0..925991526745 100644 --- a/drivers/crypto/hifn_795x.c +++ b/drivers/crypto/hifn_795x.c @@ -495,16 +495,6 @@ struct hifn_crypt_command { #define HIFN_CRYPT_CMD_SRCLEN_M 0xc000 #define HIFN_CRYPT_CMD_SRCLEN_S 14 -/* - * Structure to help build up the command data structure. - */ -struct hifn_mac_command { - volatile __le16 masks; - volatile __le16 header_skip; - volatile __le16 source_count; - volatile __le16 reserved; -}; - #define HIFN_MAC_CMD_ALG_MASK 0x0001 #define HIFN_MAC_CMD_ALG_SHA1 0x0000 #define HIFN_MAC_CMD_ALG_MD5 0x0001 @@ -526,13 +516,6 @@ struct hifn_mac_command { #define HIFN_MAC_CMD_POS_IPSEC 0x0200 #define HIFN_MAC_CMD_NEW_KEY 0x0800 -struct hifn_comp_command { - volatile __le16 masks; - volatile __le16 header_skip; - volatile __le16 source_count; - volatile __le16 reserved; -}; - #define HIFN_COMP_CMD_SRCLEN_M 0xc000 #define HIFN_COMP_CMD_SRCLEN_S 14 #define HIFN_COMP_CMD_ONE 0x0100 /* must be one */ -- cgit From 198faf0301ce1a51b9c859510ab884020dcb6f19 Mon Sep 17 00:00:00 2001 From: "Dr. 
David Alan Gilbert" Date: Sat, 11 May 2024 15:56:20 +0100 Subject: crypto: n2 - Remove unused struct 'n2_skcipher_request_context' 'n2_skcipher_request_context' was added in commit 23a6564a6b51 ("crypto: niagara2 - switch to skcipher API") but never used. Remove it. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Herbert Xu --- drivers/crypto/n2_core.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c index 59d472cb11e7..251e088a53df 100644 --- a/drivers/crypto/n2_core.c +++ b/drivers/crypto/n2_core.c @@ -720,10 +720,6 @@ static inline struct n2_skcipher_alg *n2_skcipher_alg(struct crypto_skcipher *tf return container_of(alg, struct n2_skcipher_alg, skcipher); } -struct n2_skcipher_request_context { - struct skcipher_walk walk; -}; - static int n2_aes_setkey(struct crypto_skcipher *skcipher, const u8 *key, unsigned int keylen) { -- cgit From 6684f97981c528965d7458dd4f89cfbc8fa980b2 Mon Sep 17 00:00:00 2001 From: Deming Wang Date: Mon, 13 May 2024 02:07:42 -0400 Subject: crypto: qat - Fix typo The mapings should be replaced by mappings. Signed-off-by: Deming Wang Signed-off-by: Herbert Xu --- drivers/crypto/intel/qat/qat_common/adf_dev_mgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/intel/qat/qat_common/adf_dev_mgr.c b/drivers/crypto/intel/qat/qat_common/adf_dev_mgr.c index f07b748795f7..96ddd1c419c4 100644 --- a/drivers/crypto/intel/qat/qat_common/adf_dev_mgr.c +++ b/drivers/crypto/intel/qat/qat_common/adf_dev_mgr.c @@ -59,7 +59,7 @@ static int adf_get_vf_real_id(u32 fake) } /** - * adf_clean_vf_map() - Cleans VF id mapings + * adf_clean_vf_map() - Cleans VF id mappings * @vf: flag indicating whether mappings is cleaned * for vfs only or for vfs and pfs * -- cgit From bbb66f218d9e12c5bd5a6208e96e5117449c7c88 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Thu, 16 May 2024 11:19:55 -0400 Subject: crypto: ppc/curve25519 - Low-level primitives for ppc64le Use the perl output of x25519-ppc64.pl from CRYPTOGAMs (see https://github.com/dot-asm/cryptogams/) and added four supporting functions, x25519_fe51_sqr_times, x25519_fe51_frombytes, x25519_fe51_tobytes and x25519_cswap. Signed-off-by: Danny Tsen Signed-off-by: Herbert Xu --- arch/powerpc/crypto/curve25519-ppc64le_asm.S | 671 +++++++++++++++++++++++++++ 1 file changed, 671 insertions(+) create mode 100644 arch/powerpc/crypto/curve25519-ppc64le_asm.S diff --git a/arch/powerpc/crypto/curve25519-ppc64le_asm.S b/arch/powerpc/crypto/curve25519-ppc64le_asm.S new file mode 100644 index 000000000000..06c1febe24b9 --- /dev/null +++ b/arch/powerpc/crypto/curve25519-ppc64le_asm.S @@ -0,0 +1,671 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +# +# This code is taken from CRYPTOGAMs[1] and is included here using the option +# in the license to distribute the code under the GPL. Therefore this program +# is free software; you can redistribute it and/or modify it under the terms of +# the GNU General Public License version 2 as published by the Free Software +# Foundation. +# +# [1] https://github.com/dot-asm/cryptogams/ + +# Copyright (c) 2006-2017, CRYPTOGAMS by +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain copyright notices, +# this list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# * Neither the name of the CRYPTOGAMS nor the names of its +# copyright holder and contributors may be used to endorse or +# promote products derived from this software without specific +# prior written permission. +# +# ALTERNATIVELY, provided that this notice is retained in full, this +# product may be distributed under the terms of the GNU General Public +# License (GPL), in which case the provisions of the GPL apply INSTEAD OF +# those given above. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see https://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# +# ==================================================================== +# Written and Modified by Danny Tsen +# - Added x25519_fe51_sqr_times, x25519_fe51_frombytes, x25519_fe51_tobytes +# and x25519_cswap +# +# Copyright 2024- IBM Corp. +# +# X25519 lower-level primitives for PPC64. 
+# + +#include + +.text + +.align 5 +SYM_FUNC_START(x25519_fe51_mul) + + stdu 1,-144(1) + std 21,56(1) + std 22,64(1) + std 23,72(1) + std 24,80(1) + std 25,88(1) + std 26,96(1) + std 27,104(1) + std 28,112(1) + std 29,120(1) + std 30,128(1) + std 31,136(1) + + ld 6,0(5) + ld 7,0(4) + ld 8,8(4) + ld 9,16(4) + ld 10,24(4) + ld 11,32(4) + + mulld 22,7,6 + mulhdu 23,7,6 + + mulld 24,8,6 + mulhdu 25,8,6 + + mulld 30,11,6 + mulhdu 31,11,6 + ld 4,8(5) + mulli 11,11,19 + + mulld 26,9,6 + mulhdu 27,9,6 + + mulld 28,10,6 + mulhdu 29,10,6 + mulld 12,11,4 + mulhdu 21,11,4 + addc 22,22,12 + adde 23,23,21 + + mulld 12,7,4 + mulhdu 21,7,4 + addc 24,24,12 + adde 25,25,21 + + mulld 12,10,4 + mulhdu 21,10,4 + ld 6,16(5) + mulli 10,10,19 + addc 30,30,12 + adde 31,31,21 + + mulld 12,8,4 + mulhdu 21,8,4 + addc 26,26,12 + adde 27,27,21 + + mulld 12,9,4 + mulhdu 21,9,4 + addc 28,28,12 + adde 29,29,21 + mulld 12,10,6 + mulhdu 21,10,6 + addc 22,22,12 + adde 23,23,21 + + mulld 12,11,6 + mulhdu 21,11,6 + addc 24,24,12 + adde 25,25,21 + + mulld 12,9,6 + mulhdu 21,9,6 + ld 4,24(5) + mulli 9,9,19 + addc 30,30,12 + adde 31,31,21 + + mulld 12,7,6 + mulhdu 21,7,6 + addc 26,26,12 + adde 27,27,21 + + mulld 12,8,6 + mulhdu 21,8,6 + addc 28,28,12 + adde 29,29,21 + mulld 12,9,4 + mulhdu 21,9,4 + addc 22,22,12 + adde 23,23,21 + + mulld 12,10,4 + mulhdu 21,10,4 + addc 24,24,12 + adde 25,25,21 + + mulld 12,8,4 + mulhdu 21,8,4 + ld 6,32(5) + mulli 8,8,19 + addc 30,30,12 + adde 31,31,21 + + mulld 12,11,4 + mulhdu 21,11,4 + addc 26,26,12 + adde 27,27,21 + + mulld 12,7,4 + mulhdu 21,7,4 + addc 28,28,12 + adde 29,29,21 + mulld 12,8,6 + mulhdu 21,8,6 + addc 22,22,12 + adde 23,23,21 + + mulld 12,9,6 + mulhdu 21,9,6 + addc 24,24,12 + adde 25,25,21 + + mulld 12,10,6 + mulhdu 21,10,6 + addc 26,26,12 + adde 27,27,21 + + mulld 12,11,6 + mulhdu 21,11,6 + addc 28,28,12 + adde 29,29,21 + + mulld 12,7,6 + mulhdu 21,7,6 + addc 30,30,12 + adde 31,31,21 + +.Lfe51_reduce: + li 0,-1 + srdi 0,0,13 + + srdi 12,26,51 + and 9,26,0 + insrdi 12,27,51,0 + srdi 21,22,51 + and 7,22,0 + insrdi 21,23,51,0 + addc 28,28,12 + addze 29,29 + addc 24,24,21 + addze 25,25 + + srdi 12,28,51 + and 10,28,0 + insrdi 12,29,51,0 + srdi 21,24,51 + and 8,24,0 + insrdi 21,25,51,0 + addc 30,30,12 + addze 31,31 + add 9,9,21 + + srdi 12,30,51 + and 11,30,0 + insrdi 12,31,51,0 + mulli 12,12,19 + + add 7,7,12 + + srdi 21,9,51 + and 9,9,0 + add 10,10,21 + + srdi 12,7,51 + and 7,7,0 + add 8,8,12 + + std 9,16(3) + std 10,24(3) + std 11,32(3) + std 7,0(3) + std 8,8(3) + + ld 21,56(1) + ld 22,64(1) + ld 23,72(1) + ld 24,80(1) + ld 25,88(1) + ld 26,96(1) + ld 27,104(1) + ld 28,112(1) + ld 29,120(1) + ld 30,128(1) + ld 31,136(1) + addi 1,1,144 + blr +SYM_FUNC_END(x25519_fe51_mul) + +.align 5 +SYM_FUNC_START(x25519_fe51_sqr) + + stdu 1,-144(1) + std 21,56(1) + std 22,64(1) + std 23,72(1) + std 24,80(1) + std 25,88(1) + std 26,96(1) + std 27,104(1) + std 28,112(1) + std 29,120(1) + std 30,128(1) + std 31,136(1) + + ld 7,0(4) + ld 8,8(4) + ld 9,16(4) + ld 10,24(4) + ld 11,32(4) + + add 6,7,7 + mulli 21,11,19 + + mulld 22,7,7 + mulhdu 23,7,7 + mulld 24,8,6 + mulhdu 25,8,6 + mulld 26,9,6 + mulhdu 27,9,6 + mulld 28,10,6 + mulhdu 29,10,6 + mulld 30,11,6 + mulhdu 31,11,6 + add 6,8,8 + mulld 12,11,21 + mulhdu 11,11,21 + addc 28,28,12 + adde 29,29,11 + + mulli 5,10,19 + + mulld 12,8,8 + mulhdu 11,8,8 + addc 26,26,12 + adde 27,27,11 + mulld 12,9,6 + mulhdu 11,9,6 + addc 28,28,12 + adde 29,29,11 + mulld 12,10,6 + mulhdu 11,10,6 + addc 30,30,12 + adde 31,31,11 + mulld 12,21,6 + mulhdu 11,21,6 + add 
6,10,10 + addc 22,22,12 + adde 23,23,11 + mulld 12,10,5 + mulhdu 10,10,5 + addc 24,24,12 + adde 25,25,10 + mulld 12,6,21 + mulhdu 10,6,21 + add 6,9,9 + addc 26,26,12 + adde 27,27,10 + + mulld 12,9,9 + mulhdu 10,9,9 + addc 30,30,12 + adde 31,31,10 + mulld 12,5,6 + mulhdu 10,5,6 + addc 22,22,12 + adde 23,23,10 + mulld 12,21,6 + mulhdu 10,21,6 + addc 24,24,12 + adde 25,25,10 + + b .Lfe51_reduce +SYM_FUNC_END(x25519_fe51_sqr) + +.align 5 +SYM_FUNC_START(x25519_fe51_mul121666) + + stdu 1,-144(1) + std 21,56(1) + std 22,64(1) + std 23,72(1) + std 24,80(1) + std 25,88(1) + std 26,96(1) + std 27,104(1) + std 28,112(1) + std 29,120(1) + std 30,128(1) + std 31,136(1) + + lis 6,1 + ori 6,6,56130 + ld 7,0(4) + ld 8,8(4) + ld 9,16(4) + ld 10,24(4) + ld 11,32(4) + + mulld 22,7,6 + mulhdu 23,7,6 + mulld 24,8,6 + mulhdu 25,8,6 + mulld 26,9,6 + mulhdu 27,9,6 + mulld 28,10,6 + mulhdu 29,10,6 + mulld 30,11,6 + mulhdu 31,11,6 + + b .Lfe51_reduce +SYM_FUNC_END(x25519_fe51_mul121666) + +.align 5 +SYM_FUNC_START(x25519_fe51_sqr_times) + + stdu 1,-144(1) + std 21,56(1) + std 22,64(1) + std 23,72(1) + std 24,80(1) + std 25,88(1) + std 26,96(1) + std 27,104(1) + std 28,112(1) + std 29,120(1) + std 30,128(1) + std 31,136(1) + + ld 7,0(4) + ld 8,8(4) + ld 9,16(4) + ld 10,24(4) + ld 11,32(4) + + mtctr 5 + +.Lsqr_times_loop: + add 6,7,7 + mulli 21,11,19 + + mulld 22,7,7 + mulhdu 23,7,7 + mulld 24,8,6 + mulhdu 25,8,6 + mulld 26,9,6 + mulhdu 27,9,6 + mulld 28,10,6 + mulhdu 29,10,6 + mulld 30,11,6 + mulhdu 31,11,6 + add 6,8,8 + mulld 12,11,21 + mulhdu 11,11,21 + addc 28,28,12 + adde 29,29,11 + + mulli 5,10,19 + + mulld 12,8,8 + mulhdu 11,8,8 + addc 26,26,12 + adde 27,27,11 + mulld 12,9,6 + mulhdu 11,9,6 + addc 28,28,12 + adde 29,29,11 + mulld 12,10,6 + mulhdu 11,10,6 + addc 30,30,12 + adde 31,31,11 + mulld 12,21,6 + mulhdu 11,21,6 + add 6,10,10 + addc 22,22,12 + adde 23,23,11 + mulld 12,10,5 + mulhdu 10,10,5 + addc 24,24,12 + adde 25,25,10 + mulld 12,6,21 + mulhdu 10,6,21 + add 6,9,9 + addc 26,26,12 + adde 27,27,10 + + mulld 12,9,9 + mulhdu 10,9,9 + addc 30,30,12 + adde 31,31,10 + mulld 12,5,6 + mulhdu 10,5,6 + addc 22,22,12 + adde 23,23,10 + mulld 12,21,6 + mulhdu 10,21,6 + addc 24,24,12 + adde 25,25,10 + + # fe51_reduce + li 0,-1 + srdi 0,0,13 + + srdi 12,26,51 + and 9,26,0 + insrdi 12,27,51,0 + srdi 21,22,51 + and 7,22,0 + insrdi 21,23,51,0 + addc 28,28,12 + addze 29,29 + addc 24,24,21 + addze 25,25 + + srdi 12,28,51 + and 10,28,0 + insrdi 12,29,51,0 + srdi 21,24,51 + and 8,24,0 + insrdi 21,25,51,0 + addc 30,30,12 + addze 31,31 + add 9,9,21 + + srdi 12,30,51 + and 11,30,0 + insrdi 12,31,51,0 + mulli 12,12,19 + + add 7,7,12 + + srdi 21,9,51 + and 9,9,0 + add 10,10,21 + + srdi 12,7,51 + and 7,7,0 + add 8,8,12 + + bdnz .Lsqr_times_loop + + std 9,16(3) + std 10,24(3) + std 11,32(3) + std 7,0(3) + std 8,8(3) + + ld 21,56(1) + ld 22,64(1) + ld 23,72(1) + ld 24,80(1) + ld 25,88(1) + ld 26,96(1) + ld 27,104(1) + ld 28,112(1) + ld 29,120(1) + ld 30,128(1) + ld 31,136(1) + addi 1,1,144 + blr +SYM_FUNC_END(x25519_fe51_sqr_times) + +.align 5 +SYM_FUNC_START(x25519_fe51_frombytes) + + li 12, -1 + srdi 12, 12, 13 # 0x7ffffffffffff + + ld 5, 0(4) + ld 6, 8(4) + ld 7, 16(4) + ld 8, 24(4) + + srdi 10, 5, 51 + and 5, 5, 12 # h0 + + sldi 11, 6, 13 + or 11, 10, 11 # h1t + srdi 10, 6, 38 + and 6, 11, 12 # h1 + + sldi 11, 7, 26 + or 10, 10, 11 # h2t + + srdi 11, 7, 25 + and 7, 10, 12 # h2 + sldi 10, 8, 39 + or 11, 11, 10 # h3t + + srdi 9, 8, 12 + and 8, 11, 12 # h3 + and 9, 9, 12 # h4 + + std 5, 0(3) + std 6, 8(3) + std 7, 16(3) + std 8, 
24(3) + std 9, 32(3) + + blr +SYM_FUNC_END(x25519_fe51_frombytes) + +.align 5 +SYM_FUNC_START(x25519_fe51_tobytes) + + ld 5, 0(4) + ld 6, 8(4) + ld 7, 16(4) + ld 8, 24(4) + ld 9, 32(4) + + li 12, -1 + srdi 12, 12, 13 # 0x7ffffffffffff + + # Full reducuction + addi 10, 5, 19 + srdi 10, 10, 51 + add 10, 10, 6 + srdi 10, 10, 51 + add 10, 10, 7 + srdi 10, 10, 51 + add 10, 10, 8 + srdi 10, 10, 51 + add 10, 10, 9 + srdi 10, 10, 51 + + mulli 10, 10, 19 + add 5, 5, 10 + srdi 11, 5, 51 + add 6, 6, 11 + srdi 11, 6, 51 + add 7, 7, 11 + srdi 11, 7, 51 + add 8, 8, 11 + srdi 11, 8, 51 + add 9, 9, 11 + + and 5, 5, 12 + and 6, 6, 12 + and 7, 7, 12 + and 8, 8, 12 + and 9, 9, 12 + + sldi 10, 6, 51 + or 5, 5, 10 # s0 + + srdi 11, 6, 13 + sldi 10, 7, 38 + or 6, 11, 10 # s1 + + srdi 11, 7, 26 + sldi 10, 8, 25 + or 7, 11, 10 # s2 + + srdi 11, 8, 39 + sldi 10, 9, 12 + or 8, 11, 10 # s4 + + std 5, 0(3) + std 6, 8(3) + std 7, 16(3) + std 8, 24(3) + + blr +SYM_FUNC_END(x25519_fe51_tobytes) + +.align 5 +SYM_FUNC_START(x25519_cswap) + + li 7, 5 + neg 6, 5 + mtctr 7 + +.Lswap_loop: + ld 8, 0(3) + ld 9, 0(4) + xor 10, 8, 9 + and 10, 10, 6 + xor 11, 8, 10 + xor 12, 9, 10 + std 11, 0(3) + addi 3, 3, 8 + std 12, 0(4) + addi 4, 4, 8 + bdnz .Lswap_loop + + blr +SYM_FUNC_END(x25519_cswap) -- cgit From a1bfed35d081112582f688947141763e75e090ee Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Thu, 16 May 2024 11:19:56 -0400 Subject: crypto: ppc/curve25519 - Core functions for ppc64le X25519 core functions to handle scalar multiplication for ppc64le. Signed-off-by: Danny Tsen Signed-off-by: Herbert Xu --- arch/powerpc/crypto/curve25519-ppc64le-core.c | 299 ++++++++++++++++++++++++++ 1 file changed, 299 insertions(+) create mode 100644 arch/powerpc/crypto/curve25519-ppc64le-core.c diff --git a/arch/powerpc/crypto/curve25519-ppc64le-core.c b/arch/powerpc/crypto/curve25519-ppc64le-core.c new file mode 100644 index 000000000000..4e3e44ea4484 --- /dev/null +++ b/arch/powerpc/crypto/curve25519-ppc64le-core.c @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2024- IBM Corp. + * + * X25519 scalar multiplication with 51 bits limbs for PPC64le. 
+ * Based on RFC7748 and AArch64 optimized implementation for X25519 + * - Algorithm 1 Scalar multiplication of a variable point + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +typedef uint64_t fe51[5]; + +asmlinkage void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g); +asmlinkage void x25519_fe51_sqr(fe51 h, const fe51 f); +asmlinkage void x25519_fe51_mul121666(fe51 h, fe51 f); +asmlinkage void x25519_fe51_sqr_times(fe51 h, const fe51 f, int n); +asmlinkage void x25519_fe51_frombytes(fe51 h, const uint8_t *s); +asmlinkage void x25519_fe51_tobytes(uint8_t *s, const fe51 h); +asmlinkage void x25519_cswap(fe51 p, fe51 q, unsigned int bit); + +#define fmul x25519_fe51_mul +#define fsqr x25519_fe51_sqr +#define fmul121666 x25519_fe51_mul121666 +#define fe51_tobytes x25519_fe51_tobytes + +static void fadd(fe51 h, const fe51 f, const fe51 g) +{ + h[0] = f[0] + g[0]; + h[1] = f[1] + g[1]; + h[2] = f[2] + g[2]; + h[3] = f[3] + g[3]; + h[4] = f[4] + g[4]; +} + +/* + * Prime = 2 ** 255 - 19, 255 bits + * (0x7fffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffed) + * + * Prime in 5 51-bit limbs + */ +static fe51 prime51 = { 0x7ffffffffffed, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff}; + +static void fsub(fe51 h, const fe51 f, const fe51 g) +{ + h[0] = (f[0] + ((prime51[0] * 2))) - g[0]; + h[1] = (f[1] + ((prime51[1] * 2))) - g[1]; + h[2] = (f[2] + ((prime51[2] * 2))) - g[2]; + h[3] = (f[3] + ((prime51[3] * 2))) - g[3]; + h[4] = (f[4] + ((prime51[4] * 2))) - g[4]; +} + +static void fe51_frombytes(fe51 h, const uint8_t *s) +{ + /* + * Make sure 64-bit aligned. + */ + unsigned char sbuf[32+8]; + unsigned char *sb = PTR_ALIGN((void *)sbuf, 8); + + memcpy(sb, s, 32); + x25519_fe51_frombytes(h, sb); +} + +static void finv(fe51 o, const fe51 i) +{ + fe51 a0, b, c, t00; + + fsqr(a0, i); + x25519_fe51_sqr_times(t00, a0, 2); + + fmul(b, t00, i); + fmul(a0, b, a0); + + fsqr(t00, a0); + + fmul(b, t00, b); + x25519_fe51_sqr_times(t00, b, 5); + + fmul(b, t00, b); + x25519_fe51_sqr_times(t00, b, 10); + + fmul(c, t00, b); + x25519_fe51_sqr_times(t00, c, 20); + + fmul(t00, t00, c); + x25519_fe51_sqr_times(t00, t00, 10); + + fmul(b, t00, b); + x25519_fe51_sqr_times(t00, b, 50); + + fmul(c, t00, b); + x25519_fe51_sqr_times(t00, c, 100); + + fmul(t00, t00, c); + x25519_fe51_sqr_times(t00, t00, 50); + + fmul(t00, t00, b); + x25519_fe51_sqr_times(t00, t00, 5); + + fmul(o, t00, a0); +} + +static void curve25519_fe51(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]) +{ + fe51 x1, x2, z2, x3, z3; + uint8_t s[32]; + unsigned int swap = 0; + int i; + + memcpy(s, scalar, 32); + s[0] &= 0xf8; + s[31] &= 0x7f; + s[31] |= 0x40; + fe51_frombytes(x1, point); + + z2[0] = z2[1] = z2[2] = z2[3] = z2[4] = 0; + x3[0] = x1[0]; + x3[1] = x1[1]; + x3[2] = x1[2]; + x3[3] = x1[3]; + x3[4] = x1[4]; + + x2[0] = z3[0] = 1; + x2[1] = z3[1] = 0; + x2[2] = z3[2] = 0; + x2[3] = z3[3] = 0; + x2[4] = z3[4] = 0; + + for (i = 254; i >= 0; --i) { + unsigned int k_t = 1 & (s[i / 8] >> (i & 7)); + fe51 a, b, c, d, e; + fe51 da, cb, aa, bb; + fe51 dacb_p, dacb_m; + + swap ^= k_t; + x25519_cswap(x2, x3, swap); + x25519_cswap(z2, z3, swap); + swap = k_t; + + fsub(b, x2, z2); // B = x_2 - z_2 + fadd(a, x2, z2); // A = x_2 + z_2 + fsub(d, x3, z3); // D = x_3 - z_3 + fadd(c, x3, z3); // C = x_3 + z_3 + + fsqr(bb, b); // BB = B^2 + fsqr(aa, a); // AA = A^2 + fmul(da, d, a); // DA = D * A + fmul(cb, c, b); // CB = C * B + + fsub(e, aa, bb); // 
E = AA - BB + fmul(x2, aa, bb); // x2 = AA * BB + fadd(dacb_p, da, cb); // DA + CB + fsub(dacb_m, da, cb); // DA - CB + + fmul121666(z3, e); // 121666 * E + fsqr(z2, dacb_m); // (DA - CB)^2 + fsqr(x3, dacb_p); // x3 = (DA + CB)^2 + fadd(b, bb, z3); // BB + 121666 * E + fmul(z3, x1, z2); // z3 = x1 * (DA - CB)^2 + fmul(z2, e, b); // z2 = e * (BB + (DA + CB)^2) + } + + finv(z2, z2); + fmul(x2, x2, z2); + fe51_tobytes(out, x2); +} + +void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]) +{ + curve25519_fe51(mypublic, secret, basepoint); +} +EXPORT_SYMBOL(curve25519_arch); + +void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) +{ + curve25519_fe51(pub, secret, curve25519_base_point); +} +EXPORT_SYMBOL(curve25519_base_arch); + +static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, + unsigned int len) +{ + u8 *secret = kpp_tfm_ctx(tfm); + + if (!len) + curve25519_generate_secret(secret); + else if (len == CURVE25519_KEY_SIZE && + crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) + memcpy(secret, buf, CURVE25519_KEY_SIZE); + else + return -EINVAL; + return 0; +} + +static int curve25519_generate_public_key(struct kpp_request *req) +{ + struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); + const u8 *secret = kpp_tfm_ctx(tfm); + u8 buf[CURVE25519_KEY_SIZE]; + int copied, nbytes; + + if (req->src) + return -EINVAL; + + curve25519_base_arch(buf, secret); + + /* might want less than we've got */ + nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); + copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, + nbytes), + buf, nbytes); + if (copied != nbytes) + return -EINVAL; + return 0; +} + +static int curve25519_compute_shared_secret(struct kpp_request *req) +{ + struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); + const u8 *secret = kpp_tfm_ctx(tfm); + u8 public_key[CURVE25519_KEY_SIZE]; + u8 buf[CURVE25519_KEY_SIZE]; + int copied, nbytes; + + if (!req->src) + return -EINVAL; + + copied = sg_copy_to_buffer(req->src, + sg_nents_for_len(req->src, + CURVE25519_KEY_SIZE), + public_key, CURVE25519_KEY_SIZE); + if (copied != CURVE25519_KEY_SIZE) + return -EINVAL; + + curve25519_arch(buf, secret, public_key); + + /* might want less than we've got */ + nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); + copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, + nbytes), + buf, nbytes); + if (copied != nbytes) + return -EINVAL; + return 0; +} + +static unsigned int curve25519_max_size(struct crypto_kpp *tfm) +{ + return CURVE25519_KEY_SIZE; +} + +static struct kpp_alg curve25519_alg = { + .base.cra_name = "curve25519", + .base.cra_driver_name = "curve25519-ppc64le", + .base.cra_priority = 200, + .base.cra_module = THIS_MODULE, + .base.cra_ctxsize = CURVE25519_KEY_SIZE, + + .set_secret = curve25519_set_secret, + .generate_public_key = curve25519_generate_public_key, + .compute_shared_secret = curve25519_compute_shared_secret, + .max_size = curve25519_max_size, +}; + + +static int __init curve25519_mod_init(void) +{ + return IS_REACHABLE(CONFIG_CRYPTO_KPP) ? 
+ crypto_register_kpp(&curve25519_alg) : 0; +} + +static void __exit curve25519_mod_exit(void) +{ + if (IS_REACHABLE(CONFIG_CRYPTO_KPP)) + crypto_unregister_kpp(&curve25519_alg); +} + +module_init(curve25519_mod_init); +module_exit(curve25519_mod_exit); + +MODULE_ALIAS_CRYPTO("curve25519"); +MODULE_ALIAS_CRYPTO("curve25519-ppc64le"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Danny Tsen "); -- cgit From b42519dbba838c928e82b55f32712fbe3eed2c45 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Thu, 16 May 2024 11:19:57 -0400 Subject: crypto: ppc/curve25519 - Update Kconfig and Makefile for ppc64le Defined CRYPTO_CURVE25519_PPC64 to support X25519 for ppc64le. Added new module curve25519-ppc64le for X25519. Signed-off-by: Danny Tsen Signed-off-by: Herbert Xu --- arch/powerpc/crypto/Kconfig | 11 +++++++++++ arch/powerpc/crypto/Makefile | 2 ++ 2 files changed, 13 insertions(+) diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index 1e201b7ae2fc..09ebcbdfb34f 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -2,6 +2,17 @@ menu "Accelerated Cryptographic Algorithms for CPU (powerpc)" +config CRYPTO_CURVE25519_PPC64 + tristate "Public key crypto: Curve25519 (PowerPC64)" + depends on PPC64 && CPU_LITTLE_ENDIAN + select CRYPTO_LIB_CURVE25519_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CURVE25519 + help + Curve25519 algorithm + + Architecture: PowerPC64 + - Little-endian + config CRYPTO_CRC32C_VPMSUM tristate "CRC32c" depends on PPC64 && ALTIVEC diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile index fca0e9739869..59808592f0a1 100644 --- a/arch/powerpc/crypto/Makefile +++ b/arch/powerpc/crypto/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o +obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o md5-ppc-y := md5-asm.o md5-glue.o @@ -29,6 +30,7 @@ aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-p chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o +curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y) override flavour := linux-ppc64le -- cgit From f9110822fca5b92daefc2bfae4cfcda7dcfb03c9 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 21 May 2024 10:54:50 +0800 Subject: crypto: api - Disable boot-test-finished if algapi is a module The boot-test-finished toggle is only necessary if algapi is built into the kernel. Do not include this code if it is a module. 
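As an illustration of the pattern (not part of this patch; the function name below is hypothetical): IS_BUILTIN() and IS_ENABLED() from <linux/kconfig.h> expand to the compile-time constants 0 or 1, so when CONFIG_CRYPTO_ALGAPI=m the early return becomes constant and the compiler discards the test start-up path together with the static key it would otherwise reference.

    #include <linux/kconfig.h>

    static void example_start_boot_tests(void)
    {
            /* Boot-time self-tests can only run when algapi is built in (=y). */
            if (!IS_BUILTIN(CONFIG_CRYPTO_ALGAPI))
                    return;

            /* They may also be configured out entirely. */
            if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS))
                    return;

            /* ... start the boot-time self-tests here ... */
    }
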
Signed-off-by: Herbert Xu --- crypto/algapi.c | 3 +++ crypto/api.c | 4 ++-- crypto/internal.h | 7 +++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/crypto/algapi.c b/crypto/algapi.c index 85bc279b4233..122cd910c4e1 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -1056,6 +1056,9 @@ EXPORT_SYMBOL_GPL(crypto_type_has_alg); static void __init crypto_start_tests(void) { + if (!IS_BUILTIN(CONFIG_CRYPTO_ALGAPI)) + return; + if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) return; diff --git a/crypto/api.c b/crypto/api.c index 6aa5a3b4ed5e..22556907b3bc 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -31,9 +31,9 @@ EXPORT_SYMBOL_GPL(crypto_alg_sem); BLOCKING_NOTIFIER_HEAD(crypto_chain); EXPORT_SYMBOL_GPL(crypto_chain); -#ifndef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS +#if IS_BUILTIN(CONFIG_CRYPTO_ALGAPI) && \ + !IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) DEFINE_STATIC_KEY_FALSE(__crypto_boot_test_finished); -EXPORT_SYMBOL_GPL(__crypto_boot_test_finished); #endif static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg); diff --git a/crypto/internal.h b/crypto/internal.h index 63e59240d5fb..aee31319be2e 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -66,7 +66,8 @@ extern struct blocking_notifier_head crypto_chain; int alg_test(const char *driver, const char *alg, u32 type, u32 mask); -#ifdef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS +#if !IS_BUILTIN(CONFIG_CRYPTO_ALGAPI) || \ + IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) static inline bool crypto_boot_test_finished(void) { return true; @@ -84,7 +85,9 @@ static inline void set_crypto_boot_test_finished(void) { static_branch_enable(&__crypto_boot_test_finished); } -#endif /* !CONFIG_CRYPTO_MANAGER_DISABLE_TESTS */ +#endif /* !IS_BUILTIN(CONFIG_CRYPTO_ALGAPI) || + * IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) + */ #ifdef CONFIG_PROC_FS void __init crypto_init_proc(void); -- cgit From a720de9fba164552d920632c2d7531dc8203f063 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 21 May 2024 21:22:49 +0100 Subject: crypto: arm64/crc10dif - Raise priority of NEON crct10dif implementation The NEON implementation of crctd10dif is registered with a priority of 100 which is identical to that used by the generic C implementation. Raise the priority to 150, half way between the PMULL based implementation and the NEON one, so that it will be preferred over the generic implementation. 
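For illustration only (not part of this patch; the probe function below is a hypothetical sketch): when several drivers register the same .base.cra_name, the crypto API hands out the registration with the highest .base.cra_priority, which callers can observe via crypto_shash_driver_name().

    #include <crypto/hash.h>
    #include <linux/err.h>
    #include <linux/printk.h>

    static int example_report_crct10dif_driver(void)
    {
            struct crypto_shash *tfm = crypto_alloc_shash("crct10dif", 0, 0);

            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);

            /*
             * With the NEON driver now at priority 150 it is preferred over
             * the generic implementation whenever a higher-priority driver
             * (such as the PMULL one) is not usable on the running CPU.
             */
            pr_info("crct10dif resolved to %s\n", crypto_shash_driver_name(tfm));

            crypto_free_shash(tfm);
            return 0;
    }
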
Signed-off-by: Mark Brown Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/crct10dif-ce-glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c index 09eb1456aed4..59016518f44d 100644 --- a/arch/arm64/crypto/crct10dif-ce-glue.c +++ b/arch/arm64/crypto/crct10dif-ce-glue.c @@ -98,7 +98,7 @@ static struct shash_alg crc_t10dif_alg[] = {{ .base.cra_name = "crct10dif", .base.cra_driver_name = "crct10dif-arm64-neon", - .base.cra_priority = 100, + .base.cra_priority = 150, .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { -- cgit From 7c699fe9a5740a228b2974325e817de28f7b6afd Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 23 May 2024 12:47:39 -0700 Subject: crypto: Add missing MODULE_DESCRIPTION() macros Fix the 'make W=1' warnings: WARNING: modpost: missing MODULE_DESCRIPTION() in crypto/cast_common.o WARNING: modpost: missing MODULE_DESCRIPTION() in crypto/af_alg.o WARNING: modpost: missing MODULE_DESCRIPTION() in crypto/algif_hash.o WARNING: modpost: missing MODULE_DESCRIPTION() in crypto/algif_skcipher.o WARNING: modpost: missing MODULE_DESCRIPTION() in crypto/ecc.o WARNING: modpost: missing MODULE_DESCRIPTION() in crypto/curve25519-generic.o WARNING: modpost: missing MODULE_DESCRIPTION() in crypto/xor.o WARNING: modpost: missing MODULE_DESCRIPTION() in crypto/crypto_simd.o Signed-off-by: Jeff Johnson Signed-off-by: Herbert Xu --- crypto/af_alg.c | 1 + crypto/algif_hash.c | 1 + crypto/algif_skcipher.c | 1 + crypto/cast_common.c | 1 + crypto/curve25519-generic.c | 1 + crypto/ecc.c | 1 + crypto/simd.c | 1 + crypto/xor.c | 1 + 8 files changed, 8 insertions(+) diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 18cfead0081d..0da7c1ac778a 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -1317,5 +1317,6 @@ static void __exit af_alg_exit(void) module_init(af_alg_init); module_exit(af_alg_exit); +MODULE_DESCRIPTION("Crypto userspace interface"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(AF_ALG); diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index 7c7394d46a23..5498a87249d3 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -471,4 +471,5 @@ static void __exit algif_hash_exit(void) module_init(algif_hash_init); module_exit(algif_hash_exit); +MODULE_DESCRIPTION("Userspace interface for hash algorithms"); MODULE_LICENSE("GPL"); diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index 02cea2149504..125d395c5e00 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -437,4 +437,5 @@ static void __exit algif_skcipher_exit(void) module_init(algif_skcipher_init); module_exit(algif_skcipher_exit); +MODULE_DESCRIPTION("Userspace interface for skcipher algorithms"); MODULE_LICENSE("GPL"); diff --git a/crypto/cast_common.c b/crypto/cast_common.c index 9b2f60fd4cef..fec1f6609a40 100644 --- a/crypto/cast_common.c +++ b/crypto/cast_common.c @@ -282,4 +282,5 @@ __visible const u32 cast_s4[256] = { }; EXPORT_SYMBOL_GPL(cast_s4); +MODULE_DESCRIPTION("Common lookup tables for CAST-128 (cast5) and CAST-256 (cast6)"); MODULE_LICENSE("GPL"); diff --git a/crypto/curve25519-generic.c b/crypto/curve25519-generic.c index d055b0784c77..68a673262e04 100644 --- a/crypto/curve25519-generic.c +++ b/crypto/curve25519-generic.c @@ -87,4 +87,5 @@ module_exit(curve25519_exit); MODULE_ALIAS_CRYPTO("curve25519"); MODULE_ALIAS_CRYPTO("curve25519-generic"); +MODULE_DESCRIPTION("Curve25519 elliptic curve (RFC7748)"); 
MODULE_LICENSE("GPL"); diff --git a/crypto/ecc.c b/crypto/ecc.c index fe761256e335..af698f8852fb 100644 --- a/crypto/ecc.c +++ b/crypto/ecc.c @@ -1715,4 +1715,5 @@ out: } EXPORT_SYMBOL(crypto_ecdh_shared_secret); +MODULE_DESCRIPTION("core elliptic curve module"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/crypto/simd.c b/crypto/simd.c index edaa479a1ec5..2aa4f72e224f 100644 --- a/crypto/simd.c +++ b/crypto/simd.c @@ -523,4 +523,5 @@ void simd_unregister_aeads(struct aead_alg *algs, int count, } EXPORT_SYMBOL_GPL(simd_unregister_aeads); +MODULE_DESCRIPTION("Shared crypto SIMD helpers"); MODULE_LICENSE("GPL"); diff --git a/crypto/xor.c b/crypto/xor.c index 8e72e5d5db0d..a1363162978c 100644 --- a/crypto/xor.c +++ b/crypto/xor.c @@ -165,6 +165,7 @@ out: static __exit void xor_exit(void) { } +MODULE_DESCRIPTION("RAID-5 checksumming functions"); MODULE_LICENSE("GPL"); #ifndef MODULE -- cgit From 8d7c52cb4184d3dc26dde62b4f5acd48de0768ae Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 25 May 2024 17:14:35 +0200 Subject: crypto: tegra - Remove an incorrect iommu_fwspec_free() call in tegra_se_remove() The only iommu function call in this driver is a tegra_dev_iommu_get_stream_id() which does not allocate anything and does not take any reference. So there is no point in calling iommu_fwspec_free() in the remove function. Remove this incorrect function call. Fixes: 0880bb3b00c8 ("crypto: tegra - Add Tegra Security Engine driver") Signed-off-by: Christophe JAILLET Tested-by: Akhil R Acked-by: Akhil R Signed-off-by: Herbert Xu --- drivers/crypto/tegra/tegra-se-main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/crypto/tegra/tegra-se-main.c b/drivers/crypto/tegra/tegra-se-main.c index 9955874b3dc3..f94c0331b148 100644 --- a/drivers/crypto/tegra/tegra-se-main.c +++ b/drivers/crypto/tegra/tegra-se-main.c @@ -326,7 +326,6 @@ static void tegra_se_remove(struct platform_device *pdev) crypto_engine_stop(se->engine); crypto_engine_exit(se->engine); - iommu_fwspec_free(se->dev); host1x_client_unregister(&se->client); } -- cgit From aabbf2135f9a9526991f17cb0c78cf1ec878f1c2 Mon Sep 17 00:00:00 2001 From: Lothar Rubusch Date: Sun, 26 May 2024 10:31:28 +0000 Subject: crypto: atmel-sha204a - fix negated return value Fix negated variable return value. Fixes: e05ce444e9e5 ("crypto: atmel-sha204a - add reading from otp zone") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-crypto/34cd4179-090e-479d-b459-8d0d35dd327d@moroto.mountain/ Signed-off-by: Lothar Rubusch Reviewed-by: Dan Carpenter Signed-off-by: Herbert Xu --- drivers/crypto/atmel-sha204a.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/atmel-sha204a.c b/drivers/crypto/atmel-sha204a.c index 24ffdf505023..2034f6031518 100644 --- a/drivers/crypto/atmel-sha204a.c +++ b/drivers/crypto/atmel-sha204a.c @@ -106,7 +106,7 @@ static int atmel_sha204a_otp_read(struct i2c_client *client, u16 addr, u8 *otp) if (cmd.data[0] == 0xff) { dev_err(&client->dev, "failed, device not ready\n"); - return -ret; + return -EINVAL; } memcpy(otp, cmd.data+1, 4); -- cgit From fa501bf25eb353422a0c966ed91662051edac839 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 27 May 2024 01:05:39 -0700 Subject: crypto: testmgr - test setkey in no-SIMD context Since crypto_shash_setkey(), crypto_ahash_setkey(), crypto_skcipher_setkey(), and crypto_aead_setkey() apparently need to work in no-SIMD context on some architectures, make the self-tests cover this scenario. 
Specifically, sometimes do the setkey while under crypto_disable_simd_for_test(), and do this independently from disabling SIMD for the other parts of the crypto operation since there is no guarantee that all parts happen in the same context. (I.e., drivers mustn't store the key in different formats for SIMD vs. no-SIMD.) Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/testmgr.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 00f5a6cf341a..1b30e3565e80 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -293,6 +293,10 @@ struct test_sg_division { * the @key_offset * @finalization_type: what finalization function to use for hashes * @nosimd: execute with SIMD disabled? Requires !CRYPTO_TFM_REQ_MAY_SLEEP. + * This applies to the parts of the operation that aren't controlled + * individually by @nosimd_setkey or @src_divs[].nosimd. + * @nosimd_setkey: set the key (if applicable) with SIMD disabled? Requires + * !CRYPTO_TFM_REQ_MAY_SLEEP. */ struct testvec_config { const char *name; @@ -306,6 +310,7 @@ struct testvec_config { bool key_offset_relative_to_alignmask; enum finalization_type finalization_type; bool nosimd; + bool nosimd_setkey; }; #define TESTVEC_CONFIG_NAMELEN 192 @@ -533,7 +538,8 @@ static bool valid_testvec_config(const struct testvec_config *cfg) cfg->finalization_type == FINALIZATION_TYPE_DIGEST) return false; - if ((cfg->nosimd || (flags & SGDIVS_HAVE_NOSIMD)) && + if ((cfg->nosimd || cfg->nosimd_setkey || + (flags & SGDIVS_HAVE_NOSIMD)) && (cfg->req_flags & CRYPTO_TFM_REQ_MAY_SLEEP)) return false; @@ -841,7 +847,10 @@ static int prepare_keybuf(const u8 *key, unsigned int ksize, return 0; } -/* Like setkey_f(tfm, key, ksize), but sometimes misalign the key */ +/* + * Like setkey_f(tfm, key, ksize), but sometimes misalign the key. + * In addition, run the setkey function in no-SIMD context if requested. + */ #define do_setkey(setkey_f, tfm, key, ksize, cfg, alignmask) \ ({ \ const u8 *keybuf, *keyptr; \ @@ -850,7 +859,11 @@ static int prepare_keybuf(const u8 *key, unsigned int ksize, err = prepare_keybuf((key), (ksize), (cfg), (alignmask), \ &keybuf, &keyptr); \ if (err == 0) { \ + if ((cfg)->nosimd_setkey) \ + crypto_disable_simd_for_test(); \ err = setkey_f((tfm), keyptr, (ksize)); \ + if ((cfg)->nosimd_setkey) \ + crypto_reenable_simd_for_test(); \ kfree(keybuf); \ } \ err; \ @@ -1118,9 +1131,15 @@ static void generate_random_testvec_config(struct rnd_state *rng, break; } - if (!(cfg->req_flags & CRYPTO_TFM_REQ_MAY_SLEEP) && prandom_bool(rng)) { - cfg->nosimd = true; - p += scnprintf(p, end - p, " nosimd"); + if (!(cfg->req_flags & CRYPTO_TFM_REQ_MAY_SLEEP)) { + if (prandom_bool(rng)) { + cfg->nosimd = true; + p += scnprintf(p, end - p, " nosimd"); + } + if (prandom_bool(rng)) { + cfg->nosimd_setkey = true; + p += scnprintf(p, end - p, " nosimd_setkey"); + } } p += scnprintf(p, end - p, " src_divs=["); -- cgit From 14cba6ace79627a57fb9058582b03f0ed3832390 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Mon, 27 May 2024 16:26:15 +0300 Subject: hwrng: amd - Convert PCIBIOS_* return codes to errnos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit amd_rng_mod_init() uses pci_read_config_dword() that returns PCIBIOS_* codes. The return code is then returned as is but amd_rng_mod_init() is a module_init() function that should return normal errnos. 
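As an illustrative sketch of the conversion described next (not part of this patch; the helper name is hypothetical), the fix amounts to translating the PCIBIOS_* value at the point of failure:

    #include <linux/pci.h>

    static int example_read_pmbase(struct pci_dev *pdev, u32 *pmbase)
    {
            int err = pci_read_config_dword(pdev, 0x58, pmbase);

            /* pci_read_config_dword() reports PCIBIOS_* codes, not -errno. */
            if (err)
                    return pcibios_err_to_errno(err);

            return 0;
    }
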
Convert PCIBIOS_* returns code using pcibios_err_to_errno() into normal errno before returning it. Fixes: 96d63c0297cc ("[PATCH] Add AMD HW RNG driver") Cc: stable@vger.kernel.org Signed-off-by: Ilpo Järvinen Signed-off-by: Herbert Xu --- drivers/char/hw_random/amd-rng.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/char/hw_random/amd-rng.c b/drivers/char/hw_random/amd-rng.c index 86162a13681e..9a24d19236dc 100644 --- a/drivers/char/hw_random/amd-rng.c +++ b/drivers/char/hw_random/amd-rng.c @@ -143,8 +143,10 @@ static int __init amd_rng_mod_init(void) found: err = pci_read_config_dword(pdev, 0x58, &pmbase); - if (err) + if (err) { + err = pcibios_err_to_errno(err); goto put_dev; + } pmbase &= 0x0000FF00; if (pmbase == 0) { -- cgit From d7c897a9d8c35d6788b6d92072f5c93be89d4451 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 27 May 2024 23:28:39 +0300 Subject: crypto: ecdsa - Fix the public key format description Public key blob is not just x and y concatenated. It follows RFC5480 section 2.2. Address this by re-documenting the function with the correct description of the format. Link: https://datatracker.ietf.org/doc/html/rfc5480 Fixes: 4e6602916bc6 ("crypto: ecdsa - Add support for ECDSA signature verification") Signed-off-by: Jarkko Sakkinen Reviewed-by: Stefan Berger Signed-off-by: Herbert Xu --- crypto/ecdsa.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crypto/ecdsa.c b/crypto/ecdsa.c index 258fffbf623d..55114146ff84 100644 --- a/crypto/ecdsa.c +++ b/crypto/ecdsa.c @@ -215,9 +215,8 @@ static int ecdsa_ecc_ctx_reset(struct ecc_ctx *ctx) } /* - * Set the public key given the raw uncompressed key data from an X509 - * certificate. The key data contain the concatenated X and Y coordinates of - * the public key. + * Set the public ECC key as defined by RFC5480 section 2.2 "Subject Public + * Key". Only the uncompressed format is supported. */ static int ecdsa_set_pub_key(struct crypto_akcipher *tfm, const void *key, unsigned int keylen) { -- cgit From fb11a4f6affd61bbee120ca90ea20e4435dc2bde Mon Sep 17 00:00:00 2001 From: Maxime Méré Date: Tue, 28 May 2024 16:05:45 +0200 Subject: crypto: stm32/cryp - use dma when possible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use DMA when buffer are aligned and with expected size. If buffer are correctly aligned and bigger than 1KB we have some performance gain: With DMA enable: $ openssl speed -evp aes-256-cbc -engine afalg -elapsed The 'numbers' are in 1000s of bytes per second processed. type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes aes-256-cbc 120.02k 406.78k 1588.82k 5873.32k 26020.52k 34258.94k Without DMA: $ openssl speed -evp aes-256-cbc -engine afalg -elapsed The 'numbers' are in 1000s of bytes per second processed. 
type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes aes-256-cbc 121.06k 419.95k 1112.23k 1897.47k 2362.03k 2386.60k With DMA: extract of $ modprobe tcrypt mode=500 testing speed of async cbc(aes) (stm32-cbc-aes) encryption tcrypt: test 14 (256 bit key, 16 byte blocks): 1 operation in 1679 cycles (16 bytes) tcrypt: test 15 (256 bit key, 64 byte blocks): 1 operation in 1893 cycles (64 bytes) tcrypt: test 16 (256 bit key, 128 byte blocks): 1 operation in 1760 cycles (128 bytes) tcrypt: test 17 (256 bit key, 256 byte blocks): 1 operation in 2154 cycles (256 bytes) tcrypt: test 18 (256 bit key, 1024 byte blocks): 1 operation in 2132 cycles (1024 bytes) tcrypt: test 19 (256 bit key, 1424 byte blocks): 1 operation in 2466 cycles (1424 bytes) tcrypt: test 20 (256 bit key, 4096 byte blocks): 1 operation in 4040 cycles (4096 bytes) Without DMA: $ modprobe tcrypt mode=500 tcrypt: test 14 (256 bit key, 16 byte blocks): 1 operation in 1671 cycles (16 bytes) tcrypt: test 15 (256 bit key, 64 byte blocks): 1 operation in 2263 cycles (64 bytes) tcrypt: test 16 (256 bit key, 128 byte blocks): 1 operation in 2881 cycles (128 bytes) tcrypt: test 17 (256 bit key, 256 byte blocks): 1 operation in 4270 cycles (256 bytes) tcrypt: test 18 (256 bit key, 1024 byte blocks): 1 operation in 11537 cycles (1024 bytes) tcrypt: test 19 (256 bit key, 1424 byte blocks): 1 operation in 15025 cycles (1424 bytes) tcrypt: test 20 (256 bit key, 4096 byte blocks): 1 operation in 40747 cycles (4096 bytes) Signed-off-by: Alexandre Torgue Signed-off-by: Maxime Méré Signed-off-by: Nicolas Toromanoff Signed-off-by: Herbert Xu --- drivers/crypto/stm32/stm32-cryp.c | 677 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 651 insertions(+), 26 deletions(-) diff --git a/drivers/crypto/stm32/stm32-cryp.c b/drivers/crypto/stm32/stm32-cryp.c index 11ad4ffdce0d..8b74c45a4f39 100644 --- a/drivers/crypto/stm32/stm32-cryp.c +++ b/drivers/crypto/stm32/stm32-cryp.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -40,6 +42,8 @@ /* Mode mask = bits [15..0] */ #define FLG_MODE_MASK GENMASK(15, 0) /* Bit [31..16] status */ +#define FLG_IN_OUT_DMA BIT(16) +#define FLG_HEADER_DMA BIT(17) /* Registers */ #define CRYP_CR 0x00000000 @@ -121,8 +125,12 @@ #define CR_PH_MASK 0x00030000 #define CR_NBPBL_SHIFT 20 -#define SR_BUSY 0x00000010 -#define SR_OFNE 0x00000004 +#define SR_IFNF BIT(1) +#define SR_OFNE BIT(2) +#define SR_BUSY BIT(8) + +#define DMACR_DIEN BIT(0) +#define DMACR_DOEN BIT(1) #define IMSCR_IN BIT(0) #define IMSCR_OUT BIT(1) @@ -133,7 +141,15 @@ /* Misc */ #define AES_BLOCK_32 (AES_BLOCK_SIZE / sizeof(u32)) #define GCM_CTR_INIT 2 -#define CRYP_AUTOSUSPEND_DELAY 50 +#define CRYP_AUTOSUSPEND_DELAY 50 + +#define CRYP_DMA_BURST_REG 4 + +enum stm32_dma_mode { + NO_DMA, + DMA_PLAIN_SG, + DMA_NEED_SG_TRUNC +}; struct stm32_cryp_caps { bool aeads_support; @@ -146,6 +162,7 @@ struct stm32_cryp_caps { u32 sr; u32 din; u32 dout; + u32 dmacr; u32 imsc; u32 mis; u32 k1l; @@ -172,6 +189,7 @@ struct stm32_cryp { struct list_head list; struct device *dev; void __iomem *regs; + phys_addr_t phys_base; struct clk *clk; unsigned long flags; u32 irq_status; @@ -190,8 +208,20 @@ struct stm32_cryp { size_t header_in; size_t payload_out; + /* DMA process fields */ + struct scatterlist *in_sg; + struct scatterlist *header_sg; struct scatterlist *out_sg; + size_t in_sg_len; + size_t header_sg_len; + size_t out_sg_len; + struct completion dma_completion; + + struct dma_chan *dma_lch_in; + struct dma_chan 
*dma_lch_out; + enum stm32_dma_mode dma_mode; + /* IT process fields */ struct scatter_walk in_walk; struct scatter_walk out_walk; @@ -291,12 +321,20 @@ static inline int stm32_cryp_wait_enable(struct stm32_cryp *cryp) !(status & CR_CRYPEN), 10, 100000); } +static inline int stm32_cryp_wait_input(struct stm32_cryp *cryp) +{ + u32 status; + + return readl_relaxed_poll_timeout_atomic(cryp->regs + cryp->caps->sr, status, + status & SR_IFNF, 1, 10); +} + static inline int stm32_cryp_wait_output(struct stm32_cryp *cryp) { u32 status; - return readl_relaxed_poll_timeout(cryp->regs + cryp->caps->sr, status, - status & SR_OFNE, 10, 100000); + return readl_relaxed_poll_timeout_atomic(cryp->regs + cryp->caps->sr, status, + status & SR_OFNE, 1, 10); } static inline void stm32_cryp_key_read_enable(struct stm32_cryp *cryp) @@ -311,8 +349,13 @@ static inline void stm32_cryp_key_read_disable(struct stm32_cryp *cryp) cryp->regs + cryp->caps->cr); } +static void stm32_cryp_irq_read_data(struct stm32_cryp *cryp); +static void stm32_cryp_irq_write_data(struct stm32_cryp *cryp); +static void stm32_cryp_irq_write_gcmccm_header(struct stm32_cryp *cryp); static int stm32_cryp_read_auth_tag(struct stm32_cryp *cryp); static void stm32_cryp_finish_req(struct stm32_cryp *cryp, int err); +static int stm32_cryp_dma_start(struct stm32_cryp *cryp); +static int stm32_cryp_it_start(struct stm32_cryp *cryp); static struct stm32_cryp *stm32_cryp_find_dev(struct stm32_cryp_ctx *ctx) { @@ -813,11 +856,238 @@ static void stm32_cryp_finish_req(struct stm32_cryp *cryp, int err) if (is_gcm(cryp) || is_ccm(cryp)) crypto_finalize_aead_request(cryp->engine, cryp->areq, err); else - crypto_finalize_skcipher_request(cryp->engine, cryp->req, - err); + crypto_finalize_skcipher_request(cryp->engine, cryp->req, err); +} + +static void stm32_cryp_header_dma_callback(void *param) +{ + struct stm32_cryp *cryp = (struct stm32_cryp *)param; + int ret; + u32 reg; + + dma_unmap_sg(cryp->dev, cryp->header_sg, cryp->header_sg_len, DMA_TO_DEVICE); + + reg = stm32_cryp_read(cryp, cryp->caps->dmacr); + stm32_cryp_write(cryp, cryp->caps->dmacr, reg & ~(DMACR_DOEN | DMACR_DIEN)); + + kfree(cryp->header_sg); + + reg = stm32_cryp_read(cryp, cryp->caps->cr); + + if (cryp->header_in) { + stm32_cryp_write(cryp, cryp->caps->cr, reg | CR_CRYPEN); + + ret = stm32_cryp_wait_input(cryp); + if (ret) { + dev_err(cryp->dev, "input header ready timeout after dma\n"); + stm32_cryp_finish_req(cryp, ret); + return; + } + stm32_cryp_irq_write_gcmccm_header(cryp); + WARN_ON(cryp->header_in); + } + + if (stm32_cryp_get_input_text_len(cryp)) { + /* Phase 3 : payload */ + reg = stm32_cryp_read(cryp, cryp->caps->cr); + stm32_cryp_write(cryp, cryp->caps->cr, reg & ~CR_CRYPEN); + + reg &= ~CR_PH_MASK; + reg |= CR_PH_PAYLOAD | CR_CRYPEN; + stm32_cryp_write(cryp, cryp->caps->cr, reg); + + if (cryp->flags & FLG_IN_OUT_DMA) { + ret = stm32_cryp_dma_start(cryp); + if (ret) + stm32_cryp_finish_req(cryp, ret); + } else { + stm32_cryp_it_start(cryp); + } + } else { + /* + * Phase 4 : tag. 
+ * Nothing to read, nothing to write => end request + */ + stm32_cryp_finish_req(cryp, 0); + } +} + +static void stm32_cryp_dma_callback(void *param) +{ + struct stm32_cryp *cryp = (struct stm32_cryp *)param; + int ret; + u32 reg; + + complete(&cryp->dma_completion); /* completion to indicate no timeout */ + + dma_sync_sg_for_device(cryp->dev, cryp->out_sg, cryp->out_sg_len, DMA_FROM_DEVICE); + + if (cryp->in_sg != cryp->out_sg) + dma_unmap_sg(cryp->dev, cryp->in_sg, cryp->in_sg_len, DMA_TO_DEVICE); + + dma_unmap_sg(cryp->dev, cryp->out_sg, cryp->out_sg_len, DMA_FROM_DEVICE); + + reg = stm32_cryp_read(cryp, cryp->caps->dmacr); + stm32_cryp_write(cryp, cryp->caps->dmacr, reg & ~(DMACR_DOEN | DMACR_DIEN)); + + reg = stm32_cryp_read(cryp, cryp->caps->cr); + + if (is_gcm(cryp) || is_ccm(cryp)) { + kfree(cryp->in_sg); + kfree(cryp->out_sg); + } else { + if (cryp->in_sg != cryp->req->src) + kfree(cryp->in_sg); + if (cryp->out_sg != cryp->req->dst) + kfree(cryp->out_sg); + } + + if (cryp->payload_in) { + stm32_cryp_write(cryp, cryp->caps->cr, reg | CR_CRYPEN); + + ret = stm32_cryp_wait_input(cryp); + if (ret) { + dev_err(cryp->dev, "input ready timeout after dma\n"); + stm32_cryp_finish_req(cryp, ret); + return; + } + stm32_cryp_irq_write_data(cryp); + + ret = stm32_cryp_wait_output(cryp); + if (ret) { + dev_err(cryp->dev, "output ready timeout after dma\n"); + stm32_cryp_finish_req(cryp, ret); + return; + } + stm32_cryp_irq_read_data(cryp); + } + + stm32_cryp_finish_req(cryp, 0); } -static int stm32_cryp_cpu_start(struct stm32_cryp *cryp) +static int stm32_cryp_header_dma_start(struct stm32_cryp *cryp) +{ + int ret; + struct dma_async_tx_descriptor *tx_in; + u32 reg; + size_t align_size; + + ret = dma_map_sg(cryp->dev, cryp->header_sg, cryp->header_sg_len, DMA_TO_DEVICE); + if (!ret) { + dev_err(cryp->dev, "dma_map_sg() error\n"); + return -ENOMEM; + } + + dma_sync_sg_for_device(cryp->dev, cryp->header_sg, cryp->header_sg_len, DMA_TO_DEVICE); + + tx_in = dmaengine_prep_slave_sg(cryp->dma_lch_in, cryp->header_sg, cryp->header_sg_len, + DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + if (!tx_in) { + dev_err(cryp->dev, "IN prep_slave_sg() failed\n"); + return -EINVAL; + } + + tx_in->callback_param = cryp; + tx_in->callback = stm32_cryp_header_dma_callback; + + /* Advance scatterwalk to not DMA'ed data */ + align_size = ALIGN_DOWN(cryp->header_in, cryp->hw_blocksize); + scatterwalk_copychunks(NULL, &cryp->in_walk, align_size, 2); + cryp->header_in -= align_size; + + ret = dma_submit_error(dmaengine_submit(tx_in)); + if (ret < 0) { + dev_err(cryp->dev, "DMA in submit failed\n"); + return ret; + } + dma_async_issue_pending(cryp->dma_lch_in); + + reg = stm32_cryp_read(cryp, cryp->caps->dmacr); + stm32_cryp_write(cryp, cryp->caps->dmacr, reg | DMACR_DIEN); + + return 0; +} + +static int stm32_cryp_dma_start(struct stm32_cryp *cryp) +{ + int ret; + size_t align_size; + struct dma_async_tx_descriptor *tx_in, *tx_out; + u32 reg; + + if (cryp->in_sg != cryp->out_sg) { + ret = dma_map_sg(cryp->dev, cryp->in_sg, cryp->in_sg_len, DMA_TO_DEVICE); + if (!ret) { + dev_err(cryp->dev, "dma_map_sg() error\n"); + return -ENOMEM; + } + } + + ret = dma_map_sg(cryp->dev, cryp->out_sg, cryp->out_sg_len, DMA_FROM_DEVICE); + if (!ret) { + dev_err(cryp->dev, "dma_map_sg() error\n"); + return -ENOMEM; + } + + dma_sync_sg_for_device(cryp->dev, cryp->in_sg, cryp->in_sg_len, DMA_TO_DEVICE); + + tx_in = dmaengine_prep_slave_sg(cryp->dma_lch_in, cryp->in_sg, cryp->in_sg_len, + DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | 
DMA_CTRL_ACK); + if (!tx_in) { + dev_err(cryp->dev, "IN prep_slave_sg() failed\n"); + return -EINVAL; + } + + /* No callback necessary */ + tx_in->callback_param = cryp; + tx_in->callback = NULL; + + tx_out = dmaengine_prep_slave_sg(cryp->dma_lch_out, cryp->out_sg, cryp->out_sg_len, + DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + if (!tx_out) { + dev_err(cryp->dev, "OUT prep_slave_sg() failed\n"); + return -EINVAL; + } + + reinit_completion(&cryp->dma_completion); + tx_out->callback = stm32_cryp_dma_callback; + tx_out->callback_param = cryp; + + /* Advance scatterwalk to not DMA'ed data */ + align_size = ALIGN_DOWN(cryp->payload_in, cryp->hw_blocksize); + scatterwalk_copychunks(NULL, &cryp->in_walk, align_size, 2); + cryp->payload_in -= align_size; + + ret = dma_submit_error(dmaengine_submit(tx_in)); + if (ret < 0) { + dev_err(cryp->dev, "DMA in submit failed\n"); + return ret; + } + dma_async_issue_pending(cryp->dma_lch_in); + + /* Advance scatterwalk to not DMA'ed data */ + scatterwalk_copychunks(NULL, &cryp->out_walk, align_size, 2); + cryp->payload_out -= align_size; + ret = dma_submit_error(dmaengine_submit(tx_out)); + if (ret < 0) { + dev_err(cryp->dev, "DMA out submit failed\n"); + return ret; + } + dma_async_issue_pending(cryp->dma_lch_out); + + reg = stm32_cryp_read(cryp, cryp->caps->dmacr); + stm32_cryp_write(cryp, cryp->caps->dmacr, reg | DMACR_DOEN | DMACR_DIEN); + + if (!wait_for_completion_timeout(&cryp->dma_completion, msecs_to_jiffies(1000))) { + dev_err(cryp->dev, "DMA out timed out\n"); + dmaengine_terminate_sync(cryp->dma_lch_out); + return -ETIMEDOUT; + } + + return 0; +} + +static int stm32_cryp_it_start(struct stm32_cryp *cryp) { /* Enable interrupt and let the IRQ handler do everything */ stm32_cryp_write(cryp, cryp->caps->imsc, IMSCR_IN | IMSCR_OUT); @@ -1149,13 +1419,256 @@ static int stm32_cryp_tdes_cbc_decrypt(struct skcipher_request *req) return stm32_cryp_crypt(req, FLG_TDES | FLG_CBC); } +static enum stm32_dma_mode stm32_cryp_dma_check_sg(struct scatterlist *test_sg, size_t len, + size_t block_size) +{ + struct scatterlist *sg; + int i; + + if (len <= 16) + return NO_DMA; /* Faster */ + + for_each_sg(test_sg, sg, sg_nents(test_sg), i) { + if (!IS_ALIGNED(sg->length, block_size) && !sg_is_last(sg)) + return NO_DMA; + + if (sg->offset % sizeof(u32)) + return NO_DMA; + + if (sg_is_last(sg) && !IS_ALIGNED(sg->length, AES_BLOCK_SIZE)) + return DMA_NEED_SG_TRUNC; + } + + return DMA_PLAIN_SG; +} + +static enum stm32_dma_mode stm32_cryp_dma_check(struct stm32_cryp *cryp, struct scatterlist *in_sg, + struct scatterlist *out_sg) +{ + enum stm32_dma_mode ret = DMA_PLAIN_SG; + + if (!is_aes(cryp)) + return NO_DMA; + + if (!cryp->dma_lch_in || !cryp->dma_lch_out) + return NO_DMA; + + ret = stm32_cryp_dma_check_sg(in_sg, cryp->payload_in, AES_BLOCK_SIZE); + if (ret == NO_DMA) + return ret; + + ret = stm32_cryp_dma_check_sg(out_sg, cryp->payload_out, AES_BLOCK_SIZE); + if (ret == NO_DMA) + return ret; + + /* Check CTR counter overflow */ + if (is_aes(cryp) && is_ctr(cryp)) { + u32 c; + __be32 iv3; + + memcpy(&iv3, &cryp->req->iv[3 * sizeof(u32)], sizeof(iv3)); + c = be32_to_cpu(iv3); + if ((c + cryp->payload_in) < cryp->payload_in) + return NO_DMA; + } + + /* Workaround */ + if (is_aes(cryp) && is_ctr(cryp) && ret == DMA_NEED_SG_TRUNC) + return NO_DMA; + + return ret; +} + +static int stm32_cryp_truncate_sg(struct scatterlist **new_sg, size_t *new_sg_len, + struct scatterlist *sg, off_t skip, size_t size) +{ + struct scatterlist *cur; + int alloc_sg_len; + + 
*new_sg_len = 0; + + if (!sg || !size) { + *new_sg = NULL; + return 0; + } + + alloc_sg_len = sg_nents_for_len(sg, skip + size); + if (alloc_sg_len < 0) + return alloc_sg_len; + + /* We allocate to much sg entry, but it is easier */ + *new_sg = kmalloc_array((size_t)alloc_sg_len, sizeof(struct scatterlist), GFP_KERNEL); + if (!*new_sg) + return -ENOMEM; + + sg_init_table(*new_sg, (unsigned int)alloc_sg_len); + + cur = *new_sg; + while (sg && size) { + unsigned int len = sg->length; + unsigned int offset = sg->offset; + + if (skip > len) { + skip -= len; + sg = sg_next(sg); + continue; + } + + if (skip) { + len -= skip; + offset += skip; + skip = 0; + } + + if (size < len) + len = size; + + if (len > 0) { + (*new_sg_len)++; + size -= len; + sg_set_page(cur, sg_page(sg), len, offset); + if (size == 0) + sg_mark_end(cur); + cur = sg_next(cur); + } + + sg = sg_next(sg); + } + + return 0; +} + +static int stm32_cryp_cipher_prepare(struct stm32_cryp *cryp, struct scatterlist *in_sg, + struct scatterlist *out_sg) +{ + size_t align_size; + int ret; + + cryp->dma_mode = stm32_cryp_dma_check(cryp, in_sg, out_sg); + + scatterwalk_start(&cryp->in_walk, in_sg); + scatterwalk_start(&cryp->out_walk, out_sg); + + if (cryp->dma_mode == NO_DMA) { + cryp->flags &= ~FLG_IN_OUT_DMA; + + if (is_ctr(cryp)) + memset(cryp->last_ctr, 0, sizeof(cryp->last_ctr)); + + } else if (cryp->dma_mode == DMA_NEED_SG_TRUNC) { + + cryp->flags |= FLG_IN_OUT_DMA; + + align_size = ALIGN_DOWN(cryp->payload_in, cryp->hw_blocksize); + ret = stm32_cryp_truncate_sg(&cryp->in_sg, &cryp->in_sg_len, in_sg, 0, align_size); + if (ret) + return ret; + + ret = stm32_cryp_truncate_sg(&cryp->out_sg, &cryp->out_sg_len, out_sg, 0, + align_size); + if (ret) { + kfree(cryp->in_sg); + return ret; + } + } else { + cryp->flags |= FLG_IN_OUT_DMA; + + cryp->in_sg = in_sg; + cryp->out_sg = out_sg; + + ret = sg_nents_for_len(cryp->in_sg, cryp->payload_in); + if (ret < 0) + return ret; + cryp->in_sg_len = (size_t)ret; + + ret = sg_nents_for_len(out_sg, cryp->payload_out); + if (ret < 0) + return ret; + cryp->out_sg_len = (size_t)ret; + } + + return 0; +} + +static int stm32_cryp_aead_prepare(struct stm32_cryp *cryp, struct scatterlist *in_sg, + struct scatterlist *out_sg) +{ + size_t align_size; + off_t skip; + int ret, ret2; + + cryp->header_sg = NULL; + cryp->in_sg = NULL; + cryp->out_sg = NULL; + + if (!cryp->dma_lch_in || !cryp->dma_lch_out) { + cryp->dma_mode = NO_DMA; + cryp->flags &= ~(FLG_IN_OUT_DMA | FLG_HEADER_DMA); + + return 0; + } + + /* CCM hw_init may have advanced in header */ + skip = cryp->areq->assoclen - cryp->header_in; + + align_size = ALIGN_DOWN(cryp->header_in, cryp->hw_blocksize); + ret = stm32_cryp_truncate_sg(&cryp->header_sg, &cryp->header_sg_len, in_sg, skip, + align_size); + if (ret) + return ret; + + ret = stm32_cryp_dma_check_sg(cryp->header_sg, align_size, AES_BLOCK_SIZE); + if (ret == NO_DMA) { + /* We cannot DMA the header */ + kfree(cryp->header_sg); + cryp->header_sg = NULL; + + cryp->flags &= ~FLG_HEADER_DMA; + } else { + cryp->flags |= FLG_HEADER_DMA; + } + + /* Now skip all header to be at payload start */ + skip = cryp->areq->assoclen; + align_size = ALIGN_DOWN(cryp->payload_in, cryp->hw_blocksize); + ret = stm32_cryp_truncate_sg(&cryp->in_sg, &cryp->in_sg_len, in_sg, skip, align_size); + if (ret) { + kfree(cryp->header_sg); + return ret; + } + + /* For out buffer align_size is same as in buffer */ + ret = stm32_cryp_truncate_sg(&cryp->out_sg, &cryp->out_sg_len, out_sg, skip, align_size); + if (ret) { + 
kfree(cryp->header_sg); + kfree(cryp->in_sg); + return ret; + } + + ret = stm32_cryp_dma_check_sg(cryp->in_sg, align_size, AES_BLOCK_SIZE); + ret2 = stm32_cryp_dma_check_sg(cryp->out_sg, align_size, AES_BLOCK_SIZE); + if (ret == NO_DMA || ret2 == NO_DMA) { + kfree(cryp->in_sg); + cryp->in_sg = NULL; + + kfree(cryp->out_sg); + cryp->out_sg = NULL; + + cryp->flags &= ~FLG_IN_OUT_DMA; + } else { + cryp->flags |= FLG_IN_OUT_DMA; + } + + return 0; +} + static int stm32_cryp_prepare_req(struct skcipher_request *req, struct aead_request *areq) { struct stm32_cryp_ctx *ctx; struct stm32_cryp *cryp; struct stm32_cryp_reqctx *rctx; - struct scatterlist *in_sg; + struct scatterlist *in_sg, *out_sg; int ret; if (!req && !areq) @@ -1169,8 +1682,6 @@ static int stm32_cryp_prepare_req(struct skcipher_request *req, rctx = req ? skcipher_request_ctx(req) : aead_request_ctx(areq); rctx->mode &= FLG_MODE_MASK; - ctx->cryp = cryp; - cryp->flags = (cryp->flags & ~FLG_MODE_MASK) | rctx->mode; cryp->hw_blocksize = is_aes(cryp) ? AES_BLOCK_SIZE : DES_BLOCK_SIZE; cryp->ctx = ctx; @@ -1182,6 +1693,15 @@ static int stm32_cryp_prepare_req(struct skcipher_request *req, cryp->payload_in = req->cryptlen; cryp->payload_out = req->cryptlen; cryp->authsize = 0; + + in_sg = req->src; + out_sg = req->dst; + + ret = stm32_cryp_cipher_prepare(cryp, in_sg, out_sg); + if (ret) + return ret; + + ret = stm32_cryp_hw_init(cryp); } else { /* * Length of input and output data: @@ -1211,23 +1731,22 @@ static int stm32_cryp_prepare_req(struct skcipher_request *req, cryp->header_in = areq->assoclen; cryp->payload_out = cryp->payload_in; } - } - - in_sg = req ? req->src : areq->src; - scatterwalk_start(&cryp->in_walk, in_sg); - cryp->out_sg = req ? req->dst : areq->dst; - scatterwalk_start(&cryp->out_walk, cryp->out_sg); + in_sg = areq->src; + out_sg = areq->dst; - if (is_gcm(cryp) || is_ccm(cryp)) { + scatterwalk_start(&cryp->in_walk, in_sg); + scatterwalk_start(&cryp->out_walk, out_sg); /* In output, jump after assoc data */ scatterwalk_copychunks(NULL, &cryp->out_walk, cryp->areq->assoclen, 2); - } - if (is_ctr(cryp)) - memset(cryp->last_ctr, 0, sizeof(cryp->last_ctr)); + ret = stm32_cryp_hw_init(cryp); + if (ret) + return ret; + + ret = stm32_cryp_aead_prepare(cryp, in_sg, out_sg); + } - ret = stm32_cryp_hw_init(cryp); return ret; } @@ -1239,12 +1758,24 @@ static int stm32_cryp_cipher_one_req(struct crypto_engine *engine, void *areq) struct stm32_cryp_ctx *ctx = crypto_skcipher_ctx( crypto_skcipher_reqtfm(req)); struct stm32_cryp *cryp = ctx->cryp; + int ret; if (!cryp) return -ENODEV; - return stm32_cryp_prepare_req(req, NULL) ?: - stm32_cryp_cpu_start(cryp); + ret = stm32_cryp_prepare_req(req, NULL); + if (ret) + return ret; + + if (cryp->flags & FLG_IN_OUT_DMA) + ret = stm32_cryp_dma_start(cryp); + else + ret = stm32_cryp_it_start(cryp); + + if (ret == -ETIMEDOUT) + stm32_cryp_finish_req(cryp, ret); + + return ret; } static int stm32_cryp_aead_one_req(struct crypto_engine *engine, void *areq) @@ -1262,13 +1793,20 @@ static int stm32_cryp_aead_one_req(struct crypto_engine *engine, void *areq) if (err) return err; - if (unlikely(!cryp->payload_in && !cryp->header_in)) { + if (!stm32_cryp_get_input_text_len(cryp) && !cryp->header_in && + !(cryp->flags & FLG_HEADER_DMA)) { /* No input data to process: get tag and finish */ stm32_cryp_finish_req(cryp, 0); return 0; } - return stm32_cryp_cpu_start(cryp); + if (cryp->flags & FLG_HEADER_DMA) + return stm32_cryp_header_dma_start(cryp); + + if (!cryp->header_in && cryp->flags & 
FLG_IN_OUT_DMA) + return stm32_cryp_dma_start(cryp); + + return stm32_cryp_it_start(cryp); } static int stm32_cryp_read_auth_tag(struct stm32_cryp *cryp) @@ -1680,6 +2218,65 @@ static irqreturn_t stm32_cryp_irq(int irq, void *arg) return IRQ_WAKE_THREAD; } +static int stm32_cryp_dma_init(struct stm32_cryp *cryp) +{ + struct dma_slave_config dma_conf; + struct dma_chan *chan; + int ret; + + memset(&dma_conf, 0, sizeof(dma_conf)); + + dma_conf.direction = DMA_MEM_TO_DEV; + dma_conf.dst_addr = cryp->phys_base + cryp->caps->din; + dma_conf.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; + dma_conf.dst_maxburst = CRYP_DMA_BURST_REG; + dma_conf.device_fc = false; + + chan = dma_request_chan(cryp->dev, "in"); + if (IS_ERR(chan)) + return PTR_ERR(chan); + + cryp->dma_lch_in = chan; + ret = dmaengine_slave_config(cryp->dma_lch_in, &dma_conf); + if (ret) { + dma_release_channel(cryp->dma_lch_in); + cryp->dma_lch_in = NULL; + dev_err(cryp->dev, "Couldn't configure DMA in slave.\n"); + return ret; + } + + memset(&dma_conf, 0, sizeof(dma_conf)); + + dma_conf.direction = DMA_DEV_TO_MEM; + dma_conf.src_addr = cryp->phys_base + cryp->caps->dout; + dma_conf.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; + dma_conf.src_maxburst = CRYP_DMA_BURST_REG; + dma_conf.device_fc = false; + + chan = dma_request_chan(cryp->dev, "out"); + if (IS_ERR(chan)) { + dma_release_channel(cryp->dma_lch_in); + cryp->dma_lch_in = NULL; + return PTR_ERR(chan); + } + + cryp->dma_lch_out = chan; + + ret = dmaengine_slave_config(cryp->dma_lch_out, &dma_conf); + if (ret) { + dma_release_channel(cryp->dma_lch_out); + cryp->dma_lch_out = NULL; + dev_err(cryp->dev, "Couldn't configure DMA out slave.\n"); + dma_release_channel(cryp->dma_lch_in); + cryp->dma_lch_in = NULL; + return ret; + } + + init_completion(&cryp->dma_completion); + + return 0; +} + static struct skcipher_engine_alg crypto_algs[] = { { .base = { @@ -1901,6 +2498,7 @@ static const struct stm32_cryp_caps ux500_data = { .sr = UX500_CRYP_SR, .din = UX500_CRYP_DIN, .dout = UX500_CRYP_DOUT, + .dmacr = UX500_CRYP_DMACR, .imsc = UX500_CRYP_IMSC, .mis = UX500_CRYP_MIS, .k1l = UX500_CRYP_K1L, @@ -1923,6 +2521,7 @@ static const struct stm32_cryp_caps f7_data = { .sr = CRYP_SR, .din = CRYP_DIN, .dout = CRYP_DOUT, + .dmacr = CRYP_DMACR, .imsc = CRYP_IMSCR, .mis = CRYP_MISR, .k1l = CRYP_K1LR, @@ -1945,6 +2544,7 @@ static const struct stm32_cryp_caps mp1_data = { .sr = CRYP_SR, .din = CRYP_DIN, .dout = CRYP_DOUT, + .dmacr = CRYP_DMACR, .imsc = CRYP_IMSCR, .mis = CRYP_MISR, .k1l = CRYP_K1LR, @@ -1985,6 +2585,8 @@ static int stm32_cryp_probe(struct platform_device *pdev) if (IS_ERR(cryp->regs)) return PTR_ERR(cryp->regs); + cryp->phys_base = platform_get_resource(pdev, IORESOURCE_MEM, 0)->start; + irq = platform_get_irq(pdev, 0); if (irq < 0) return irq; @@ -2030,6 +2632,17 @@ static int stm32_cryp_probe(struct platform_device *pdev) platform_set_drvdata(pdev, cryp); + ret = stm32_cryp_dma_init(cryp); + switch (ret) { + case 0: + break; + case -ENODEV: + dev_dbg(dev, "DMA mode not available\n"); + break; + default: + goto err_dma; + } + spin_lock(&cryp_list.lock); list_add(&cryp->list, &cryp_list.dev_list); spin_unlock(&cryp_list.lock); @@ -2075,6 +2688,12 @@ err_engine1: spin_lock(&cryp_list.lock); list_del(&cryp->list); spin_unlock(&cryp_list.lock); + + if (cryp->dma_lch_in) + dma_release_channel(cryp->dma_lch_in); + if (cryp->dma_lch_out) + dma_release_channel(cryp->dma_lch_out); +err_dma: err_rst: pm_runtime_disable(dev); pm_runtime_put_noidle(dev); @@ -2101,6 +2720,12 @@ static void 
stm32_cryp_remove(struct platform_device *pdev) list_del(&cryp->list); spin_unlock(&cryp_list.lock); + if (cryp->dma_lch_in) + dma_release_channel(cryp->dma_lch_in); + + if (cryp->dma_lch_out) + dma_release_channel(cryp->dma_lch_out); + pm_runtime_disable(cryp->dev); pm_runtime_put_noidle(cryp->dev); -- cgit From 6364352ec9907a5232225a9d677109827ffc0875 Mon Sep 17 00:00:00 2001 From: Maxime Méré Date: Tue, 28 May 2024 16:05:46 +0200 Subject: crypto: stm32/cryp - increase priority MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Increase STM32 CRYP priority, to be greater than the ARM-NEON accelerated version. Signed-of-by: Maxime Méré Signed-off-by: Nicolas Toromanoff Signed-off-by: Herbert Xu --- drivers/crypto/stm32/stm32-cryp.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/crypto/stm32/stm32-cryp.c b/drivers/crypto/stm32/stm32-cryp.c index 8b74c45a4f39..b0d278421461 100644 --- a/drivers/crypto/stm32/stm32-cryp.c +++ b/drivers/crypto/stm32/stm32-cryp.c @@ -2282,7 +2282,7 @@ static struct skcipher_engine_alg crypto_algs[] = { .base = { .base.cra_name = "ecb(aes)", .base.cra_driver_name = "stm32-ecb-aes", - .base.cra_priority = 200, + .base.cra_priority = 300, .base.cra_flags = CRYPTO_ALG_ASYNC, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), @@ -2304,7 +2304,7 @@ static struct skcipher_engine_alg crypto_algs[] = { .base = { .base.cra_name = "cbc(aes)", .base.cra_driver_name = "stm32-cbc-aes", - .base.cra_priority = 200, + .base.cra_priority = 300, .base.cra_flags = CRYPTO_ALG_ASYNC, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), @@ -2327,7 +2327,7 @@ static struct skcipher_engine_alg crypto_algs[] = { .base = { .base.cra_name = "ctr(aes)", .base.cra_driver_name = "stm32-ctr-aes", - .base.cra_priority = 200, + .base.cra_priority = 300, .base.cra_flags = CRYPTO_ALG_ASYNC, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), @@ -2350,7 +2350,7 @@ static struct skcipher_engine_alg crypto_algs[] = { .base = { .base.cra_name = "ecb(des)", .base.cra_driver_name = "stm32-ecb-des", - .base.cra_priority = 200, + .base.cra_priority = 300, .base.cra_flags = CRYPTO_ALG_ASYNC, .base.cra_blocksize = DES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), @@ -2372,7 +2372,7 @@ static struct skcipher_engine_alg crypto_algs[] = { .base = { .base.cra_name = "cbc(des)", .base.cra_driver_name = "stm32-cbc-des", - .base.cra_priority = 200, + .base.cra_priority = 300, .base.cra_flags = CRYPTO_ALG_ASYNC, .base.cra_blocksize = DES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), @@ -2395,7 +2395,7 @@ static struct skcipher_engine_alg crypto_algs[] = { .base = { .base.cra_name = "ecb(des3_ede)", .base.cra_driver_name = "stm32-ecb-des3", - .base.cra_priority = 200, + .base.cra_priority = 300, .base.cra_flags = CRYPTO_ALG_ASYNC, .base.cra_blocksize = DES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), @@ -2417,7 +2417,7 @@ static struct skcipher_engine_alg crypto_algs[] = { .base = { .base.cra_name = "cbc(des3_ede)", .base.cra_driver_name = "stm32-cbc-des3", - .base.cra_priority = 200, + .base.cra_priority = 300, .base.cra_flags = CRYPTO_ALG_ASYNC, .base.cra_blocksize = DES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), @@ -2451,7 +2451,7 @@ static struct aead_engine_alg aead_algs[] = { .base.base = { .cra_name = "gcm(aes)", .cra_driver_name = "stm32-gcm-aes", - 
.cra_priority = 200, + .cra_priority = 300, .cra_flags = CRYPTO_ALG_ASYNC, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct stm32_cryp_ctx), @@ -2474,7 +2474,7 @@ static struct aead_engine_alg aead_algs[] = { .base.base = { .cra_name = "ccm(aes)", .cra_driver_name = "stm32-ccm-aes", - .cra_priority = 200, + .cra_priority = 300, .cra_flags = CRYPTO_ALG_ASYNC, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct stm32_cryp_ctx), -- cgit From 4027725259cc55138672f21373ac04f14aab2836 Mon Sep 17 00:00:00 2001 From: Maxime Méré Date: Tue, 28 May 2024 16:05:47 +0200 Subject: crypto: stm32/cryp - add CRYPTO_ALG_KERN_DRIVER_ONLY flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This flag is needed to make the driver visible from openssl and cryptodev. Signed-off-by: Maxime Méré Signed-off-by: Herbert Xu --- drivers/crypto/stm32/stm32-cryp.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/crypto/stm32/stm32-cryp.c b/drivers/crypto/stm32/stm32-cryp.c index b0d278421461..445276b848ed 100644 --- a/drivers/crypto/stm32/stm32-cryp.c +++ b/drivers/crypto/stm32/stm32-cryp.c @@ -2283,7 +2283,7 @@ static struct skcipher_engine_alg crypto_algs[] = { .base.cra_name = "ecb(aes)", .base.cra_driver_name = "stm32-ecb-aes", .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), .base.cra_alignmask = 0, @@ -2305,7 +2305,7 @@ static struct skcipher_engine_alg crypto_algs[] = { .base.cra_name = "cbc(aes)", .base.cra_driver_name = "stm32-cbc-aes", .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), .base.cra_alignmask = 0, @@ -2328,7 +2328,7 @@ static struct skcipher_engine_alg crypto_algs[] = { .base.cra_name = "ctr(aes)", .base.cra_driver_name = "stm32-ctr-aes", .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), .base.cra_alignmask = 0, @@ -2351,7 +2351,7 @@ static struct skcipher_engine_alg crypto_algs[] = { .base.cra_name = "ecb(des)", .base.cra_driver_name = "stm32-ecb-des", .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .base.cra_blocksize = DES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), .base.cra_alignmask = 0, @@ -2373,7 +2373,7 @@ static struct skcipher_engine_alg crypto_algs[] = { .base.cra_name = "cbc(des)", .base.cra_driver_name = "stm32-cbc-des", .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .base.cra_blocksize = DES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), .base.cra_alignmask = 0, @@ -2396,7 +2396,7 @@ static struct skcipher_engine_alg crypto_algs[] = { .base.cra_name = "ecb(des3_ede)", .base.cra_driver_name = "stm32-ecb-des3", .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .base.cra_blocksize = DES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), .base.cra_alignmask = 0, @@ -2418,7 +2418,7 @@ static struct skcipher_engine_alg 
crypto_algs[] = { .base.cra_name = "cbc(des3_ede)", .base.cra_driver_name = "stm32-cbc-des3", .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .base.cra_blocksize = DES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), .base.cra_alignmask = 0, @@ -2452,7 +2452,7 @@ static struct aead_engine_alg aead_algs[] = { .cra_name = "gcm(aes)", .cra_driver_name = "stm32-gcm-aes", .cra_priority = 300, - .cra_flags = CRYPTO_ALG_ASYNC, + .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct stm32_cryp_ctx), .cra_alignmask = 0, @@ -2475,7 +2475,7 @@ static struct aead_engine_alg aead_algs[] = { .cra_name = "ccm(aes)", .cra_driver_name = "stm32-ccm-aes", .cra_priority = 300, - .cra_flags = CRYPTO_ALG_ASYNC, + .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct stm32_cryp_ctx), .cra_alignmask = 0, -- cgit From 56ddb9aa3b324c2d9645b5a7343e46010cf3f6ce Mon Sep 17 00:00:00 2001 From: Maxime Méré Date: Tue, 28 May 2024 16:05:48 +0200 Subject: crypto: stm32/cryp - call finalize with bh disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The finalize operation in interrupt mode produce a produces a spinlock recursion warning. The reason is the fact that BH must be disabled during this process. Signed-off-by: Maxime Méré Signed-off-by: Herbert Xu --- drivers/crypto/stm32/stm32-cryp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/stm32/stm32-cryp.c b/drivers/crypto/stm32/stm32-cryp.c index 445276b848ed..937f6dab8955 100644 --- a/drivers/crypto/stm32/stm32-cryp.c +++ b/drivers/crypto/stm32/stm32-cryp.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -2203,8 +2204,11 @@ static irqreturn_t stm32_cryp_irq_thread(int irq, void *arg) it_mask &= ~IMSCR_OUT; stm32_cryp_write(cryp, cryp->caps->imsc, it_mask); - if (!cryp->payload_in && !cryp->header_in && !cryp->payload_out) + if (!cryp->payload_in && !cryp->header_in && !cryp->payload_out) { + local_bh_disable(); stm32_cryp_finish_req(cryp, 0); + local_bh_enable(); + } return IRQ_HANDLED; } -- cgit From 8609dd25f9b271b3338e38b018fd39e49de1e6ca Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 28 May 2024 16:07:08 -0500 Subject: crypto: ccp - Represent capabilities register as a union Making the capabilities register a union makes it easier to refer to the members instead of always doing bit shifts. No intended functional changes. 
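For readers less familiar with the idiom, a minimal standalone sketch of the
raw-word-plus-bitfields pattern is shown below. The union name, field names and
widths are illustrative only; the authoritative layout is the union
psp_cap_register added by the diff that follows. Bitfield ordering is
compiler/ABI dependent, which is why this style is normally confined to code
tied to a specific platform, as it is here.

  #include <stdint.h>
  #include <stdio.h>

  /* Illustrative sketch, not the exact PSP register layout. */
  union cap_register {
          uint32_t raw;                     /* whole register value */
          struct {
                  uint32_t sev          :1, /* bit 0 */
                           tee          :1, /* bit 1 */
                           dbc_thru_ext :1, /* bit 2 */
                           rsvd         :29;
          };
  };

  int main(void)
  {
          union cap_register cap = { .raw = 0x3 }; /* bits 0 and 1 set */

          /* Callers test cap.tee instead of (cap.raw & (1u << 1)). */
          printf("sev=%u tee=%u dbc_thru_ext=%u\n",
                 cap.sev, cap.tee, cap.dbc_thru_ext);
          return 0;
  }
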
Acked-by: Tom Lendacky Suggested-by: Yazen Ghannam Signed-off-by: Mario Limonciello Signed-off-by: Herbert Xu --- drivers/crypto/ccp/dbc.c | 2 +- drivers/crypto/ccp/psp-dev.c | 11 +++++------ drivers/crypto/ccp/psp-dev.h | 44 ++++++++++++++++++++++++-------------------- drivers/crypto/ccp/sp-dev.h | 1 - drivers/crypto/ccp/sp-pci.c | 26 ++++++++++++-------------- 5 files changed, 42 insertions(+), 42 deletions(-) diff --git a/drivers/crypto/ccp/dbc.c b/drivers/crypto/ccp/dbc.c index d373caab52f8..5b105a23f699 100644 --- a/drivers/crypto/ccp/dbc.c +++ b/drivers/crypto/ccp/dbc.c @@ -223,7 +223,7 @@ int dbc_dev_init(struct psp_device *psp) dbc_dev->dev = dev; dbc_dev->psp = psp; - if (PSP_CAPABILITY(psp, DBC_THRU_EXT)) { + if (psp->capability.dbc_thru_ext) { dbc_dev->use_ext = true; dbc_dev->payload_size = &dbc_dev->mbox->ext_req.header.payload_size; dbc_dev->result = &dbc_dev->mbox->ext_req.header.status; diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 56bf832c2947..7d9d2042be35 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -154,11 +154,10 @@ static unsigned int psp_get_capability(struct psp_device *psp) dev_notice(psp->dev, "psp: unable to access the device: you might be running a broken BIOS.\n"); return -ENODEV; } - psp->capability = val; + psp->capability.raw = val; /* Detect TSME and/or SME status */ - if (PSP_CAPABILITY(psp, PSP_SECURITY_REPORTING) && - psp->capability & (PSP_SECURITY_TSME_STATUS << PSP_CAPABILITY_PSP_SECURITY_OFFSET)) { + if (psp->capability.security_reporting && psp->capability.tsme_status) { if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) dev_notice(psp->dev, "psp: Both TSME and SME are active, SME is unnecessary when TSME is active.\n"); else @@ -171,7 +170,7 @@ static unsigned int psp_get_capability(struct psp_device *psp) static int psp_check_sev_support(struct psp_device *psp) { /* Check if device supports SEV feature */ - if (!PSP_CAPABILITY(psp, SEV)) { + if (!psp->capability.sev) { dev_dbg(psp->dev, "psp does not support SEV\n"); return -ENODEV; } @@ -182,7 +181,7 @@ static int psp_check_sev_support(struct psp_device *psp) static int psp_check_tee_support(struct psp_device *psp) { /* Check if device supports TEE feature */ - if (!PSP_CAPABILITY(psp, TEE)) { + if (!psp->capability.tee) { dev_dbg(psp->dev, "psp does not support TEE\n"); return -ENODEV; } @@ -214,7 +213,7 @@ static int psp_init(struct psp_device *psp) /* dbc must come after platform access as it tests the feature */ if (PSP_FEATURE(psp, DBC) || - PSP_CAPABILITY(psp, DBC_THRU_EXT)) { + psp->capability.dbc_thru_ext) { ret = dbc_dev_init(psp); if (ret) return ret; diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h index ae582ba63729..02a7c94c02df 100644 --- a/drivers/crypto/ccp/psp-dev.h +++ b/drivers/crypto/ccp/psp-dev.h @@ -26,6 +26,29 @@ extern struct psp_device *psp_master; typedef void (*psp_irq_handler_t)(int, void *, unsigned int); +union psp_cap_register { + unsigned int raw; + struct { + unsigned int sev :1, + tee :1, + dbc_thru_ext :1, + rsvd1 :4, + security_reporting :1, + fused_part :1, + rsvd2 :1, + debug_lock_on :1, + rsvd3 :2, + tsme_status :1, + rsvd4 :1, + anti_rollback_status :1, + rpmc_production_enabled :1, + rpmc_spirom_available :1, + hsp_tpm_available :1, + rom_armor_enforced :1, + rsvd5 :12; + }; +}; + struct psp_device { struct list_head entry; @@ -46,7 +69,7 @@ struct psp_device { void *platform_access_data; void *dbc_data; - unsigned int capability; + union psp_cap_register capability; }; 
void psp_set_sev_irq_handler(struct psp_device *psp, psp_irq_handler_t handler, @@ -55,26 +78,7 @@ void psp_clear_sev_irq_handler(struct psp_device *psp); struct psp_device *psp_get_master_device(void); -#define PSP_CAPABILITY_SEV BIT(0) -#define PSP_CAPABILITY_TEE BIT(1) -#define PSP_CAPABILITY_DBC_THRU_EXT BIT(2) -#define PSP_CAPABILITY_PSP_SECURITY_REPORTING BIT(7) - #define PSP_CAPABILITY_PSP_SECURITY_OFFSET 8 -/* - * The PSP doesn't directly store these bits in the capability register - * but instead copies them from the results of query command. - * - * The offsets from the query command are below, and shifted when used. - */ -#define PSP_SECURITY_FUSED_PART BIT(0) -#define PSP_SECURITY_DEBUG_LOCK_ON BIT(2) -#define PSP_SECURITY_TSME_STATUS BIT(5) -#define PSP_SECURITY_ANTI_ROLLBACK_STATUS BIT(7) -#define PSP_SECURITY_RPMC_PRODUCTION_ENABLED BIT(8) -#define PSP_SECURITY_RPMC_SPIROM_AVAILABLE BIT(9) -#define PSP_SECURITY_HSP_TPM_AVAILABLE BIT(10) -#define PSP_SECURITY_ROM_ARMOR_ENFORCED BIT(11) /** * enum psp_cmd - PSP mailbox commands diff --git a/drivers/crypto/ccp/sp-dev.h b/drivers/crypto/ccp/sp-dev.h index 03d5b9e04084..c4e125efe6c7 100644 --- a/drivers/crypto/ccp/sp-dev.h +++ b/drivers/crypto/ccp/sp-dev.h @@ -30,7 +30,6 @@ #define PLATFORM_FEATURE_DBC 0x1 -#define PSP_CAPABILITY(psp, cap) (psp->capability & PSP_CAPABILITY_##cap) #define PSP_FEATURE(psp, feat) (psp->vdata && psp->vdata->platform_features & PLATFORM_FEATURE_##feat) /* Structure to hold CCP device data */ diff --git a/drivers/crypto/ccp/sp-pci.c b/drivers/crypto/ccp/sp-pci.c index 300dda14182b..b57392292af1 100644 --- a/drivers/crypto/ccp/sp-pci.c +++ b/drivers/crypto/ccp/sp-pci.c @@ -39,31 +39,30 @@ struct sp_pci { }; static struct sp_device *sp_dev_master; -#define security_attribute_show(name, def) \ +#define security_attribute_show(name) \ static ssize_t name##_show(struct device *d, struct device_attribute *attr, \ char *buf) \ { \ struct sp_device *sp = dev_get_drvdata(d); \ struct psp_device *psp = sp->psp_data; \ - int bit = PSP_SECURITY_##def << PSP_CAPABILITY_PSP_SECURITY_OFFSET; \ - return sysfs_emit(buf, "%d\n", (psp->capability & bit) > 0); \ + return sysfs_emit(buf, "%d\n", psp->capability.name); \ } -security_attribute_show(fused_part, FUSED_PART) +security_attribute_show(fused_part) static DEVICE_ATTR_RO(fused_part); -security_attribute_show(debug_lock_on, DEBUG_LOCK_ON) +security_attribute_show(debug_lock_on) static DEVICE_ATTR_RO(debug_lock_on); -security_attribute_show(tsme_status, TSME_STATUS) +security_attribute_show(tsme_status) static DEVICE_ATTR_RO(tsme_status); -security_attribute_show(anti_rollback_status, ANTI_ROLLBACK_STATUS) +security_attribute_show(anti_rollback_status) static DEVICE_ATTR_RO(anti_rollback_status); -security_attribute_show(rpmc_production_enabled, RPMC_PRODUCTION_ENABLED) +security_attribute_show(rpmc_production_enabled) static DEVICE_ATTR_RO(rpmc_production_enabled); -security_attribute_show(rpmc_spirom_available, RPMC_SPIROM_AVAILABLE) +security_attribute_show(rpmc_spirom_available) static DEVICE_ATTR_RO(rpmc_spirom_available); -security_attribute_show(hsp_tpm_available, HSP_TPM_AVAILABLE) +security_attribute_show(hsp_tpm_available) static DEVICE_ATTR_RO(hsp_tpm_available); -security_attribute_show(rom_armor_enforced, ROM_ARMOR_ENFORCED) +security_attribute_show(rom_armor_enforced) static DEVICE_ATTR_RO(rom_armor_enforced); static struct attribute *psp_security_attrs[] = { @@ -84,7 +83,7 @@ static umode_t psp_security_is_visible(struct kobject *kobj, struct attribute 
*a struct sp_device *sp = dev_get_drvdata(dev); struct psp_device *psp = sp->psp_data; - if (psp && PSP_CAPABILITY(psp, PSP_SECURITY_REPORTING)) + if (psp && psp->capability.security_reporting) return 0444; return 0; @@ -134,8 +133,7 @@ static umode_t psp_firmware_is_visible(struct kobject *kobj, struct attribute *a psp->vdata->bootloader_info_reg) val = ioread32(psp->io_regs + psp->vdata->bootloader_info_reg); - if (attr == &dev_attr_tee_version.attr && - PSP_CAPABILITY(psp, TEE) && + if (attr == &dev_attr_tee_version.attr && psp->capability.tee && psp->vdata->tee->info_reg) val = ioread32(psp->io_regs + psp->vdata->tee->info_reg); -- cgit From 56e0d883735002c506e73fa1f1197f3959fc7f0c Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 28 May 2024 16:07:09 -0500 Subject: crypto: ccp - Move security attributes to their own file To prepare for other code that will manipulate security attributes move the handling code out of sp-pci.c. No intended functional changes. Signed-off-by: Mario Limonciello Acked-by: Tom Lendacky Signed-off-by: Herbert Xu --- MAINTAINERS | 6 ++++ drivers/crypto/ccp/Makefile | 3 +- drivers/crypto/ccp/hsti.c | 68 ++++++++++++++++++++++++++++++++++++++++++++ drivers/crypto/ccp/hsti.h | 15 ++++++++++ drivers/crypto/ccp/psp-dev.c | 1 + drivers/crypto/ccp/sp-pci.c | 58 ++----------------------------------- 6 files changed, 95 insertions(+), 56 deletions(-) create mode 100644 drivers/crypto/ccp/hsti.c create mode 100644 drivers/crypto/ccp/hsti.h diff --git a/MAINTAINERS b/MAINTAINERS index d6c90161c7bf..883fb3b246b6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -991,6 +991,12 @@ F: include/uapi/linux/psp-dbc.h F: tools/crypto/ccp/*.c F: tools/crypto/ccp/*.py +AMD CRYPTOGRAPHIC COPROCESSOR (CCP) DRIVER - HSTI SUPPORT +M: Mario Limonciello +L: linux-crypto@vger.kernel.org +S: Supported +F: drivers/crypto/ccp/hsti.* + AMD DISPLAY CORE M: Harry Wentland M: Leo Li diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile index aa0ba2d17e1e..394484929dae 100644 --- a/drivers/crypto/ccp/Makefile +++ b/drivers/crypto/ccp/Makefile @@ -12,7 +12,8 @@ ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += psp-dev.o \ sev-dev.o \ tee-dev.o \ platform-access.o \ - dbc.o + dbc.o \ + hsti.o obj-$(CONFIG_CRYPTO_DEV_CCP_CRYPTO) += ccp-crypto.o ccp-crypto-objs := ccp-crypto-main.o \ diff --git a/drivers/crypto/ccp/hsti.c b/drivers/crypto/ccp/hsti.c new file mode 100644 index 000000000000..076c1d175b2b --- /dev/null +++ b/drivers/crypto/ccp/hsti.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Secure Processor device driver, security attributes + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. 
+ * + * Author: Mario Limonciello + */ + +#include + +#include "psp-dev.h" +#include "hsti.h" + +#define security_attribute_show(name) \ +static ssize_t name##_show(struct device *d, struct device_attribute *attr, \ + char *buf) \ +{ \ + struct sp_device *sp = dev_get_drvdata(d); \ + struct psp_device *psp = sp->psp_data; \ + return sysfs_emit(buf, "%d\n", psp->capability.name); \ +} + +security_attribute_show(fused_part) +static DEVICE_ATTR_RO(fused_part); +security_attribute_show(debug_lock_on) +static DEVICE_ATTR_RO(debug_lock_on); +security_attribute_show(tsme_status) +static DEVICE_ATTR_RO(tsme_status); +security_attribute_show(anti_rollback_status) +static DEVICE_ATTR_RO(anti_rollback_status); +security_attribute_show(rpmc_production_enabled) +static DEVICE_ATTR_RO(rpmc_production_enabled); +security_attribute_show(rpmc_spirom_available) +static DEVICE_ATTR_RO(rpmc_spirom_available); +security_attribute_show(hsp_tpm_available) +static DEVICE_ATTR_RO(hsp_tpm_available); +security_attribute_show(rom_armor_enforced) +static DEVICE_ATTR_RO(rom_armor_enforced); + +static struct attribute *psp_security_attrs[] = { + &dev_attr_fused_part.attr, + &dev_attr_debug_lock_on.attr, + &dev_attr_tsme_status.attr, + &dev_attr_anti_rollback_status.attr, + &dev_attr_rpmc_production_enabled.attr, + &dev_attr_rpmc_spirom_available.attr, + &dev_attr_hsp_tpm_available.attr, + &dev_attr_rom_armor_enforced.attr, + NULL +}; + +static umode_t psp_security_is_visible(struct kobject *kobj, struct attribute *attr, int idx) +{ + struct device *dev = kobj_to_dev(kobj); + struct sp_device *sp = dev_get_drvdata(dev); + struct psp_device *psp = sp->psp_data; + + if (psp && psp->capability.security_reporting) + return 0444; + + return 0; +} + +struct attribute_group psp_security_attr_group = { + .attrs = psp_security_attrs, + .is_visible = psp_security_is_visible, +}; diff --git a/drivers/crypto/ccp/hsti.h b/drivers/crypto/ccp/hsti.h new file mode 100644 index 000000000000..e5c5ceab9973 --- /dev/null +++ b/drivers/crypto/ccp/hsti.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * AMD Secure Processor device driver, security attributes + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. 
+ * + * Author: Mario Limonciello + */ + +#ifndef __HSTI_H +#define __HSTI_H + +extern struct attribute_group psp_security_attr_group; + +#endif /* __HSTI_H */ diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 7d9d2042be35..1a7b991c27f7 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -19,6 +19,7 @@ #include "tee-dev.h" #include "platform-access.h" #include "dbc.h" +#include "hsti.h" struct psp_device *psp_master; diff --git a/drivers/crypto/ccp/sp-pci.c b/drivers/crypto/ccp/sp-pci.c index b57392292af1..dd31e791156d 100644 --- a/drivers/crypto/ccp/sp-pci.c +++ b/drivers/crypto/ccp/sp-pci.c @@ -24,6 +24,7 @@ #include "ccp-dev.h" #include "psp-dev.h" +#include "hsti.h" /* used for version string AA.BB.CC.DD */ #define AA GENMASK(31, 24) @@ -39,61 +40,6 @@ struct sp_pci { }; static struct sp_device *sp_dev_master; -#define security_attribute_show(name) \ -static ssize_t name##_show(struct device *d, struct device_attribute *attr, \ - char *buf) \ -{ \ - struct sp_device *sp = dev_get_drvdata(d); \ - struct psp_device *psp = sp->psp_data; \ - return sysfs_emit(buf, "%d\n", psp->capability.name); \ -} - -security_attribute_show(fused_part) -static DEVICE_ATTR_RO(fused_part); -security_attribute_show(debug_lock_on) -static DEVICE_ATTR_RO(debug_lock_on); -security_attribute_show(tsme_status) -static DEVICE_ATTR_RO(tsme_status); -security_attribute_show(anti_rollback_status) -static DEVICE_ATTR_RO(anti_rollback_status); -security_attribute_show(rpmc_production_enabled) -static DEVICE_ATTR_RO(rpmc_production_enabled); -security_attribute_show(rpmc_spirom_available) -static DEVICE_ATTR_RO(rpmc_spirom_available); -security_attribute_show(hsp_tpm_available) -static DEVICE_ATTR_RO(hsp_tpm_available); -security_attribute_show(rom_armor_enforced) -static DEVICE_ATTR_RO(rom_armor_enforced); - -static struct attribute *psp_security_attrs[] = { - &dev_attr_fused_part.attr, - &dev_attr_debug_lock_on.attr, - &dev_attr_tsme_status.attr, - &dev_attr_anti_rollback_status.attr, - &dev_attr_rpmc_production_enabled.attr, - &dev_attr_rpmc_spirom_available.attr, - &dev_attr_hsp_tpm_available.attr, - &dev_attr_rom_armor_enforced.attr, - NULL -}; - -static umode_t psp_security_is_visible(struct kobject *kobj, struct attribute *attr, int idx) -{ - struct device *dev = kobj_to_dev(kobj); - struct sp_device *sp = dev_get_drvdata(dev); - struct psp_device *psp = sp->psp_data; - - if (psp && psp->capability.security_reporting) - return 0444; - - return 0; -} - -static struct attribute_group psp_security_attr_group = { - .attrs = psp_security_attrs, - .is_visible = psp_security_is_visible, -}; - #define version_attribute_show(name, _offset) \ static ssize_t name##_show(struct device *d, struct device_attribute *attr, \ char *buf) \ @@ -150,7 +96,9 @@ static struct attribute_group psp_firmware_attr_group = { }; static const struct attribute_group *psp_groups[] = { +#ifdef CONFIG_CRYPTO_DEV_SP_PSP &psp_security_attr_group, +#endif &psp_firmware_attr_group, NULL, }; -- cgit From b4100947a8014e1876872546398275968f77e1ad Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 28 May 2024 16:07:10 -0500 Subject: crypto: ccp - align psp_platform_access_msg Align the whitespace so that future messages will also be better aligned. 
Acked-by: Tom Lendacky Signed-off-by: Mario Limonciello Signed-off-by: Herbert Xu --- include/linux/psp-platform-access.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/psp-platform-access.h b/include/linux/psp-platform-access.h index c1dc87fc536b..23893b33e48c 100644 --- a/include/linux/psp-platform-access.h +++ b/include/linux/psp-platform-access.h @@ -6,8 +6,8 @@ #include enum psp_platform_access_msg { - PSP_CMD_NONE = 0x0, - PSP_I2C_REQ_BUS_CMD = 0x64, + PSP_CMD_NONE = 0x0, + PSP_I2C_REQ_BUS_CMD = 0x64, PSP_DYNAMIC_BOOST_GET_NONCE, PSP_DYNAMIC_BOOST_SET_UID, PSP_DYNAMIC_BOOST_GET_PARAMETER, -- cgit From 82f9327f774c6e040ba63a887a85fa2e290a233a Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 28 May 2024 16:07:11 -0500 Subject: crypto: ccp - Add support for getting security attributes on some older systems Older systems will not populate the security attributes in the capabilities register. The PSP on these systems, however, does have a command to get the security attributes. Use this command during ccp startup to populate the attributes if they're missing. Closes: https://github.com/fwupd/fwupd/issues/5284 Closes: https://github.com/fwupd/fwupd/issues/5675 Closes: https://github.com/fwupd/fwupd/issues/6253 Closes: https://github.com/fwupd/fwupd/issues/7280 Closes: https://github.com/fwupd/fwupd/issues/6323 Closes: https://github.com/fwupd/fwupd/discussions/5433 Signed-off-by: Mario Limonciello Acked-by: Tom Lendacky Signed-off-by: Herbert Xu --- drivers/crypto/ccp/hsti.c | 55 +++++++++++++++++++++++++++++++++++++ drivers/crypto/ccp/hsti.h | 2 ++ drivers/crypto/ccp/psp-dev.c | 5 ++++ drivers/crypto/ccp/psp-dev.h | 2 -- drivers/crypto/ccp/sp-dev.h | 1 + drivers/crypto/ccp/sp-pci.c | 5 +++- include/linux/psp-platform-access.h | 1 + 7 files changed, 68 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/ccp/hsti.c b/drivers/crypto/ccp/hsti.c index 076c1d175b2b..8b99bbd4efe2 100644 --- a/drivers/crypto/ccp/hsti.c +++ b/drivers/crypto/ccp/hsti.c @@ -12,6 +12,13 @@ #include "psp-dev.h" #include "hsti.h" +#define PSP_CAPABILITY_PSP_SECURITY_OFFSET 8 + +struct hsti_request { + struct psp_req_buffer_hdr header; + u32 hsti; +} __packed; + #define security_attribute_show(name) \ static ssize_t name##_show(struct device *d, struct device_attribute *attr, \ char *buf) \ @@ -66,3 +73,51 @@ struct attribute_group psp_security_attr_group = { .attrs = psp_security_attrs, .is_visible = psp_security_is_visible, }; + +static int psp_poulate_hsti(struct psp_device *psp) +{ + struct hsti_request *req; + int ret; + + /* Are the security attributes already reported? 
*/ + if (psp->capability.security_reporting) + return 0; + + /* Allocate command-response buffer */ + req = kzalloc(sizeof(*req), GFP_KERNEL | __GFP_ZERO); + if (!req) + return -ENOMEM; + + req->header.payload_size = sizeof(req); + + ret = psp_send_platform_access_msg(PSP_CMD_HSTI_QUERY, (struct psp_request *)req); + if (ret) + goto out; + + if (req->header.status != 0) { + dev_dbg(psp->dev, "failed to populate HSTI state: %d\n", req->header.status); + ret = -EINVAL; + goto out; + } + + psp->capability.security_reporting = 1; + psp->capability.raw |= req->hsti << PSP_CAPABILITY_PSP_SECURITY_OFFSET; + +out: + kfree(req); + + return ret; +} + +int psp_init_hsti(struct psp_device *psp) +{ + int ret; + + if (PSP_FEATURE(psp, HSTI)) { + ret = psp_poulate_hsti(psp); + if (ret) + return ret; + } + + return 0; +} diff --git a/drivers/crypto/ccp/hsti.h b/drivers/crypto/ccp/hsti.h index e5c5ceab9973..6a70f922d2c4 100644 --- a/drivers/crypto/ccp/hsti.h +++ b/drivers/crypto/ccp/hsti.h @@ -12,4 +12,6 @@ extern struct attribute_group psp_security_attr_group; +int psp_init_hsti(struct psp_device *psp); + #endif /* __HSTI_H */ diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 1a7b991c27f7..0a01ad134609 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -220,6 +220,11 @@ static int psp_init(struct psp_device *psp) return ret; } + /* HSTI uses platform access on some systems. */ + ret = psp_init_hsti(psp); + if (ret) + return ret; + return 0; } diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h index 02a7c94c02df..e43ce87ede76 100644 --- a/drivers/crypto/ccp/psp-dev.h +++ b/drivers/crypto/ccp/psp-dev.h @@ -78,8 +78,6 @@ void psp_clear_sev_irq_handler(struct psp_device *psp); struct psp_device *psp_get_master_device(void); -#define PSP_CAPABILITY_PSP_SECURITY_OFFSET 8 - /** * enum psp_cmd - PSP mailbox commands * @PSP_CMD_TEE_RING_INIT: Initialize TEE ring buffer diff --git a/drivers/crypto/ccp/sp-dev.h b/drivers/crypto/ccp/sp-dev.h index c4e125efe6c7..0895de823674 100644 --- a/drivers/crypto/ccp/sp-dev.h +++ b/drivers/crypto/ccp/sp-dev.h @@ -29,6 +29,7 @@ #define CACHE_WB_NO_ALLOC 0xb7 #define PLATFORM_FEATURE_DBC 0x1 +#define PLATFORM_FEATURE_HSTI 0x2 #define PSP_FEATURE(psp, feat) (psp->vdata && psp->vdata->platform_features & PLATFORM_FEATURE_##feat) diff --git a/drivers/crypto/ccp/sp-pci.c b/drivers/crypto/ccp/sp-pci.c index dd31e791156d..248d98fd8c48 100644 --- a/drivers/crypto/ccp/sp-pci.c +++ b/drivers/crypto/ccp/sp-pci.c @@ -397,10 +397,12 @@ static const struct psp_vdata pspv1 = { static const struct psp_vdata pspv2 = { .sev = &sevv2, + .platform_access = &pa_v1, .bootloader_info_reg = 0x109ec, /* C2PMSG_59 */ .feature_reg = 0x109fc, /* C2PMSG_63 */ .inten_reg = 0x10690, /* P2CMSG_INTEN */ .intsts_reg = 0x10694, /* P2CMSG_INTSTS */ + .platform_features = PLATFORM_FEATURE_HSTI, }; static const struct psp_vdata pspv3 = { @@ -413,7 +415,8 @@ static const struct psp_vdata pspv3 = { .feature_reg = 0x109fc, /* C2PMSG_63 */ .inten_reg = 0x10690, /* P2CMSG_INTEN */ .intsts_reg = 0x10694, /* P2CMSG_INTSTS */ - .platform_features = PLATFORM_FEATURE_DBC, + .platform_features = PLATFORM_FEATURE_DBC | + PLATFORM_FEATURE_HSTI, }; static const struct psp_vdata pspv4 = { diff --git a/include/linux/psp-platform-access.h b/include/linux/psp-platform-access.h index 23893b33e48c..1504fb012c05 100644 --- a/include/linux/psp-platform-access.h +++ b/include/linux/psp-platform-access.h @@ -7,6 +7,7 @@ enum psp_platform_access_msg { 
PSP_CMD_NONE = 0x0, + PSP_CMD_HSTI_QUERY = 0x14, PSP_I2C_REQ_BUS_CMD = 0x64, PSP_DYNAMIC_BOOST_GET_NONCE, PSP_DYNAMIC_BOOST_SET_UID, -- cgit From 059b1352519d1f2f856d80ae6fe11e11891869d2 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 28 May 2024 16:07:12 -0500 Subject: crypto: ccp - Move message about TSME being enabled later in init Some of the security attributes data is now populated from an HSTI command on some processors, so show the message after it has been populated. Signed-off-by: Mario Limonciello Acked-by: Tom Lendacky Signed-off-by: Herbert Xu --- drivers/crypto/ccp/hsti.c | 15 +++++++++++++++ drivers/crypto/ccp/psp-dev.c | 8 -------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/crypto/ccp/hsti.c b/drivers/crypto/ccp/hsti.c index 8b99bbd4efe2..1b39a4fb55c0 100644 --- a/drivers/crypto/ccp/hsti.c +++ b/drivers/crypto/ccp/hsti.c @@ -119,5 +119,20 @@ int psp_init_hsti(struct psp_device *psp) return ret; } + /* + * At this stage, if security information hasn't been populated by + * either the PSP or by the driver through the platform command, + * then there is nothing more to do. + */ + if (!psp->capability.security_reporting) + return 0; + + if (psp->capability.tsme_status) { + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + dev_notice(psp->dev, "psp: Both TSME and SME are active, SME is unnecessary when TSME is active.\n"); + else + dev_notice(psp->dev, "psp: TSME enabled\n"); + } + return 0; } diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 0a01ad134609..1c5a7189631e 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -157,14 +157,6 @@ static unsigned int psp_get_capability(struct psp_device *psp) } psp->capability.raw = val; - /* Detect TSME and/or SME status */ - if (psp->capability.security_reporting && psp->capability.tsme_status) { - if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) - dev_notice(psp->dev, "psp: Both TSME and SME are active, SME is unnecessary when TSME is active.\n"); - else - dev_notice(psp->dev, "psp: TSME enabled\n"); - } - return 0; } -- cgit From 645211db1394f0607935dd43d907af7a12631907 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Tue, 28 May 2024 16:49:23 -0700 Subject: crypto: lib - add missing MODULE_DESCRIPTION() macros Fix the allmodconfig 'make W=1' warnings: WARNING: modpost: missing MODULE_DESCRIPTION() in lib/crypto/libchacha.o WARNING: modpost: missing MODULE_DESCRIPTION() in lib/crypto/libarc4.o WARNING: modpost: missing MODULE_DESCRIPTION() in lib/crypto/libdes.o WARNING: modpost: missing MODULE_DESCRIPTION() in lib/crypto/libpoly1305.o Signed-off-by: Jeff Johnson Signed-off-by: Herbert Xu --- lib/crypto/arc4.c | 1 + lib/crypto/des.c | 1 + lib/crypto/libchacha.c | 1 + lib/crypto/poly1305.c | 1 + 4 files changed, 4 insertions(+) diff --git a/lib/crypto/arc4.c b/lib/crypto/arc4.c index c2020f19c652..838812d18216 100644 --- a/lib/crypto/arc4.c +++ b/lib/crypto/arc4.c @@ -71,4 +71,5 @@ void arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len) } EXPORT_SYMBOL(arc4_crypt); +MODULE_DESCRIPTION("ARC4 Cipher Algorithm"); MODULE_LICENSE("GPL"); diff --git a/lib/crypto/des.c b/lib/crypto/des.c index ef5bb8822aba..9518658b97cf 100644 --- a/lib/crypto/des.c +++ b/lib/crypto/des.c @@ -899,4 +899,5 @@ void des3_ede_decrypt(const struct des3_ede_ctx *dctx, u8 *dst, const u8 *src) } EXPORT_SYMBOL_GPL(des3_ede_decrypt); +MODULE_DESCRIPTION("DES & Triple DES EDE Cipher Algorithms"); MODULE_LICENSE("GPL"); diff --git a/lib/crypto/libchacha.c 
b/lib/crypto/libchacha.c index dabc3accae05..cc1be0496eb9 100644 --- a/lib/crypto/libchacha.c +++ b/lib/crypto/libchacha.c @@ -32,4 +32,5 @@ void chacha_crypt_generic(u32 *state, u8 *dst, const u8 *src, } EXPORT_SYMBOL(chacha_crypt_generic); +MODULE_DESCRIPTION("ChaCha stream cipher (RFC7539)"); MODULE_LICENSE("GPL"); diff --git a/lib/crypto/poly1305.c b/lib/crypto/poly1305.c index 26d87fc3823e..5d8378d23e95 100644 --- a/lib/crypto/poly1305.c +++ b/lib/crypto/poly1305.c @@ -76,3 +76,4 @@ EXPORT_SYMBOL_GPL(poly1305_final_generic); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Willi "); +MODULE_DESCRIPTION("Poly1305 authenticator algorithm, RFC7539"); -- cgit From 2fd2a82ccbfc106aec314db6c4bda5e24fd32a22 Mon Sep 17 00:00:00 2001 From: Stefan Berger Date: Wed, 29 May 2024 19:08:26 -0400 Subject: crypto: ecdsa - Use ecc_digits_from_bytes to create hash digits array Since ecc_digits_from_bytes will provide zeros when an insufficient number of bytes are passed in the input byte array, use it to create the hash digits directly from the input byte array. This avoids going through an intermediate byte array (rawhash) that has the first few bytes filled with zeros. Signed-off-by: Stefan Berger Reviewed-by: Jarkko Sakkinen Signed-off-by: Herbert Xu --- crypto/ecdsa.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/crypto/ecdsa.c b/crypto/ecdsa.c index 55114146ff84..daab3c2b87c1 100644 --- a/crypto/ecdsa.c +++ b/crypto/ecdsa.c @@ -142,10 +142,8 @@ static int ecdsa_verify(struct akcipher_request *req) struct ecdsa_signature_ctx sig_ctx = { .curve = ctx->curve, }; - u8 rawhash[ECC_MAX_BYTES]; u64 hash[ECC_MAX_DIGITS]; unsigned char *buffer; - ssize_t diff; int ret; if (unlikely(!ctx->pub_key_set)) @@ -164,18 +162,11 @@ static int ecdsa_verify(struct akcipher_request *req) if (ret < 0) goto error; - /* if the hash is shorter then we will add leading zeros to fit to ndigits */ - diff = bufsize - req->dst_len; - if (diff >= 0) { - if (diff) - memset(rawhash, 0, diff); - memcpy(&rawhash[diff], buffer + req->src_len, req->dst_len); - } else if (diff < 0) { - /* given hash is longer, we take the left-most bytes */ - memcpy(&rawhash, buffer + req->src_len, bufsize); - } + if (bufsize > req->dst_len) + bufsize = req->dst_len; - ecc_swap_digits((u64 *)rawhash, hash, ctx->curve->g.ndigits); + ecc_digits_from_bytes(buffer + req->src_len, bufsize, + hash, ctx->curve->g.ndigits); ret = _ecdsa_verify(ctx, hash, sig_ctx.r, sig_ctx.s); -- cgit From 546ce0bdc91afd9f5c4c67d9fc4733e0fc7086d1 Mon Sep 17 00:00:00 2001 From: Stefan Berger Date: Wed, 29 May 2024 19:08:27 -0400 Subject: crypto: ecdsa - Use ecc_digits_from_bytes to convert signature Since ecc_digits_from_bytes will provide zeros when an insufficient number of bytes are passed in the input byte array, use it to convert the r and s components of the signature to digits directly from the input byte array. This avoids going through an intermediate byte array that has the first few bytes filled with zeros. 
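The behaviour both of these ecdsa patches rely on — big-endian input bytes,
zero-extended on the left, written out as least-significant-digit-first 64-bit
words — can be modelled in a few lines of standalone C. The sketch below is
only a model of the semantics described above, with made-up names; it is not
the kernel's ecc_digits_from_bytes() implementation:

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  /* Model: 'in' is a big-endian integer of 'nbytes' bytes
   * (nbytes <= ndigits * 8); 'out' receives ndigits 64-bit digits,
   * least significant digit first, missing leading bytes taken as zero.
   */
  static void digits_from_bytes(const uint8_t *in, size_t nbytes,
                                uint64_t *out, size_t ndigits)
  {
          memset(out, 0, ndigits * sizeof(*out));
          for (size_t i = 0; i < nbytes; i++) {
                  size_t bit = 8 * (nbytes - 1 - i); /* weight of in[i] */

                  out[bit / 64] |= (uint64_t)in[i] << (bit % 64);
          }
  }

  int main(void)
  {
          /* A short 3-byte hash fed into a 4-digit (256-bit) curve:
           * the leading zeros are implicit, no padded buffer is needed.
           */
          uint8_t hash[] = { 0x01, 0x02, 0x03 };
          uint64_t digits[4];

          digits_from_bytes(hash, sizeof(hash), digits, 4);
          printf("least significant digit: 0x%llx\n",
                 (unsigned long long)digits[0]); /* prints 0x10203 */
          return 0;
  }
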
Signed-off-by: Stefan Berger Reviewed-by: Jarkko Sakkinen Signed-off-by: Herbert Xu --- crypto/ecdsa.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/crypto/ecdsa.c b/crypto/ecdsa.c index daab3c2b87c1..d5a10959ec28 100644 --- a/crypto/ecdsa.c +++ b/crypto/ecdsa.c @@ -38,7 +38,6 @@ static int ecdsa_get_signature_rs(u64 *dest, size_t hdrlen, unsigned char tag, size_t bufsize = ndigits * sizeof(u64); ssize_t diff = vlen - bufsize; const char *d = value; - u8 rs[ECC_MAX_BYTES]; if (!value || !vlen) return -EINVAL; @@ -46,7 +45,7 @@ static int ecdsa_get_signature_rs(u64 *dest, size_t hdrlen, unsigned char tag, /* diff = 0: 'value' has exacly the right size * diff > 0: 'value' has too many bytes; one leading zero is allowed that * makes the value a positive integer; error on more - * diff < 0: 'value' is missing leading zeros, which we add + * diff < 0: 'value' is missing leading zeros */ if (diff > 0) { /* skip over leading zeros that make 'value' a positive int */ @@ -61,14 +60,7 @@ static int ecdsa_get_signature_rs(u64 *dest, size_t hdrlen, unsigned char tag, if (-diff >= bufsize) return -EINVAL; - if (diff) { - /* leading zeros not given in 'value' */ - memset(rs, 0, -diff); - } - - memcpy(&rs[-diff], d, vlen); - - ecc_swap_digits((u64 *)rs, dest, ndigits); + ecc_digits_from_bytes(d, vlen, dest, ndigits); return 0; } -- cgit From 3aeb1da092e875255c24c6a26be097448d70a756 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 30 May 2024 17:53:17 -0700 Subject: crypto: x86 - add missing MODULE_DESCRIPTION() macros On x86, make allmodconfig && make W=1 C=1 warns: WARNING: modpost: missing MODULE_DESCRIPTION() in arch/x86/crypto/crc32-pclmul.o WARNING: modpost: missing MODULE_DESCRIPTION() in arch/x86/crypto/curve25519-x86_64.o Add the missing MODULE_DESCRIPTION() macro invocations. Signed-off-by: Jeff Johnson Signed-off-by: Herbert Xu --- arch/x86/crypto/crc32-pclmul_glue.c | 1 + arch/x86/crypto/curve25519-x86_64.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index 98cf3b4e4c9f..9f5e342b9845 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -195,6 +195,7 @@ module_init(crc32_pclmul_mod_init); module_exit(crc32_pclmul_mod_fini); MODULE_AUTHOR("Alexander Boyko "); +MODULE_DESCRIPTION("CRC32 algorithm (IEEE 802.3) accelerated with PCLMULQDQ"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("crc32"); diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c index d55fa9e9b9e6..dcfc0de333de 100644 --- a/arch/x86/crypto/curve25519-x86_64.c +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -1720,5 +1720,6 @@ module_exit(curve25519_mod_exit); MODULE_ALIAS_CRYPTO("curve25519"); MODULE_ALIAS_CRYPTO("curve25519-x86"); +MODULE_DESCRIPTION("Curve25519 algorithm, ADX optimized"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Jason A. Donenfeld "); -- cgit From f134d5dce9c1f70fb522712b306644deb1e6eccd Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 31 May 2024 10:53:22 +0200 Subject: hwrng: stm32 - use pm_runtime_resume_and_get() include/linux/pm_runtime.h pm_runtime_get_sync() description suggests to ... consider using pm_runtime_resume_and_get() instead of it, especially if its return value is checked by the caller, as this is likely to result in cleaner code. This is indeed better, switch to pm_runtime_resume_and_get() which correctly suspends the device again in case of failure. 
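The difference between the two helpers is the error path, and it is easy to get
wrong, so the general calling pattern is sketched below. foo_read() and its
bare device pointer are placeholders for illustration, not stm32-rng code:

  #include <linux/pm_runtime.h>

  static int foo_read(struct device *dev)
  {
          int ret;

          /*
           * pm_runtime_resume_and_get() drops the usage count again by
           * itself when the resume fails, so a plain return is enough.
           * With pm_runtime_get_sync() the count stays raised even on
           * error and the caller would need pm_runtime_put_noidle(dev)
           * before bailing out.
           */
          ret = pm_runtime_resume_and_get(dev);
          if (ret)
                  return ret;

          /* ... device registers may safely be accessed here ... */

          pm_runtime_mark_last_busy(dev);
          pm_runtime_put_sync_autosuspend(dev);
          return 0;
  }
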
Also add error checking into the RNG driver in case pm_runtime_resume_and_get() does fail, which is currently not done, and it does detect sporadic -EACCES error return after resume, which would otherwise lead to a hang due to register access on un-resumed hardware. Now the read simply errors out and the system does not hang. Acked-by: Gatien Chevallier Signed-off-by: Marek Vasut Signed-off-by: Herbert Xu --- drivers/char/hw_random/stm32-rng.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/char/hw_random/stm32-rng.c b/drivers/char/hw_random/stm32-rng.c index 0e903d6e22e3..6dec4adc4985 100644 --- a/drivers/char/hw_random/stm32-rng.c +++ b/drivers/char/hw_random/stm32-rng.c @@ -187,7 +187,9 @@ static int stm32_rng_read(struct hwrng *rng, void *data, size_t max, bool wait) int retval = 0, err = 0; u32 sr; - pm_runtime_get_sync((struct device *) priv->rng.priv); + retval = pm_runtime_resume_and_get((struct device *)priv->rng.priv); + if (retval) + return retval; if (readl_relaxed(priv->base + RNG_SR) & RNG_SR_SEIS) stm32_rng_conceal_seed_error(rng); -- cgit From 771c7faa65fb72c1d8d0ffd2b504058b7d02d51f Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 31 May 2024 10:53:23 +0200 Subject: hwrng: stm32 - cache device pointer in struct stm32_rng_private Place device pointer in struct stm32_rng_private and use it all over the place to get rid of the horrible type casts throughout the driver. No functional change. Acked-by: Gatien Chevallier Signed-off-by: Marek Vasut Signed-off-by: Herbert Xu --- drivers/char/hw_random/stm32-rng.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/drivers/char/hw_random/stm32-rng.c b/drivers/char/hw_random/stm32-rng.c index 6dec4adc4985..d08c870eb8d1 100644 --- a/drivers/char/hw_random/stm32-rng.c +++ b/drivers/char/hw_random/stm32-rng.c @@ -70,6 +70,7 @@ struct stm32_rng_config { struct stm32_rng_private { struct hwrng rng; + struct device *dev; void __iomem *base; struct clk *clk; struct reset_control *rst; @@ -99,7 +100,7 @@ struct stm32_rng_private { */ static int stm32_rng_conceal_seed_error_cond_reset(struct stm32_rng_private *priv) { - struct device *dev = (struct device *)priv->rng.priv; + struct device *dev = priv->dev; u32 sr = readl_relaxed(priv->base + RNG_SR); u32 cr = readl_relaxed(priv->base + RNG_CR); int err; @@ -171,7 +172,7 @@ static int stm32_rng_conceal_seed_error(struct hwrng *rng) { struct stm32_rng_private *priv = container_of(rng, struct stm32_rng_private, rng); - dev_dbg((struct device *)priv->rng.priv, "Concealing seed error\n"); + dev_dbg(priv->dev, "Concealing seed error\n"); if (priv->data->has_cond_reset) return stm32_rng_conceal_seed_error_cond_reset(priv); @@ -187,7 +188,7 @@ static int stm32_rng_read(struct hwrng *rng, void *data, size_t max, bool wait) int retval = 0, err = 0; u32 sr; - retval = pm_runtime_resume_and_get((struct device *)priv->rng.priv); + retval = pm_runtime_resume_and_get(priv->dev); if (retval) return retval; @@ -206,8 +207,7 @@ static int stm32_rng_read(struct hwrng *rng, void *data, size_t max, bool wait) sr, sr, 10, 50000); if (err) { - dev_err((struct device *)priv->rng.priv, - "%s: timeout %x!\n", __func__, sr); + dev_err(priv->dev, "%s: timeout %x!\n", __func__, sr); break; } } else if (!sr) { @@ -220,8 +220,7 @@ static int stm32_rng_read(struct hwrng *rng, void *data, size_t max, bool wait) err = stm32_rng_conceal_seed_error(rng); i++; if (err && i > RNG_NB_RECOVER_TRIES) { - dev_err((struct device *)priv->rng.priv, - 
"Couldn't recover from seed error\n"); + dev_err(priv->dev, "Couldn't recover from seed error\n"); retval = -ENOTRECOVERABLE; goto exit_rpm; } @@ -239,8 +238,7 @@ static int stm32_rng_read(struct hwrng *rng, void *data, size_t max, bool wait) err = stm32_rng_conceal_seed_error(rng); i++; if (err && i > RNG_NB_RECOVER_TRIES) { - dev_err((struct device *)priv->rng.priv, - "Couldn't recover from seed error"); + dev_err(priv->dev, "Couldn't recover from seed error"); retval = -ENOTRECOVERABLE; goto exit_rpm; } @@ -255,8 +253,8 @@ static int stm32_rng_read(struct hwrng *rng, void *data, size_t max, bool wait) } exit_rpm: - pm_runtime_mark_last_busy((struct device *) priv->rng.priv); - pm_runtime_put_sync_autosuspend((struct device *) priv->rng.priv); + pm_runtime_mark_last_busy(priv->dev); + pm_runtime_put_sync_autosuspend(priv->dev); return retval || !wait ? retval : -EIO; } @@ -331,8 +329,7 @@ static int stm32_rng_init(struct hwrng *rng) 10, 50000); if (err) { clk_disable_unprepare(priv->clk); - dev_err((struct device *)priv->rng.priv, - "%s: timeout %x!\n", __func__, reg); + dev_err(priv->dev, "%s: timeout %x!\n", __func__, reg); return -EINVAL; } } else { @@ -360,8 +357,7 @@ static int stm32_rng_init(struct hwrng *rng) 10, 100000); if (err || (reg & ~RNG_SR_DRDY)) { clk_disable_unprepare(priv->clk); - dev_err((struct device *)priv->rng.priv, - "%s: timeout:%x SR: %x!\n", __func__, err, reg); + dev_err(priv->dev, "%s: timeout:%x SR: %x!\n", __func__, err, reg); return -EINVAL; } @@ -467,8 +463,7 @@ static int __maybe_unused stm32_rng_resume(struct device *dev) if (err) { clk_disable_unprepare(priv->clk); - dev_err((struct device *)priv->rng.priv, - "%s: timeout:%x CR: %x!\n", __func__, err, reg); + dev_err(priv->dev, "%s: timeout:%x CR: %x!\n", __func__, err, reg); return -EINVAL; } } else { @@ -543,6 +538,7 @@ static int stm32_rng_probe(struct platform_device *ofdev) priv->ced = of_property_read_bool(np, "clock-error-detect"); priv->lock_conf = of_property_read_bool(np, "st,rng-lock-conf"); + priv->dev = dev; priv->data = of_device_get_match_data(dev); if (!priv->data) @@ -553,7 +549,6 @@ static int stm32_rng_probe(struct platform_device *ofdev) priv->rng.name = dev_driver_string(dev); priv->rng.init = stm32_rng_init; priv->rng.read = stm32_rng_read; - priv->rng.priv = (unsigned long) dev; priv->rng.quality = 900; pm_runtime_set_autosuspend_delay(dev, 100); -- cgit From 4c6338f8664b5f32d6ef0756a5afef64e50608dc Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 31 May 2024 10:57:34 +0200 Subject: hwrng: stm32 - use sizeof(*priv) instead of sizeof(struct stm32_rng_private) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use sizeof(*priv) instead of sizeof(struct stm32_rng_private), the former makes renaming of struct stm32_rng_private easier if necessary, as it removes one site where such rename has to happen. No functional change. 
Signed-off-by: Marek Vasut Acked-by: Uwe Kleine-König Acked-by: Gatien Chevallier Signed-off-by: Herbert Xu --- drivers/char/hw_random/stm32-rng.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/hw_random/stm32-rng.c b/drivers/char/hw_random/stm32-rng.c index d08c870eb8d1..9d041a67c295 100644 --- a/drivers/char/hw_random/stm32-rng.c +++ b/drivers/char/hw_random/stm32-rng.c @@ -517,7 +517,7 @@ static int stm32_rng_probe(struct platform_device *ofdev) struct stm32_rng_private *priv; struct resource *res; - priv = devm_kzalloc(dev, sizeof(struct stm32_rng_private), GFP_KERNEL); + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; -- cgit From 46b3ff73afc815f1feb844e6b57e43cc13051544 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 31 May 2024 18:20:03 +0800 Subject: crypto: sm2 - Remove sm2 algorithm The SM2 algorithm has a single user in the kernel. However, it's never been integrated properly with that user: asymmetric_keys. The crux of the issue is that the way it computes its digest with sm3 does not fit into the architecture of asymmetric_keys. As no solution has been proposed, remove this algorithm. It can be resubmitted when it is integrated properly into the asymmetric_keys subsystem. Signed-off-by: Herbert Xu --- crypto/Kconfig | 18 -- crypto/Makefile | 8 - crypto/asymmetric_keys/pkcs7_parser.c | 4 - crypto/asymmetric_keys/public_key.c | 7 - crypto/asymmetric_keys/x509_cert_parser.c | 16 - crypto/asymmetric_keys/x509_public_key.c | 17 +- crypto/sm2.c | 498 ------------------------------ crypto/sm2signature.asn1 | 4 - crypto/testmgr.c | 6 - crypto/testmgr.h | 59 ---- include/crypto/sm2.h | 28 -- security/integrity/digsig_asymmetric.c | 3 +- 12 files changed, 3 insertions(+), 665 deletions(-) delete mode 100644 crypto/sm2.c delete mode 100644 crypto/sm2signature.asn1 delete mode 100644 include/crypto/sm2.h diff --git a/crypto/Kconfig b/crypto/Kconfig index 5688d42a59c2..72e2decb8c6a 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -313,24 +313,6 @@ config CRYPTO_ECRDSA One of the Russian cryptographic standard algorithms (called GOST algorithms). Only signature verification is implemented. -config CRYPTO_SM2 - tristate "SM2 (ShangMi 2)" - select CRYPTO_SM3 - select CRYPTO_AKCIPHER - select CRYPTO_MANAGER - select MPILIB - select ASN1 - help - SM2 (ShangMi 2) public key algorithm - - Published by State Encryption Management Bureau, China, - as specified by OSCCA GM/T 0003.1-2012 -- 0003.5-2012. 
- - References: - https://datatracker.ietf.org/doc/draft-shen-sm2-ecdsa/ - http://www.oscca.gov.cn/sca/xxgk/2010-12/17/content_1002386.shtml - http://www.gmbz.org.cn/main/bzlb.html - config CRYPTO_CURVE25519 tristate "Curve25519" select CRYPTO_KPP diff --git a/crypto/Makefile b/crypto/Makefile index edbbaa3ffef5..4c99e5d376f6 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -50,14 +50,6 @@ rsa_generic-y += rsa_helper.o rsa_generic-y += rsa-pkcs1pad.o obj-$(CONFIG_CRYPTO_RSA) += rsa_generic.o -$(obj)/sm2signature.asn1.o: $(obj)/sm2signature.asn1.c $(obj)/sm2signature.asn1.h -$(obj)/sm2.o: $(obj)/sm2signature.asn1.h - -sm2_generic-y += sm2signature.asn1.o -sm2_generic-y += sm2.o - -obj-$(CONFIG_CRYPTO_SM2) += sm2_generic.o - $(obj)/ecdsasignature.asn1.o: $(obj)/ecdsasignature.asn1.c $(obj)/ecdsasignature.asn1.h $(obj)/ecdsa.o: $(obj)/ecdsasignature.asn1.h ecdsa_generic-y += ecdsa.o diff --git a/crypto/asymmetric_keys/pkcs7_parser.c b/crypto/asymmetric_keys/pkcs7_parser.c index 231ad7b3789d..423d13c47545 100644 --- a/crypto/asymmetric_keys/pkcs7_parser.c +++ b/crypto/asymmetric_keys/pkcs7_parser.c @@ -292,10 +292,6 @@ int pkcs7_sig_note_pkey_algo(void *context, size_t hdrlen, ctx->sinfo->sig->pkey_algo = "ecdsa"; ctx->sinfo->sig->encoding = "x962"; break; - case OID_SM2_with_SM3: - ctx->sinfo->sig->pkey_algo = "sm2"; - ctx->sinfo->sig->encoding = "raw"; - break; case OID_gost2012PKey256: case OID_gost2012PKey512: ctx->sinfo->sig->pkey_algo = "ecrdsa"; diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c index 3474fb34ded9..422940a6706a 100644 --- a/crypto/asymmetric_keys/public_key.c +++ b/crypto/asymmetric_keys/public_key.c @@ -124,13 +124,6 @@ software_key_determine_akcipher(const struct public_key *pkey, strcmp(hash_algo, "sha3-384") != 0 && strcmp(hash_algo, "sha3-512") != 0) return -EINVAL; - } else if (strcmp(pkey->pkey_algo, "sm2") == 0) { - if (strcmp(encoding, "raw") != 0) - return -EINVAL; - if (!hash_algo) - return -EINVAL; - if (strcmp(hash_algo, "sm3") != 0) - return -EINVAL; } else if (strcmp(pkey->pkey_algo, "ecrdsa") == 0) { if (strcmp(encoding, "raw") != 0) return -EINVAL; diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c index 25cc4273472f..ee2fdab42334 100644 --- a/crypto/asymmetric_keys/x509_cert_parser.c +++ b/crypto/asymmetric_keys/x509_cert_parser.c @@ -257,10 +257,6 @@ int x509_note_sig_algo(void *context, size_t hdrlen, unsigned char tag, case OID_gost2012Signature512: ctx->cert->sig->hash_algo = "streebog512"; goto ecrdsa; - - case OID_SM2_with_SM3: - ctx->cert->sig->hash_algo = "sm3"; - goto sm2; } rsa_pkcs1: @@ -273,11 +269,6 @@ ecrdsa: ctx->cert->sig->encoding = "raw"; ctx->sig_algo = ctx->last_oid; return 0; -sm2: - ctx->cert->sig->pkey_algo = "sm2"; - ctx->cert->sig->encoding = "raw"; - ctx->sig_algo = ctx->last_oid; - return 0; ecdsa: ctx->cert->sig->pkey_algo = "ecdsa"; ctx->cert->sig->encoding = "x962"; @@ -309,7 +300,6 @@ int x509_note_signature(void *context, size_t hdrlen, if (strcmp(ctx->cert->sig->pkey_algo, "rsa") == 0 || strcmp(ctx->cert->sig->pkey_algo, "ecrdsa") == 0 || - strcmp(ctx->cert->sig->pkey_algo, "sm2") == 0 || strcmp(ctx->cert->sig->pkey_algo, "ecdsa") == 0) { /* Discard the BIT STRING metadata */ if (vlen < 1 || *(const u8 *)value != 0) @@ -514,17 +504,11 @@ int x509_extract_key_data(void *context, size_t hdrlen, case OID_gost2012PKey512: ctx->cert->pub->pkey_algo = "ecrdsa"; break; - case OID_sm2: - ctx->cert->pub->pkey_algo = "sm2"; - break; case 
OID_id_ecPublicKey: if (parse_OID(ctx->params, ctx->params_size, &oid) != 0) return -EBADMSG; switch (oid) { - case OID_sm2: - ctx->cert->pub->pkey_algo = "sm2"; - break; case OID_id_prime192v1: ctx->cert->pub->pkey_algo = "ecdsa-nist-p192"; break; diff --git a/crypto/asymmetric_keys/x509_public_key.c b/crypto/asymmetric_keys/x509_public_key.c index 00ac7159fba2..8409d7d36cb4 100644 --- a/crypto/asymmetric_keys/x509_public_key.c +++ b/crypto/asymmetric_keys/x509_public_key.c @@ -7,7 +7,6 @@ #define pr_fmt(fmt) "X.509: "fmt #include -#include #include #include #include @@ -64,20 +63,8 @@ int x509_get_sig_params(struct x509_certificate *cert) desc->tfm = tfm; - if (strcmp(cert->pub->pkey_algo, "sm2") == 0) { - ret = strcmp(sig->hash_algo, "sm3") != 0 ? -EINVAL : - crypto_shash_init(desc) ?: - sm2_compute_z_digest(desc, cert->pub->key, - cert->pub->keylen, sig->digest) ?: - crypto_shash_init(desc) ?: - crypto_shash_update(desc, sig->digest, - sig->digest_size) ?: - crypto_shash_finup(desc, cert->tbs, cert->tbs_size, - sig->digest); - } else { - ret = crypto_shash_digest(desc, cert->tbs, cert->tbs_size, - sig->digest); - } + ret = crypto_shash_digest(desc, cert->tbs, cert->tbs_size, + sig->digest); if (ret < 0) goto error_2; diff --git a/crypto/sm2.c b/crypto/sm2.c deleted file mode 100644 index 5ab120d74c59..000000000000 --- a/crypto/sm2.c +++ /dev/null @@ -1,498 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SM2 asymmetric public-key algorithm - * as specified by OSCCA GM/T 0003.1-2012 -- 0003.5-2012 SM2 and - * described at https://tools.ietf.org/html/draft-shen-sm2-ecdsa-02 - * - * Copyright (c) 2020, Alibaba Group. - * Authors: Tianjia Zhang - */ - -#include -#include -#include -#include -#include -#include -#include -#include "sm2signature.asn1.h" - -/* The default user id as specified in GM/T 0009-2012 */ -#define SM2_DEFAULT_USERID "1234567812345678" -#define SM2_DEFAULT_USERID_LEN 16 - -#define MPI_NBYTES(m) ((mpi_get_nbits(m) + 7) / 8) - -struct ecc_domain_parms { - const char *desc; /* Description of the curve. */ - unsigned int nbits; /* Number of bits. */ - unsigned int fips:1; /* True if this is a FIPS140-2 approved curve */ - - /* The model describing this curve. This is mainly used to select - * the group equation. - */ - enum gcry_mpi_ec_models model; - - /* The actual ECC dialect used. This is used for curve specific - * optimizations and to select encodings etc. - */ - enum ecc_dialects dialect; - - const char *p; /* The prime defining the field. */ - const char *a, *b; /* The coefficients. For Twisted Edwards - * Curves b is used for d. For Montgomery - * Curves (a,b) has ((A-2)/4,B^-1). - */ - const char *n; /* The order of the base point. */ - const char *g_x, *g_y; /* Base point. */ - unsigned int h; /* Cofactor. 
*/ -}; - -static const struct ecc_domain_parms sm2_ecp = { - .desc = "sm2p256v1", - .nbits = 256, - .fips = 0, - .model = MPI_EC_WEIERSTRASS, - .dialect = ECC_DIALECT_STANDARD, - .p = "0xfffffffeffffffffffffffffffffffffffffffff00000000ffffffffffffffff", - .a = "0xfffffffeffffffffffffffffffffffffffffffff00000000fffffffffffffffc", - .b = "0x28e9fa9e9d9f5e344d5a9e4bcf6509a7f39789f515ab8f92ddbcbd414d940e93", - .n = "0xfffffffeffffffffffffffffffffffff7203df6b21c6052b53bbf40939d54123", - .g_x = "0x32c4ae2c1f1981195f9904466a39c9948fe30bbff2660be1715a4589334c74c7", - .g_y = "0xbc3736a2f4f6779c59bdcee36b692153d0a9877cc62a474002df32e52139f0a0", - .h = 1 -}; - -static int __sm2_set_pub_key(struct mpi_ec_ctx *ec, - const void *key, unsigned int keylen); - -static int sm2_ec_ctx_init(struct mpi_ec_ctx *ec) -{ - const struct ecc_domain_parms *ecp = &sm2_ecp; - MPI p, a, b; - MPI x, y; - int rc = -EINVAL; - - p = mpi_scanval(ecp->p); - a = mpi_scanval(ecp->a); - b = mpi_scanval(ecp->b); - if (!p || !a || !b) - goto free_p; - - x = mpi_scanval(ecp->g_x); - y = mpi_scanval(ecp->g_y); - if (!x || !y) - goto free; - - rc = -ENOMEM; - - ec->Q = mpi_point_new(0); - if (!ec->Q) - goto free; - - /* mpi_ec_setup_elliptic_curve */ - ec->G = mpi_point_new(0); - if (!ec->G) { - mpi_point_release(ec->Q); - goto free; - } - - mpi_set(ec->G->x, x); - mpi_set(ec->G->y, y); - mpi_set_ui(ec->G->z, 1); - - rc = -EINVAL; - ec->n = mpi_scanval(ecp->n); - if (!ec->n) { - mpi_point_release(ec->Q); - mpi_point_release(ec->G); - goto free; - } - - ec->h = ecp->h; - ec->name = ecp->desc; - mpi_ec_init(ec, ecp->model, ecp->dialect, 0, p, a, b); - - rc = 0; - -free: - mpi_free(x); - mpi_free(y); -free_p: - mpi_free(p); - mpi_free(a); - mpi_free(b); - - return rc; -} - -static void sm2_ec_ctx_deinit(struct mpi_ec_ctx *ec) -{ - mpi_ec_deinit(ec); - - memset(ec, 0, sizeof(*ec)); -} - -/* RESULT must have been initialized and is set on success to the - * point given by VALUE. 
- */ -static int sm2_ecc_os2ec(MPI_POINT result, MPI value) -{ - int rc; - size_t n; - unsigned char *buf; - MPI x, y; - - n = MPI_NBYTES(value); - buf = kmalloc(n, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - rc = mpi_print(GCRYMPI_FMT_USG, buf, n, &n, value); - if (rc) - goto err_freebuf; - - rc = -EINVAL; - if (n < 1 || ((n - 1) % 2)) - goto err_freebuf; - /* No support for point compression */ - if (*buf != 0x4) - goto err_freebuf; - - rc = -ENOMEM; - n = (n - 1) / 2; - x = mpi_read_raw_data(buf + 1, n); - if (!x) - goto err_freebuf; - y = mpi_read_raw_data(buf + 1 + n, n); - if (!y) - goto err_freex; - - mpi_normalize(x); - mpi_normalize(y); - mpi_set(result->x, x); - mpi_set(result->y, y); - mpi_set_ui(result->z, 1); - - rc = 0; - - mpi_free(y); -err_freex: - mpi_free(x); -err_freebuf: - kfree(buf); - return rc; -} - -struct sm2_signature_ctx { - MPI sig_r; - MPI sig_s; -}; - -int sm2_get_signature_r(void *context, size_t hdrlen, unsigned char tag, - const void *value, size_t vlen) -{ - struct sm2_signature_ctx *sig = context; - - if (!value || !vlen) - return -EINVAL; - - sig->sig_r = mpi_read_raw_data(value, vlen); - if (!sig->sig_r) - return -ENOMEM; - - return 0; -} - -int sm2_get_signature_s(void *context, size_t hdrlen, unsigned char tag, - const void *value, size_t vlen) -{ - struct sm2_signature_ctx *sig = context; - - if (!value || !vlen) - return -EINVAL; - - sig->sig_s = mpi_read_raw_data(value, vlen); - if (!sig->sig_s) - return -ENOMEM; - - return 0; -} - -static int sm2_z_digest_update(struct shash_desc *desc, - MPI m, unsigned int pbytes) -{ - static const unsigned char zero[32]; - unsigned char *in; - unsigned int inlen; - int err; - - in = mpi_get_buffer(m, &inlen, NULL); - if (!in) - return -EINVAL; - - if (inlen < pbytes) { - /* padding with zero */ - err = crypto_shash_update(desc, zero, pbytes - inlen) ?: - crypto_shash_update(desc, in, inlen); - } else if (inlen > pbytes) { - /* skip the starting zero */ - err = crypto_shash_update(desc, in + inlen - pbytes, pbytes); - } else { - err = crypto_shash_update(desc, in, inlen); - } - - kfree(in); - return err; -} - -static int sm2_z_digest_update_point(struct shash_desc *desc, - MPI_POINT point, struct mpi_ec_ctx *ec, - unsigned int pbytes) -{ - MPI x, y; - int ret = -EINVAL; - - x = mpi_new(0); - y = mpi_new(0); - - ret = mpi_ec_get_affine(x, y, point, ec) ? 
-EINVAL : - sm2_z_digest_update(desc, x, pbytes) ?: - sm2_z_digest_update(desc, y, pbytes); - - mpi_free(x); - mpi_free(y); - return ret; -} - -int sm2_compute_z_digest(struct shash_desc *desc, - const void *key, unsigned int keylen, void *dgst) -{ - struct mpi_ec_ctx *ec; - unsigned int bits_len; - unsigned int pbytes; - u8 entl[2]; - int err; - - ec = kmalloc(sizeof(*ec), GFP_KERNEL); - if (!ec) - return -ENOMEM; - - err = sm2_ec_ctx_init(ec); - if (err) - goto out_free_ec; - - err = __sm2_set_pub_key(ec, key, keylen); - if (err) - goto out_deinit_ec; - - bits_len = SM2_DEFAULT_USERID_LEN * 8; - entl[0] = bits_len >> 8; - entl[1] = bits_len & 0xff; - - pbytes = MPI_NBYTES(ec->p); - - /* ZA = H256(ENTLA | IDA | a | b | xG | yG | xA | yA) */ - err = crypto_shash_init(desc); - if (err) - goto out_deinit_ec; - - err = crypto_shash_update(desc, entl, 2); - if (err) - goto out_deinit_ec; - - err = crypto_shash_update(desc, SM2_DEFAULT_USERID, - SM2_DEFAULT_USERID_LEN); - if (err) - goto out_deinit_ec; - - err = sm2_z_digest_update(desc, ec->a, pbytes) ?: - sm2_z_digest_update(desc, ec->b, pbytes) ?: - sm2_z_digest_update_point(desc, ec->G, ec, pbytes) ?: - sm2_z_digest_update_point(desc, ec->Q, ec, pbytes); - if (err) - goto out_deinit_ec; - - err = crypto_shash_final(desc, dgst); - -out_deinit_ec: - sm2_ec_ctx_deinit(ec); -out_free_ec: - kfree(ec); - return err; -} -EXPORT_SYMBOL_GPL(sm2_compute_z_digest); - -static int _sm2_verify(struct mpi_ec_ctx *ec, MPI hash, MPI sig_r, MPI sig_s) -{ - int rc = -EINVAL; - struct gcry_mpi_point sG, tP; - MPI t = NULL; - MPI x1 = NULL, y1 = NULL; - - mpi_point_init(&sG); - mpi_point_init(&tP); - x1 = mpi_new(0); - y1 = mpi_new(0); - t = mpi_new(0); - - /* r, s in [1, n-1] */ - if (mpi_cmp_ui(sig_r, 1) < 0 || mpi_cmp(sig_r, ec->n) > 0 || - mpi_cmp_ui(sig_s, 1) < 0 || mpi_cmp(sig_s, ec->n) > 0) { - goto leave; - } - - /* t = (r + s) % n, t == 0 */ - mpi_addm(t, sig_r, sig_s, ec->n); - if (mpi_cmp_ui(t, 0) == 0) - goto leave; - - /* sG + tP = (x1, y1) */ - rc = -EBADMSG; - mpi_ec_mul_point(&sG, sig_s, ec->G, ec); - mpi_ec_mul_point(&tP, t, ec->Q, ec); - mpi_ec_add_points(&sG, &sG, &tP, ec); - if (mpi_ec_get_affine(x1, y1, &sG, ec)) - goto leave; - - /* R = (e + x1) % n */ - mpi_addm(t, hash, x1, ec->n); - - /* check R == r */ - rc = -EKEYREJECTED; - if (mpi_cmp(t, sig_r)) - goto leave; - - rc = 0; - -leave: - mpi_point_free_parts(&sG); - mpi_point_free_parts(&tP); - mpi_free(x1); - mpi_free(y1); - mpi_free(t); - - return rc; -} - -static int sm2_verify(struct akcipher_request *req) -{ - struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - struct mpi_ec_ctx *ec = akcipher_tfm_ctx(tfm); - unsigned char *buffer; - struct sm2_signature_ctx sig; - MPI hash; - int ret; - - if (unlikely(!ec->Q)) - return -EINVAL; - - buffer = kmalloc(req->src_len + req->dst_len, GFP_KERNEL); - if (!buffer) - return -ENOMEM; - - sg_pcopy_to_buffer(req->src, - sg_nents_for_len(req->src, req->src_len + req->dst_len), - buffer, req->src_len + req->dst_len, 0); - - sig.sig_r = NULL; - sig.sig_s = NULL; - ret = asn1_ber_decoder(&sm2signature_decoder, &sig, - buffer, req->src_len); - if (ret) - goto error; - - ret = -ENOMEM; - hash = mpi_read_raw_data(buffer + req->src_len, req->dst_len); - if (!hash) - goto error; - - ret = _sm2_verify(ec, hash, sig.sig_r, sig.sig_s); - - mpi_free(hash); -error: - mpi_free(sig.sig_r); - mpi_free(sig.sig_s); - kfree(buffer); - return ret; -} - -static int sm2_set_pub_key(struct crypto_akcipher *tfm, - const void *key, unsigned int keylen) -{ - 
struct mpi_ec_ctx *ec = akcipher_tfm_ctx(tfm); - - return __sm2_set_pub_key(ec, key, keylen); - -} - -static int __sm2_set_pub_key(struct mpi_ec_ctx *ec, - const void *key, unsigned int keylen) -{ - MPI a; - int rc; - - /* include the uncompressed flag '0x04' */ - a = mpi_read_raw_data(key, keylen); - if (!a) - return -ENOMEM; - - mpi_normalize(a); - rc = sm2_ecc_os2ec(ec->Q, a); - mpi_free(a); - - return rc; -} - -static unsigned int sm2_max_size(struct crypto_akcipher *tfm) -{ - /* Unlimited max size */ - return PAGE_SIZE; -} - -static int sm2_init_tfm(struct crypto_akcipher *tfm) -{ - struct mpi_ec_ctx *ec = akcipher_tfm_ctx(tfm); - - return sm2_ec_ctx_init(ec); -} - -static void sm2_exit_tfm(struct crypto_akcipher *tfm) -{ - struct mpi_ec_ctx *ec = akcipher_tfm_ctx(tfm); - - sm2_ec_ctx_deinit(ec); -} - -static struct akcipher_alg sm2 = { - .verify = sm2_verify, - .set_pub_key = sm2_set_pub_key, - .max_size = sm2_max_size, - .init = sm2_init_tfm, - .exit = sm2_exit_tfm, - .base = { - .cra_name = "sm2", - .cra_driver_name = "sm2-generic", - .cra_priority = 100, - .cra_module = THIS_MODULE, - .cra_ctxsize = sizeof(struct mpi_ec_ctx), - }, -}; - -static int __init sm2_init(void) -{ - return crypto_register_akcipher(&sm2); -} - -static void __exit sm2_exit(void) -{ - crypto_unregister_akcipher(&sm2); -} - -subsys_initcall(sm2_init); -module_exit(sm2_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Tianjia Zhang "); -MODULE_DESCRIPTION("SM2 generic algorithm"); -MODULE_ALIAS_CRYPTO("sm2-generic"); diff --git a/crypto/sm2signature.asn1 b/crypto/sm2signature.asn1 deleted file mode 100644 index ab8c0b754d21..000000000000 --- a/crypto/sm2signature.asn1 +++ /dev/null @@ -1,4 +0,0 @@ -Sm2Signature ::= SEQUENCE { - sig_r INTEGER ({ sm2_get_signature_r }), - sig_s INTEGER ({ sm2_get_signature_s }) -} diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 1b30e3565e80..a780b615f8c6 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -5608,12 +5608,6 @@ static const struct alg_test_desc alg_test_descs[] = { .suite = { .hash = __VECS(sha512_tv_template) } - }, { - .alg = "sm2", - .test = alg_test_akcipher, - .suite = { - .akcipher = __VECS(sm2_tv_template) - } }, { .alg = "sm3", .test = alg_test_hash, diff --git a/crypto/testmgr.h b/crypto/testmgr.h index 5350cfd9d325..9b38501a17b2 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -5774,65 +5774,6 @@ static const struct hash_testvec hmac_streebog512_tv_template[] = { }, }; -/* - * SM2 test vectors. 
- */ -static const struct akcipher_testvec sm2_tv_template[] = { - { /* Generated from openssl */ - .key = - "\x04" - "\x8e\xa0\x33\x69\x91\x7e\x3d\xec\xad\x8e\xf0\x45\x5e\x13\x3e\x68" - "\x5b\x8c\xab\x5c\xc6\xc8\x50\xdf\x91\x00\xe0\x24\x73\x4d\x31\xf2" - "\x2e\xc0\xd5\x6b\xee\xda\x98\x93\xec\xd8\x36\xaa\xb9\xcf\x63\x82" - "\xef\xa7\x1a\x03\xed\x16\xba\x74\xb8\x8b\xf9\xe5\x70\x39\xa4\x70", - .key_len = 65, - .param_len = 0, - .c = - "\x30\x45" - "\x02\x20" - "\x70\xab\xb6\x7d\xd6\x54\x80\x64\x42\x7e\x2d\x05\x08\x36\xc9\x96" - "\x25\xc2\xbb\xff\x08\xe5\x43\x15\x5e\xf3\x06\xd9\x2b\x2f\x0a\x9f" - "\x02\x21" - "\x00" - "\xbf\x21\x5f\x7e\x5d\x3f\x1a\x4d\x8f\x84\xc2\xe9\xa6\x4c\xa4\x18" - "\xb2\xb8\x46\xf4\x32\x96\xfa\x57\xc6\x29\xd4\x89\xae\xcc\xda\xdb", - .c_size = 71, - .algo = OID_SM2_with_SM3, - .m = - "\x47\xa7\xbf\xd3\xda\xc4\x79\xee\xda\x8b\x4f\xe8\x40\x94\xd4\x32" - "\x8f\xf1\xcd\x68\x4d\xbd\x9b\x1d\xe0\xd8\x9a\x5d\xad\x85\x47\x5c", - .m_size = 32, - .public_key_vec = true, - .siggen_sigver_test = true, - }, - { /* From libgcrypt */ - .key = - "\x04" - "\x87\x59\x38\x9a\x34\xaa\xad\x07\xec\xf4\xe0\xc8\xc2\x65\x0a\x44" - "\x59\xc8\xd9\x26\xee\x23\x78\x32\x4e\x02\x61\xc5\x25\x38\xcb\x47" - "\x75\x28\x10\x6b\x1e\x0b\x7c\x8d\xd5\xff\x29\xa9\xc8\x6a\x89\x06" - "\x56\x56\xeb\x33\x15\x4b\xc0\x55\x60\x91\xef\x8a\xc9\xd1\x7d\x78", - .key_len = 65, - .param_len = 0, - .c = - "\x30\x44" - "\x02\x20" - "\xd9\xec\xef\xe8\x5f\xee\x3c\x59\x57\x8e\x5b\xab\xb3\x02\xe1\x42" - "\x4b\x67\x2c\x0b\x26\xb6\x51\x2c\x3e\xfc\xc6\x49\xec\xfe\x89\xe5" - "\x02\x20" - "\x43\x45\xd0\xa5\xff\xe5\x13\x27\x26\xd0\xec\x37\xad\x24\x1e\x9a" - "\x71\x9a\xa4\x89\xb0\x7e\x0f\xc4\xbb\x2d\x50\xd0\xe5\x7f\x7a\x68", - .c_size = 70, - .algo = OID_SM2_with_SM3, - .m = - "\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff\x00" - "\x12\x34\x56\x78\x9a\xbc\xde\xf0\x12\x34\x56\x78\x9a\xbc\xde\xf0", - .m_size = 32, - .public_key_vec = true, - .siggen_sigver_test = true, - }, -}; - /* Example vectors below taken from * http://www.oscca.gov.cn/UpFile/20101222141857786.pdf * diff --git a/include/crypto/sm2.h b/include/crypto/sm2.h deleted file mode 100644 index 04a92c1013c8..000000000000 --- a/include/crypto/sm2.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * sm2.h - SM2 asymmetric public-key algorithm - * as specified by OSCCA GM/T 0003.1-2012 -- 0003.5-2012 SM2 and - * described at https://tools.ietf.org/html/draft-shen-sm2-ecdsa-02 - * - * Copyright (c) 2020, Alibaba Group. - * Written by Tianjia Zhang - */ - -#ifndef _CRYPTO_SM2_H -#define _CRYPTO_SM2_H - -struct shash_desc; - -#if IS_REACHABLE(CONFIG_CRYPTO_SM2) -int sm2_compute_z_digest(struct shash_desc *desc, - const void *key, unsigned int keylen, void *dgst); -#else -static inline int sm2_compute_z_digest(struct shash_desc *desc, - const void *key, unsigned int keylen, - void *dgst) -{ - return -ENOTSUPP; -} -#endif - -#endif /* _CRYPTO_SM2_H */ diff --git a/security/integrity/digsig_asymmetric.c b/security/integrity/digsig_asymmetric.c index de603cf42ac7..457c0a396caf 100644 --- a/security/integrity/digsig_asymmetric.c +++ b/security/integrity/digsig_asymmetric.c @@ -114,8 +114,7 @@ int asymmetric_verify(struct key *keyring, const char *sig, } else if (!strncmp(pk->pkey_algo, "ecdsa-", 6)) { /* edcsa-nist-p192 etc. 
*/ pks.encoding = "x962"; - } else if (!strcmp(pk->pkey_algo, "ecrdsa") || - !strcmp(pk->pkey_algo, "sm2")) { + } else if (!strcmp(pk->pkey_algo, "ecrdsa")) { pks.encoding = "raw"; } else { ret = -ENOPKG; -- cgit From 13e21e0ba44f5fad02a3b7b34987ff3845718198 Mon Sep 17 00:00:00 2001 From: Chenghai Huang Date: Sat, 1 Jun 2024 10:51:49 +0800 Subject: crypto: hisilicon/qm - adjust the internal processing sequence of the vf enable and disable When the vf is enabled, the value of vfs_num must be assigned after the VF configuration is complete. Otherwise, the device may be accessed before the virtual configuration is complete, causing an error. When the vf is disabled, clear vfs_num and execute qm_pm_put_sync before hisi_qm_sriov_disable is return. Otherwise, if qm_clear_vft_config fails, users may access the device when the PCI virtualization is disabled, resulting in an error. Signed-off-by: Chenghai Huang Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/qm.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 3dac8d8e8568..f614fd228b56 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3793,14 +3793,13 @@ int hisi_qm_sriov_enable(struct pci_dev *pdev, int max_vfs) goto err_put_sync; } - qm->vfs_num = num_vfs; - ret = pci_enable_sriov(pdev, num_vfs); if (ret) { pci_err(pdev, "Can't enable VF!\n"); qm_clear_vft_config(qm); goto err_put_sync; } + qm->vfs_num = num_vfs; pci_info(pdev, "VF enabled, vfs_num(=%d)!\n", num_vfs); @@ -3822,7 +3821,6 @@ EXPORT_SYMBOL_GPL(hisi_qm_sriov_enable); int hisi_qm_sriov_disable(struct pci_dev *pdev, bool is_frozen) { struct hisi_qm *qm = pci_get_drvdata(pdev); - int ret; if (pci_vfs_assigned(pdev)) { pci_err(pdev, "Failed to disable VFs as VFs are assigned!\n"); @@ -3837,13 +3835,10 @@ int hisi_qm_sriov_disable(struct pci_dev *pdev, bool is_frozen) pci_disable_sriov(pdev); - ret = qm_clear_vft_config(qm); - if (ret) - return ret; - + qm->vfs_num = 0; qm_pm_put_sync(qm); - return 0; + return qm_clear_vft_config(qm); } EXPORT_SYMBOL_GPL(hisi_qm_sriov_disable); -- cgit From c17b56d96ce614cde4aa61e8fc800f04182eec25 Mon Sep 17 00:00:00 2001 From: Chenghai Huang Date: Sat, 1 Jun 2024 10:51:50 +0800 Subject: crypto: hisilicon/zip - optimize the address offset of the reg query function Currently, the reg is queried based on the fixed address offset array. When the number of accelerator cores changes, the system can not flexibly respond to the change. Therefore, the reg to be queried is calculated based on the comp or decomp core base address. 
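The calculation described here appears in the diff below as get_zip_core_addr(); a freestanding sketch of the same idea follows. The constants mirror the HZIP_* defines from the patch (0x302000 is HZIP_CORE_DFX_BASE plus one HZIP_CORE_ADDR_INTRVL, i.e. the first compress core's block), but the helper name and the simplified signature that passes the compress-core count directly are illustrative only.

#include <linux/io.h>
#include <linux/types.h>

#define CORE_DFX_COMP_BASE	0x302000	/* first compress core's DFX block */
#define CORE_DFX_DECOMP_BASE	0x304000	/* first decompress core's DFX block */
#define CORE_ADDR_INTERVAL	0x1000		/* stride between per-core blocks */

/*
 * Map a flat core index (compress cores first, then decompress cores) to its
 * DFX register block. Deriving the address from a base and a stride means a
 * change in core counts needs no update to a hard-coded offset table.
 */
static void __iomem *zip_core_dfx_base_sketch(void __iomem *io_base,
					      u32 comp_core_num, u32 core)
{
	if (core < comp_core_num)
		return io_base + CORE_DFX_COMP_BASE +
		       core * CORE_ADDR_INTERVAL;

	return io_base + CORE_DFX_DECOMP_BASE +
	       (core - comp_core_num) * CORE_ADDR_INTERVAL;
}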
Signed-off-by: Chenghai Huang Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/zip/zip_main.c | 48 ++++++++++++++------------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c index c94a7b20d07e..7c2d803886fd 100644 --- a/drivers/crypto/hisilicon/zip/zip_main.c +++ b/drivers/crypto/hisilicon/zip/zip_main.c @@ -37,7 +37,7 @@ #define HZIP_QM_IDEL_STATUS 0x3040e4 #define HZIP_CORE_DFX_BASE 0x301000 -#define HZIP_CLOCK_GATED_CONTL 0X301004 +#define HZIP_CORE_DFX_DECOMP_BASE 0x304000 #define HZIP_CORE_DFX_COMP_0 0x302000 #define HZIP_CORE_DFX_COMP_1 0x303000 #define HZIP_CORE_DFX_DECOMP_0 0x304000 @@ -48,6 +48,7 @@ #define HZIP_CORE_DFX_DECOMP_5 0x309000 #define HZIP_CORE_REGS_BASE_LEN 0xB0 #define HZIP_CORE_REGS_DFX_LEN 0x28 +#define HZIP_CORE_ADDR_INTRVL 0x1000 #define HZIP_CORE_INT_SOURCE 0x3010A0 #define HZIP_CORE_INT_MASK_REG 0x3010A4 @@ -269,28 +270,6 @@ static const u32 zip_pre_store_caps[] = { ZIP_DEV_ALG_BITMAP, }; -enum { - HZIP_COMP_CORE0, - HZIP_COMP_CORE1, - HZIP_DECOMP_CORE0, - HZIP_DECOMP_CORE1, - HZIP_DECOMP_CORE2, - HZIP_DECOMP_CORE3, - HZIP_DECOMP_CORE4, - HZIP_DECOMP_CORE5, -}; - -static const u64 core_offsets[] = { - [HZIP_COMP_CORE0] = 0x302000, - [HZIP_COMP_CORE1] = 0x303000, - [HZIP_DECOMP_CORE0] = 0x304000, - [HZIP_DECOMP_CORE1] = 0x305000, - [HZIP_DECOMP_CORE2] = 0x306000, - [HZIP_DECOMP_CORE3] = 0x307000, - [HZIP_DECOMP_CORE4] = 0x308000, - [HZIP_DECOMP_CORE5] = 0x309000, -}; - static const struct debugfs_reg32 hzip_dfx_regs[] = { {"HZIP_GET_BD_NUM ", 0x00}, {"HZIP_GET_RIGHT_BD ", 0x04}, @@ -807,6 +786,18 @@ static int hisi_zip_regs_show(struct seq_file *s, void *unused) DEFINE_SHOW_ATTRIBUTE(hisi_zip_regs); +static void __iomem *get_zip_core_addr(struct hisi_qm *qm, int core_num) +{ + u32 zip_comp_core_num = qm->cap_tables.dev_cap_table[ZIP_CLUSTER_COMP_NUM_CAP_IDX].cap_val; + + if (core_num < zip_comp_core_num) + return qm->io_base + HZIP_CORE_DFX_BASE + + (core_num + 1) * HZIP_CORE_ADDR_INTRVL; + + return qm->io_base + HZIP_CORE_DFX_DECOMP_BASE + + (core_num - zip_comp_core_num) * HZIP_CORE_ADDR_INTRVL; +} + static int hisi_zip_core_debug_init(struct hisi_qm *qm) { u32 zip_core_num, zip_comp_core_num; @@ -832,7 +823,7 @@ static int hisi_zip_core_debug_init(struct hisi_qm *qm) regset->regs = hzip_dfx_regs; regset->nregs = ARRAY_SIZE(hzip_dfx_regs); - regset->base = qm->io_base + core_offsets[i]; + regset->base = get_zip_core_addr(qm, i); regset->dev = dev; tmp_d = debugfs_create_dir(buf, qm->debug.debug_root); @@ -921,13 +912,14 @@ debugfs_remove: /* hisi_zip_debug_regs_clear() - clear the zip debug regs */ static void hisi_zip_debug_regs_clear(struct hisi_qm *qm) { + u32 zip_core_num = qm->cap_tables.dev_cap_table[ZIP_CORE_NUM_CAP_IDX].cap_val; int i, j; /* enable register read_clear bit */ writel(HZIP_RD_CNT_CLR_CE_EN, qm->io_base + HZIP_SOFT_CTRL_CNT_CLR_CE); - for (i = 0; i < ARRAY_SIZE(core_offsets); i++) + for (i = 0; i < zip_core_num; i++) for (j = 0; j < ARRAY_SIZE(hzip_dfx_regs); j++) - readl(qm->io_base + core_offsets[i] + + readl(get_zip_core_addr(qm, i) + hzip_dfx_regs[j].offset); /* disable register read_clear bit */ @@ -970,7 +962,7 @@ static int hisi_zip_show_last_regs_init(struct hisi_qm *qm) } for (i = 0; i < zip_core_num; i++) { - io_base = qm->io_base + core_offsets[i]; + io_base = get_zip_core_addr(qm, i); for (j = 0; j < core_dfx_regs_num; j++) { idx = com_dfx_regs_num + i * core_dfx_regs_num + j; debug->last_words[idx] = readl_relaxed( @@ 
-1022,7 +1014,7 @@ static void hisi_zip_show_last_dfx_regs(struct hisi_qm *qm) else scnprintf(buf, sizeof(buf), "Decomp_core-%d", i - zip_comp_core_num); - base = qm->io_base + core_offsets[i]; + base = get_zip_core_addr(qm, i); pci_info(qm->pdev, "==>%s:\n", buf); /* dump last word for dfx regs during control resetting */ -- cgit From b06affb1cb580e135d1b454d5318fdbe6e24828a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 2 Jun 2024 15:22:19 -0700 Subject: crypto: x86/aes-gcm - add VAES and AVX512 / AVX10 optimized AES-GCM Add implementations of AES-GCM for x86_64 CPUs that support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and either AVX512 or AVX10. There are two implementations, sharing most source code: one using 256-bit vectors and one using 512-bit vectors. This patch improves AES-GCM performance by up to 162%; see Tables 1 and 2 below. I wrote the new AES-GCM assembly code from scratch, focusing on correctness, performance, code size (both source and binary), and documenting the source. The new assembly file aes-gcm-avx10-x86_64.S is about 1200 lines including extensive comments, and it generates less than 8 KB of binary code. The main loop does 4 vectors at a time, with the AES and GHASH instructions interleaved. Any remainder is handled using a simple 1 vector at a time loop, with masking. Several VAES + AVX512 implementations of AES-GCM exist from Intel, including one in OpenSSL and one proposed for inclusion in Linux in 2021 (https://lore.kernel.org/linux-crypto/1611386920-28579-6-git-send-email-megha.dey@intel.com/). These aren't really suitable to be used, though, due to the massive amount of binary code generated (696 KB for OpenSSL, 200 KB for Linux) and well as the significantly larger amount of assembly source (4978 lines for OpenSSL, 1788 lines for Linux). Also, Intel's code does not support 256-bit vectors, which makes it not usable on future AVX10/256-only CPUs, and also not ideal for certain Intel CPUs that have downclocking issues. So I ended up starting from scratch. Usually my much shorter code is actually slightly faster than Intel's AVX512 code, though it depends on message length and on which of Intel's implementations is used; for details, see Tables 3 and 4 below. To facilitate potential integration into other projects, I've dual-licensed aes-gcm-avx10-x86_64.S under Apache-2.0 OR BSD-2-Clause, the same as the recently added RISC-V crypto code. The following two tables summarize the performance improvement over the existing AES-GCM code in Linux that uses AES-NI and AVX2: Table 1: AES-256-GCM encryption throughput improvement, CPU microarchitecture vs. message length in bytes: | 16384 | 4096 | 4095 | 1420 | 512 | 500 | ----------------------+-------+-------+-------+-------+-------+-------+ Intel Ice Lake | 42% | 48% | 60% | 62% | 70% | 69% | Intel Sapphire Rapids | 157% | 145% | 162% | 119% | 96% | 96% | Intel Emerald Rapids | 156% | 144% | 161% | 115% | 95% | 100% | AMD Zen 4 | 103% | 89% | 78% | 56% | 54% | 54% | | 300 | 200 | 64 | 63 | 16 | ----------------------+-------+-------+-------+-------+-------+ Intel Ice Lake | 66% | 48% | 49% | 70% | 53% | Intel Sapphire Rapids | 80% | 60% | 41% | 62% | 38% | Intel Emerald Rapids | 79% | 60% | 41% | 62% | 38% | AMD Zen 4 | 51% | 35% | 27% | 32% | 25% | Table 2: AES-256-GCM decryption throughput improvement, CPU microarchitecture vs. 
message length in bytes: | 16384 | 4096 | 4095 | 1420 | 512 | 500 | ----------------------+-------+-------+-------+-------+-------+-------+ Intel Ice Lake | 42% | 48% | 59% | 63% | 67% | 71% | Intel Sapphire Rapids | 159% | 145% | 161% | 125% | 102% | 100% | Intel Emerald Rapids | 158% | 144% | 161% | 124% | 100% | 103% | AMD Zen 4 | 110% | 95% | 80% | 59% | 56% | 54% | | 300 | 200 | 64 | 63 | 16 | ----------------------+-------+-------+-------+-------+-------+ Intel Ice Lake | 67% | 56% | 46% | 70% | 56% | Intel Sapphire Rapids | 79% | 62% | 39% | 61% | 39% | Intel Emerald Rapids | 80% | 62% | 40% | 58% | 40% | AMD Zen 4 | 49% | 36% | 30% | 35% | 28% | The above numbers are percentage improvements in single-thread throughput, so e.g. an increase from 4000 MB/s to 6000 MB/s would be listed as 50%. They were collected by directly measuring the Linux crypto API performance using a custom kernel module. Note that indirect benchmarks (e.g. 'cryptsetup benchmark' or benchmarking dm-crypt I/O) include more overhead and won't see quite as much of a difference. All these benchmarks used an associated data length of 16 bytes. Note that AES-GCM is almost always used with short associated data lengths. The following two tables summarize how the performance of my code compares with Intel's AVX512 AES-GCM code, both the version that is in OpenSSL and the version that was proposed for inclusion in Linux. Neither version exists in Linux currently, but these are alternative AES-GCM implementations that could be chosen instead of mine. I collected the following numbers on Emerald Rapids using a userspace benchmark program that calls the assembly functions directly. I've also included a comparison with Cloudflare's AES-GCM implementation from https://boringssl-review.googlesource.com/c/boringssl/+/65987/3. Table 3: VAES-based AES-256-GCM encryption throughput in MB/s, implementation name vs. message length in bytes: | 16384 | 4096 | 4095 | 1420 | 512 | 500 | ---------------------+-------+-------+-------+-------+-------+-------+ This implementation | 14171 | 12956 | 12318 | 9588 | 7293 | 6449 | AVX512_Intel_OpenSSL | 14022 | 12467 | 11863 | 9107 | 5891 | 6472 | AVX512_Intel_Linux | 13954 | 12277 | 11530 | 8712 | 6627 | 5898 | AVX512_Cloudflare | 12564 | 11050 | 10905 | 8152 | 5345 | 5202 | | 300 | 200 | 64 | 63 | 16 | ---------------------+-------+-------+-------+-------+-------+ This implementation | 4939 | 3688 | 1846 | 1821 | 738 | AVX512_Intel_OpenSSL | 4629 | 4532 | 2734 | 2332 | 1131 | AVX512_Intel_Linux | 4035 | 2966 | 1567 | 1330 | 639 | AVX512_Cloudflare | 3344 | 2485 | 1141 | 1127 | 456 | Table 4: VAES-based AES-256-GCM decryption throughput in MB/s, implementation name vs. 
message length in bytes: | 16384 | 4096 | 4095 | 1420 | 512 | 500 | ---------------------+-------+-------+-------+-------+-------+-------+ This implementation | 14276 | 13311 | 13007 | 11086 | 8268 | 8086 | AVX512_Intel_OpenSSL | 14067 | 12620 | 12421 | 9587 | 5954 | 7060 | AVX512_Intel_Linux | 14116 | 12795 | 11778 | 9269 | 7735 | 6455 | AVX512_Cloudflare | 13301 | 12018 | 11919 | 9182 | 7189 | 6726 | | 300 | 200 | 64 | 63 | 16 | ---------------------+-------+-------+-------+-------+-------+ This implementation | 6454 | 5020 | 2635 | 2602 | 1079 | AVX512_Intel_OpenSSL | 5184 | 5799 | 2957 | 2545 | 1228 | AVX512_Intel_Linux | 4394 | 4247 | 2235 | 1635 | 922 | AVX512_Cloudflare | 4289 | 3851 | 1435 | 1417 | 574 | So, usually my code is actually slightly faster than Intel's code, though the OpenSSL implementation has a slight edge on messages shorter than 256 bytes in this microbenchmark. (This also holds true when doing the same tests on AMD Zen 4.) It can be seen that the large code size (up to 94x larger!) of the Intel implementations doesn't seem to bring much benefit, so starting from scratch with much smaller code, as I've done, seems appropriate. The performance of my code on messages shorter than 256 bytes could be improved through a limited amount of unrolling, but it's unclear it would be worth it, given code size considerations (e.g. caches) that don't get measured in microbenchmarks. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/Kconfig | 1 + arch/x86/crypto/Makefile | 3 + arch/x86/crypto/aes-gcm-avx10-x86_64.S | 1222 ++++++++++++++++++++++++++++++++ arch/x86/crypto/aesni-intel_glue.c | 549 +++++++++++++- 4 files changed, 1759 insertions(+), 16 deletions(-) create mode 100644 arch/x86/crypto/aes-gcm-avx10-x86_64.S diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index c9e59589a1ce..24875e6295f2 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -18,6 +18,7 @@ config CRYPTO_AES_NI_INTEL depends on X86 select CRYPTO_AEAD select CRYPTO_LIB_AES + select CRYPTO_LIB_GF128MUL select CRYPTO_ALGAPI select CRYPTO_SKCIPHER select CRYPTO_SIMD diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 9c5ce5613738..a2a536b690fa 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -50,6 +50,9 @@ obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o \ aes_ctrby8_avx-x86_64.o aes-xts-avx-x86_64.o +ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy) +aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o +endif obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-avx10-x86_64.S new file mode 100644 index 000000000000..97e0ee515fc5 --- /dev/null +++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S @@ -0,0 +1,1222 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// VAES and VPCLMULQDQ optimized AES-GCM for x86_64 +// +// Copyright 2024 Google LLC +// +// Author: Eric Biggers +// +//------------------------------------------------------------------------------ +// +// This file is dual-licensed, meaning that you can use it under your choice of +// either of the following two licenses: +// +// Licensed under the Apache License 2.0 (the "License"). 
You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// or +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +//------------------------------------------------------------------------------ +// +// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that +// support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and +// either AVX512 or AVX10. Some of the functions, notably the encryption and +// decryption update functions which are the most performance-critical, are +// provided in two variants generated from a macro: one using 256-bit vectors +// (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512). The +// other, "shared" functions (vaes_avx10) use at most 256-bit vectors. +// +// The functions that use 512-bit vectors are intended for CPUs that support +// 512-bit vectors *and* where using them doesn't cause significant +// downclocking. They require the following CPU features: +// +// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512) +// +// The other functions require the following CPU features: +// +// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256) +// +// All functions use the "System V" ABI. The Windows ABI is not supported. +// +// Note that we use "avx10" in the names of the functions as a shorthand to +// really mean "AVX10 or a certain set of AVX512 features". Due to Intel's +// introduction of AVX512 and then its replacement by AVX10, there doesn't seem +// to be a simple way to name things that makes sense on all CPUs. +// +// Note that the macros that support both 256-bit and 512-bit vectors could +// fairly easily be changed to support 128-bit too. 
However, this would *not* +// be sufficient to allow the code to run on CPUs without AVX512 or AVX10, +// because the code heavily uses several features of these extensions other than +// the vector length: the increase in the number of SIMD registers from 16 to +// 32, masking support, and new instructions such as vpternlogd (which can do a +// three-argument XOR). These features are very useful for AES-GCM. + +#include + +.section .rodata +.p2align 6 + + // A shuffle mask that reflects the bytes of 16-byte blocks +.Lbswap_mask: + .octa 0x000102030405060708090a0b0c0d0e0f + + // This is the GHASH reducing polynomial without its constant term, i.e. + // x^128 + x^7 + x^2 + x, represented using the backwards mapping + // between bits and polynomial coefficients. + // + // Alternatively, it can be interpreted as the naturally-ordered + // representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the + // "reversed" GHASH reducing polynomial without its x^128 term. +.Lgfpoly: + .octa 0xc2000000000000000000000000000001 + + // Same as above, but with the (1 << 64) bit set. +.Lgfpoly_and_internal_carrybit: + .octa 0xc2000000000000010000000000000001 + + // The below constants are used for incrementing the counter blocks. + // ctr_pattern points to the four 128-bit values [0, 1, 2, 3]. + // inc_2blocks and inc_4blocks point to the single 128-bit values 2 and + // 4. Note that the same '2' is reused in ctr_pattern and inc_2blocks. +.Lctr_pattern: + .octa 0 + .octa 1 +.Linc_2blocks: + .octa 2 + .octa 3 +.Linc_4blocks: + .octa 4 + +// Number of powers of the hash key stored in the key struct. The powers are +// stored from highest (H^NUM_H_POWERS) to lowest (H^1). +#define NUM_H_POWERS 16 + +// Offset to AES key length (in bytes) in the key struct +#define OFFSETOF_AESKEYLEN 480 + +// Offset to start of hash key powers array in the key struct +#define OFFSETOF_H_POWERS 512 + +// Offset to end of hash key powers array in the key struct. +// +// This is immediately followed by three zeroized padding blocks, which are +// included so that partial vectors can be handled more easily. E.g. if VL=64 +// and two blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most +// padding blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded. +#define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16)) + +.text + +// Set the vector length in bytes. This sets the VL variable and defines +// register aliases V0-V31 that map to the ymm or zmm registers. +.macro _set_veclen vl + .set VL, \vl +.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 +.if VL == 32 + .set V\i, %ymm\i +.elseif VL == 64 + .set V\i, %zmm\i +.else + .error "Unsupported vector length" +.endif +.endr +.endm + +// The _ghash_mul_step macro does one step of GHASH multiplication of the +// 128-bit lanes of \a by the corresponding 128-bit lanes of \b and storing the +// reduced products in \dst. \t0, \t1, and \t2 are temporary registers of the +// same size as \a and \b. To complete all steps, this must invoked with \i=0 +// through \i=9. The division into steps allows users of this macro to +// optionally interleave the computation with other instructions. Users of this +// macro must preserve the parameter registers across steps. +// +// The multiplications are done in GHASH's representation of the finite field +// GF(2^128). Elements of GF(2^128) are represented as binary polynomials +// (i.e. polynomials whose coefficients are bits) modulo a reducing polynomial +// G. 
The GCM specification uses G = x^128 + x^7 + x^2 + x + 1. Addition is +// just XOR, while multiplication is more complex and has two parts: (a) do +// carryless multiplication of two 128-bit input polynomials to get a 256-bit +// intermediate product polynomial, and (b) reduce the intermediate product to +// 128 bits by adding multiples of G that cancel out terms in it. (Adding +// multiples of G doesn't change which field element the polynomial represents.) +// +// Unfortunately, the GCM specification maps bits to/from polynomial +// coefficients backwards from the natural order. In each byte it specifies the +// highest bit to be the lowest order polynomial coefficient, *not* the highest! +// This makes it nontrivial to work with the GHASH polynomials. We could +// reflect the bits, but x86 doesn't have an instruction that does that. +// +// Instead, we operate on the values without bit-reflecting them. This *mostly* +// just works, since XOR and carryless multiplication are symmetric with respect +// to bit order, but it has some consequences. First, due to GHASH's byte +// order, by skipping bit reflection, *byte* reflection becomes necessary to +// give the polynomial terms a consistent order. E.g., considering an N-bit +// value interpreted using the G = x^128 + x^7 + x^2 + x + 1 convention, bits 0 +// through N-1 of the byte-reflected value represent the coefficients of x^(N-1) +// through x^0, whereas bits 0 through N-1 of the non-byte-reflected value +// represent x^7...x^0, x^15...x^8, ..., x^(N-1)...x^(N-8) which can't be worked +// with. Fortunately, x86's vpshufb instruction can do byte reflection. +// +// Second, forgoing the bit reflection causes an extra multiple of x (still +// using the G = x^128 + x^7 + x^2 + x + 1 convention) to be introduced by each +// multiplication. This is because an M-bit by N-bit carryless multiplication +// really produces a (M+N-1)-bit product, but in practice it's zero-extended to +// M+N bits. In the G = x^128 + x^7 + x^2 + x + 1 convention, which maps bits +// to polynomial coefficients backwards, this zero-extension actually changes +// the product by introducing an extra factor of x. Therefore, users of this +// macro must ensure that one of the inputs has an extra factor of x^-1, i.e. +// the multiplicative inverse of x, to cancel out the extra x. +// +// Third, the backwards coefficients convention is just confusing to work with, +// since it makes "low" and "high" in the polynomial math mean the opposite of +// their normal meaning in computer programming. This can be solved by using an +// alternative interpretation: the polynomial coefficients are understood to be +// in the natural order, and the multiplication is actually \a * \b * x^-128 mod +// x^128 + x^127 + x^126 + x^121 + 1. This doesn't change the inputs, outputs, +// or the implementation at all; it just changes the mathematical interpretation +// of what each instruction is doing. Starting from here, we'll use this +// alternative interpretation, as it's easier to understand the code that way. +// +// Moving onto the implementation, the vpclmulqdq instruction does 64 x 64 => +// 128-bit carryless multiplication, so we break the 128 x 128 multiplication +// into parts as follows (the _L and _H suffixes denote low and high 64 bits): +// +// LO = a_L * b_L +// MI = (a_L * b_H) + (a_H * b_L) +// HI = a_H * b_H +// +// The 256-bit product is x^128*HI + x^64*MI + LO. LO, MI, and HI are 128-bit. +// Note that MI "overlaps" with LO and HI. 
We don't consolidate MI into LO and +// HI right away, since the way the reduction works makes that unnecessary. +// +// For the reduction, we cancel out the low 128 bits by adding multiples of G = +// x^128 + x^127 + x^126 + x^121 + 1. This is done by two iterations, each of +// which cancels out the next lowest 64 bits. Consider a value x^64*A + B, +// where A and B are 128-bit. Adding B_L*G to that value gives: +// +// x^64*A + B + B_L*G +// = x^64*A + x^64*B_H + B_L + B_L*(x^128 + x^127 + x^126 + x^121 + 1) +// = x^64*A + x^64*B_H + B_L + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L +// = x^64*A + x^64*B_H + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L + B_L +// = x^64*(A + B_H + x^64*B_L + B_L*(x^63 + x^62 + x^57)) +// +// So: if we sum A, B with its halves swapped, and the low half of B times x^63 +// + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the +// original value x^64*A + B. I.e., the low 64 bits got canceled out. +// +// We just need to apply this twice: first to fold LO into MI, and second to +// fold the updated MI into HI. +// +// The needed three-argument XORs are done using the vpternlogd instruction with +// immediate 0x96, since this is faster than two vpxord instructions. +// +// A potential optimization, assuming that b is fixed per-key (if a is fixed +// per-key it would work the other way around), is to use one iteration of the +// reduction described above to precompute a value c such that x^64*c = b mod G, +// and then multiply a_L by c (and implicitly by x^64) instead of by b: +// +// MI = (a_L * c_L) + (a_H * b_L) +// HI = (a_L * c_H) + (a_H * b_H) +// +// This would eliminate the LO part of the intermediate product, which would +// eliminate the need to fold LO into MI. This would save two instructions, +// including a vpclmulqdq. However, we currently don't use this optimization +// because it would require twice as many per-key precomputed values. +// +// Using Karatsuba multiplication instead of "schoolbook" multiplication +// similarly would save a vpclmulqdq but does not seem to be worth it. +.macro _ghash_mul_step i, a, b, dst, gfpoly, t0, t1, t2 +.if \i == 0 + vpclmulqdq $0x00, \a, \b, \t0 // LO = a_L * b_L + vpclmulqdq $0x01, \a, \b, \t1 // MI_0 = a_L * b_H +.elseif \i == 1 + vpclmulqdq $0x10, \a, \b, \t2 // MI_1 = a_H * b_L +.elseif \i == 2 + vpxord \t2, \t1, \t1 // MI = MI_0 + MI_1 +.elseif \i == 3 + vpclmulqdq $0x01, \t0, \gfpoly, \t2 // LO_L*(x^63 + x^62 + x^57) +.elseif \i == 4 + vpshufd $0x4e, \t0, \t0 // Swap halves of LO +.elseif \i == 5 + vpternlogd $0x96, \t2, \t0, \t1 // Fold LO into MI +.elseif \i == 6 + vpclmulqdq $0x11, \a, \b, \dst // HI = a_H * b_H +.elseif \i == 7 + vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) +.elseif \i == 8 + vpshufd $0x4e, \t1, \t1 // Swap halves of MI +.elseif \i == 9 + vpternlogd $0x96, \t0, \t1, \dst // Fold MI into HI +.endif +.endm + +// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store +// the reduced products in \dst. See _ghash_mul_step for full explanation. +.macro _ghash_mul a, b, dst, gfpoly, t0, t1, t2 +.irp i, 0,1,2,3,4,5,6,7,8,9 + _ghash_mul_step \i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2 +.endr +.endm + +// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the +// *unreduced* products to \lo, \mi, and \hi. 
+.macro _ghash_mul_noreduce a, b, lo, mi, hi, t0, t1, t2, t3 + vpclmulqdq $0x00, \a, \b, \t0 // a_L * b_L + vpclmulqdq $0x01, \a, \b, \t1 // a_L * b_H + vpclmulqdq $0x10, \a, \b, \t2 // a_H * b_L + vpclmulqdq $0x11, \a, \b, \t3 // a_H * b_H + vpxord \t0, \lo, \lo + vpternlogd $0x96, \t2, \t1, \mi + vpxord \t3, \hi, \hi +.endm + +// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit +// reduced products in \hi. See _ghash_mul_step for explanation of reduction. +.macro _ghash_reduce lo, mi, hi, gfpoly, t0 + vpclmulqdq $0x01, \lo, \gfpoly, \t0 + vpshufd $0x4e, \lo, \lo + vpternlogd $0x96, \t0, \lo, \mi + vpclmulqdq $0x01, \mi, \gfpoly, \t0 + vpshufd $0x4e, \mi, \mi + vpternlogd $0x96, \t0, \mi, \hi +.endm + +// void aes_gcm_precompute_##suffix(struct aes_gcm_key_avx10 *key); +// +// Given the expanded AES key |key->aes_key|, this function derives the GHASH +// subkey and initializes |key->ghash_key_powers| with powers of it. +// +// The number of key powers initialized is NUM_H_POWERS, and they are stored in +// the order H^NUM_H_POWERS to H^1. The zeroized padding blocks after the key +// powers themselves are also initialized. +// +// This macro supports both VL=32 and VL=64. _set_veclen must have been invoked +// with the desired length. In the VL=32 case, the function computes twice as +// many key powers than are actually used by the VL=32 GCM update functions. +// This is done to keep the key format the same regardless of vector length. +.macro _aes_gcm_precompute + + // Function arguments + .set KEY, %rdi + + // Additional local variables. V0-V2 and %rax are used as temporaries. + .set POWERS_PTR, %rsi + .set RNDKEYLAST_PTR, %rdx + .set H_CUR, V3 + .set H_CUR_YMM, %ymm3 + .set H_CUR_XMM, %xmm3 + .set H_INC, V4 + .set H_INC_YMM, %ymm4 + .set H_INC_XMM, %xmm4 + .set GFPOLY, V5 + .set GFPOLY_YMM, %ymm5 + .set GFPOLY_XMM, %xmm5 + + // Get pointer to lowest set of key powers (located at end of array). + lea OFFSETOFEND_H_POWERS-VL(KEY), POWERS_PTR + + // Encrypt an all-zeroes block to get the raw hash subkey. + movl OFFSETOF_AESKEYLEN(KEY), %eax + lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR + vmovdqu (KEY), %xmm0 // Zero-th round key XOR all-zeroes block + add $16, KEY +1: + vaesenc (KEY), %xmm0, %xmm0 + add $16, KEY + cmp KEY, RNDKEYLAST_PTR + jne 1b + vaesenclast (RNDKEYLAST_PTR), %xmm0, %xmm0 + + // Reflect the bytes of the raw hash subkey. + vpshufb .Lbswap_mask(%rip), %xmm0, H_CUR_XMM + + // Zeroize the padding blocks. + vpxor %xmm0, %xmm0, %xmm0 + vmovdqu %ymm0, VL(POWERS_PTR) + vmovdqu %xmm0, VL+2*16(POWERS_PTR) + + // Finish preprocessing the first key power, H^1. Since this GHASH + // implementation operates directly on values with the backwards bit + // order specified by the GCM standard, it's necessary to preprocess the + // raw key as follows. First, reflect its bytes. Second, multiply it + // by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards + // interpretation of polynomial coefficients), which can also be + // interpreted as multiplication by x mod x^128 + x^127 + x^126 + x^121 + // + 1 using the alternative, natural interpretation of polynomial + // coefficients. For details, see the comment above _ghash_mul_step. + // + // Either way, for the multiplication the concrete operation performed + // is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2 + // << 120) | 1 if a 1 bit was carried out. 
However, there's no 128-bit + // wide shift instruction, so instead double each of the two 64-bit + // halves and incorporate the internal carry bit into the value XOR'd. + vpshufd $0xd3, H_CUR_XMM, %xmm0 + vpsrad $31, %xmm0, %xmm0 + vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM + vpand .Lgfpoly_and_internal_carrybit(%rip), %xmm0, %xmm0 + vpxor %xmm0, H_CUR_XMM, H_CUR_XMM + + // Load the gfpoly constant. + vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY + + // Square H^1 to get H^2. + // + // Note that as with H^1, all higher key powers also need an extra + // factor of x^-1 (or x using the natural interpretation). Nothing + // special needs to be done to make this happen, though: H^1 * H^1 would + // end up with two factors of x^-1, but the multiplication consumes one. + // So the product H^2 ends up with the desired one factor of x^-1. + _ghash_mul H_CUR_XMM, H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, \ + %xmm0, %xmm1, %xmm2 + + // Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2]. + vinserti128 $1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM + vinserti128 $1, H_INC_XMM, H_INC_YMM, H_INC_YMM + +.if VL == 64 + // Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4]. + _ghash_mul H_INC_YMM, H_CUR_YMM, H_INC_YMM, GFPOLY_YMM, \ + %ymm0, %ymm1, %ymm2 + vinserti64x4 $1, H_CUR_YMM, H_INC, H_CUR + vshufi64x2 $0, H_INC, H_INC, H_INC +.endif + + // Store the lowest set of key powers. + vmovdqu8 H_CUR, (POWERS_PTR) + + // Compute and store the remaining key powers. With VL=32, repeatedly + // multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)]. + // With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by + // [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)]. + mov $(NUM_H_POWERS*16/VL) - 1, %eax +.Lprecompute_next\@: + sub $VL, POWERS_PTR + _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, V0, V1, V2 + vmovdqu8 H_CUR, (POWERS_PTR) + dec %eax + jnz .Lprecompute_next\@ + + vzeroupper // This is needed after using ymm or zmm registers. + RET +.endm + +// XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store +// the result in \dst_xmm. This implicitly zeroizes the other lanes of dst. +.macro _horizontal_xor src, src_xmm, dst_xmm, t0_xmm, t1_xmm, t2_xmm + vextracti32x4 $1, \src, \t0_xmm +.if VL == 32 + vpxord \t0_xmm, \src_xmm, \dst_xmm +.elseif VL == 64 + vextracti32x4 $2, \src, \t1_xmm + vextracti32x4 $3, \src, \t2_xmm + vpxord \t0_xmm, \src_xmm, \dst_xmm + vpternlogd $0x96, \t1_xmm, \t2_xmm, \dst_xmm +.else + .error "Unsupported vector length" +.endif +.endm + +// Do one step of the GHASH update of the data blocks given in the vector +// registers GHASHDATA[0-3]. \i specifies the step to do, 0 through 9. The +// division into steps allows users of this macro to optionally interleave the +// computation with other instructions. This macro uses the vector register +// GHASH_ACC as input/output; GHASHDATA[0-3] as inputs that are clobbered; +// H_POW[4-1], GFPOLY, and BSWAP_MASK as inputs that aren't clobbered; and +// GHASHTMP[0-2] as temporaries. This macro handles the byte-reflection of the +// data blocks. The parameter registers must be preserved across steps. +// +// The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) + +// H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the +// operations are vectorized operations on vectors of 16-byte blocks. 
E.g., +// with VL=32 there are 2 blocks per vector and the vectorized terms correspond +// to the following non-vectorized terms: +// +// H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0) +// H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3 +// H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5 +// H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7 +// +// With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15. +// +// More concretely, this code does: +// - Do vectorized "schoolbook" multiplications to compute the intermediate +// 256-bit product of each block and its corresponding hash key power. +// There are 4*VL/16 of these intermediate products. +// - Sum (XOR) the intermediate 256-bit products across vectors. This leaves +// VL/16 256-bit intermediate values. +// - Do a vectorized reduction of these 256-bit intermediate values to +// 128-bits each. This leaves VL/16 128-bit intermediate values. +// - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM. +// +// See _ghash_mul_step for the full explanation of the operations performed for +// each individual finite field multiplication and reduction. +.macro _ghash_step_4x i +.if \i == 0 + vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0 + vpxord GHASH_ACC, GHASHDATA0, GHASHDATA0 + vpshufb BSWAP_MASK, GHASHDATA1, GHASHDATA1 + vpshufb BSWAP_MASK, GHASHDATA2, GHASHDATA2 +.elseif \i == 1 + vpshufb BSWAP_MASK, GHASHDATA3, GHASHDATA3 + vpclmulqdq $0x00, H_POW4, GHASHDATA0, GHASH_ACC // LO_0 + vpclmulqdq $0x00, H_POW3, GHASHDATA1, GHASHTMP0 // LO_1 + vpclmulqdq $0x00, H_POW2, GHASHDATA2, GHASHTMP1 // LO_2 +.elseif \i == 2 + vpxord GHASHTMP0, GHASH_ACC, GHASH_ACC // sum(LO_{1,0}) + vpclmulqdq $0x00, H_POW1, GHASHDATA3, GHASHTMP2 // LO_3 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASH_ACC // LO = sum(LO_{3,2,1,0}) + vpclmulqdq $0x01, H_POW4, GHASHDATA0, GHASHTMP0 // MI_0 +.elseif \i == 3 + vpclmulqdq $0x01, H_POW3, GHASHDATA1, GHASHTMP1 // MI_1 + vpclmulqdq $0x01, H_POW2, GHASHDATA2, GHASHTMP2 // MI_2 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{2,1,0}) + vpclmulqdq $0x01, H_POW1, GHASHDATA3, GHASHTMP1 // MI_3 +.elseif \i == 4 + vpclmulqdq $0x10, H_POW4, GHASHDATA0, GHASHTMP2 // MI_4 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{4,3,2,1,0}) + vpclmulqdq $0x10, H_POW3, GHASHDATA1, GHASHTMP1 // MI_5 + vpclmulqdq $0x10, H_POW2, GHASHDATA2, GHASHTMP2 // MI_6 +.elseif \i == 5 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{6,5,4,3,2,1,0}) + vpclmulqdq $0x01, GHASH_ACC, GFPOLY, GHASHTMP2 // LO_L*(x^63 + x^62 + x^57) + vpclmulqdq $0x10, H_POW1, GHASHDATA3, GHASHTMP1 // MI_7 + vpxord GHASHTMP1, GHASHTMP0, GHASHTMP0 // MI = sum(MI_{7,6,5,4,3,2,1,0}) +.elseif \i == 6 + vpshufd $0x4e, GHASH_ACC, GHASH_ACC // Swap halves of LO + vpclmulqdq $0x11, H_POW4, GHASHDATA0, GHASHDATA0 // HI_0 + vpclmulqdq $0x11, H_POW3, GHASHDATA1, GHASHDATA1 // HI_1 + vpclmulqdq $0x11, H_POW2, GHASHDATA2, GHASHDATA2 // HI_2 +.elseif \i == 7 + vpternlogd $0x96, GHASHTMP2, GHASH_ACC, GHASHTMP0 // Fold LO into MI + vpclmulqdq $0x11, H_POW1, GHASHDATA3, GHASHDATA3 // HI_3 + vpternlogd $0x96, GHASHDATA2, GHASHDATA1, GHASHDATA0 // sum(HI_{2,1,0}) + vpclmulqdq $0x01, GHASHTMP0, GFPOLY, GHASHTMP1 // MI_L*(x^63 + x^62 + x^57) +.elseif \i == 8 + vpxord GHASHDATA3, GHASHDATA0, GHASH_ACC // HI = sum(HI_{3,2,1,0}) + vpshufd $0x4e, GHASHTMP0, GHASHTMP0 // Swap halves of MI + vpternlogd $0x96, GHASHTMP1, GHASHTMP0, GHASH_ACC // Fold MI into HI +.elseif \i == 9 + _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, 
GHASH_ACC_XMM, \ + GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM +.endif +.endm + +// Do one non-last round of AES encryption on the counter blocks in V0-V3 using +// the round key that has been broadcast to all 128-bit lanes of \round_key. +.macro _vaesenc_4x round_key + vaesenc \round_key, V0, V0 + vaesenc \round_key, V1, V1 + vaesenc \round_key, V2, V2 + vaesenc \round_key, V3, V3 +.endm + +// Start the AES encryption of four vectors of counter blocks. +.macro _ctr_begin_4x + + // Increment LE_CTR four times to generate four vectors of little-endian + // counter blocks, swap each to big-endian, and store them in V0-V3. + vpshufb BSWAP_MASK, LE_CTR, V0 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + vpshufb BSWAP_MASK, LE_CTR, V1 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + vpshufb BSWAP_MASK, LE_CTR, V2 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + vpshufb BSWAP_MASK, LE_CTR, V3 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + + // AES "round zero": XOR in the zero-th round key. + vpxord RNDKEY0, V0, V0 + vpxord RNDKEY0, V1, V1 + vpxord RNDKEY0, V2, V2 + vpxord RNDKEY0, V3, V3 +.endm + +// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// const u8 *src, u8 *dst, int datalen); +// +// This macro generates a GCM encryption or decryption update function with the +// above prototype (with \enc selecting which one). This macro supports both +// VL=32 and VL=64. _set_veclen must have been invoked with the desired length. +// +// This function computes the next portion of the CTR keystream, XOR's it with +// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted +// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the +// next |datalen| ciphertext bytes. +// +// |datalen| must be a multiple of 16, except on the last call where it can be +// any length. The caller must do any buffering needed to ensure this. Both +// in-place and out-of-place en/decryption are supported. +// +// |le_ctr| must give the current counter in little-endian format. For a new +// message, the low word of the counter must be 2. This function loads the +// counter from |le_ctr| and increments the loaded counter as needed, but it +// does *not* store the updated counter back to |le_ctr|. The caller must +// update |le_ctr| if any more data segments follow. Internally, only the low +// 32-bit word of the counter is incremented, following the GCM standard. +.macro _aes_gcm_update enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi + .set GHASH_ACC_PTR, %rdx + .set SRC, %rcx + .set DST, %r8 + .set DATALEN, %r9d + .set DATALEN64, %r9 // Zero-extend DATALEN before using! + + // Additional local variables + + // %rax and %k1 are used as temporary registers. LE_CTR_PTR is also + // available as a temporary register after the counter is loaded. + + // AES key length in bytes + .set AESKEYLEN, %r10d + .set AESKEYLEN64, %r10 + + // Pointer to the last AES round key for the chosen AES variant + .set RNDKEYLAST_PTR, %r11 + + // In the main loop, V0-V3 are used as AES input and output. Elsewhere + // they are used as temporary registers. + + // GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data. + .set GHASHDATA0, V4 + .set GHASHDATA0_XMM, %xmm4 + .set GHASHDATA1, V5 + .set GHASHDATA1_XMM, %xmm5 + .set GHASHDATA2, V6 + .set GHASHDATA2_XMM, %xmm6 + .set GHASHDATA3, V7 + + // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values + // using vpshufb, copied to all 128-bit lanes. 
+ .set BSWAP_MASK, V8 + + // RNDKEY temporarily holds the next AES round key. + .set RNDKEY, V9 + + // GHASH_ACC is the accumulator variable for GHASH. When fully reduced, + // only the lowest 128-bit lane can be nonzero. When not fully reduced, + // more than one lane may be used, and they need to be XOR'd together. + .set GHASH_ACC, V10 + .set GHASH_ACC_XMM, %xmm10 + + // LE_CTR_INC is the vector of 32-bit words that need to be added to a + // vector of little-endian counter blocks to advance it forwards. + .set LE_CTR_INC, V11 + + // LE_CTR contains the next set of little-endian counter blocks. + .set LE_CTR, V12 + + // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys, + // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key, + // RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last. + .set RNDKEY0, V13 + .set RNDKEYLAST, V14 + .set RNDKEY_M9, V15 + .set RNDKEY_M8, V16 + .set RNDKEY_M7, V17 + .set RNDKEY_M6, V18 + .set RNDKEY_M5, V19 + + // RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with + // the corresponding block of source data. This is useful because + // vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), and key ^ b can + // be computed in parallel with the AES rounds. + .set RNDKEYLAST0, V20 + .set RNDKEYLAST1, V21 + .set RNDKEYLAST2, V22 + .set RNDKEYLAST3, V23 + + // GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These + // cannot coincide with anything used for AES encryption, since for + // performance reasons GHASH and AES encryption are interleaved. + .set GHASHTMP0, V24 + .set GHASHTMP1, V25 + .set GHASHTMP2, V26 + + // H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1. The + // descending numbering reflects the order of the key powers. + .set H_POW4, V27 + .set H_POW3, V28 + .set H_POW2, V29 + .set H_POW1, V30 + + // GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes. + .set GFPOLY, V31 + + // Load some constants. + vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK + vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY + + // Load the GHASH accumulator and the starting counter. + vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM + vbroadcasti32x4 (LE_CTR_PTR), LE_CTR + + // Load the AES key length in bytes. + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + + // Make RNDKEYLAST_PTR point to the last AES round key. This is the + // round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256 + // respectively. Then load the zero-th and last round keys. + lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR + vbroadcasti32x4 (KEY), RNDKEY0 + vbroadcasti32x4 (RNDKEYLAST_PTR), RNDKEYLAST + + // Finish initializing LE_CTR by adding [0, 1, ...] to its low words. + vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR + + // Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes. +.if VL == 32 + vbroadcasti32x4 .Linc_2blocks(%rip), LE_CTR_INC +.elseif VL == 64 + vbroadcasti32x4 .Linc_4blocks(%rip), LE_CTR_INC +.else + .error "Unsupported vector length" +.endif + + // If there are at least 4*VL bytes of data, then continue into the loop + // that processes 4*VL bytes of data at a time. Otherwise skip it. + // + // Pre-subtracting 4*VL from DATALEN saves an instruction from the main + // loop and also ensures that at least one write always occurs to + // DATALEN, zero-extending it and allowing DATALEN64 to be used later. + sub $4*VL, DATALEN + jl .Lcrypt_loop_4x_done\@ + + // Load powers of the hash key. 
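+	// (With VL=32 these are [H^8,H^7], [H^6,H^5], [H^4,H^3], and [H^2,H^1];
+	// with VL=64 they are [H^16..H^13] down to [H^4..H^1].  The key powers
+	// are stored in descending order ending with H^1, so the higher power
+	// of each pair or quad ends up in the lower 128-bit lane.)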
+ vmovdqu8 OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4 + vmovdqu8 OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3 + vmovdqu8 OFFSETOFEND_H_POWERS-2*VL(KEY), H_POW2 + vmovdqu8 OFFSETOFEND_H_POWERS-1*VL(KEY), H_POW1 + + // Main loop: en/decrypt and hash 4 vectors at a time. + // + // When possible, interleave the AES encryption of the counter blocks + // with the GHASH update of the ciphertext blocks. This improves + // performance on many CPUs because the execution ports used by the VAES + // instructions often differ from those used by vpclmulqdq and other + // instructions used in GHASH. For example, many Intel CPUs dispatch + // vaesenc to ports 0 and 1 and vpclmulqdq to port 5. + // + // The interleaving is easiest to do during decryption, since during + // decryption the ciphertext blocks are immediately available. For + // encryption, instead encrypt the first set of blocks, then hash those + // blocks while encrypting the next set of blocks, repeat that as + // needed, and finally hash the last set of blocks. + +.if \enc + // Encrypt the first 4 vectors of plaintext blocks. Leave the resulting + // ciphertext in GHASHDATA[0-3] for GHASH. + _ctr_begin_4x + lea 16(KEY), %rax +1: + vbroadcasti32x4 (%rax), RNDKEY + _vaesenc_4x RNDKEY + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne 1b + vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0 + vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1 + vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2 + vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3 + vaesenclast RNDKEYLAST0, V0, GHASHDATA0 + vaesenclast RNDKEYLAST1, V1, GHASHDATA1 + vaesenclast RNDKEYLAST2, V2, GHASHDATA2 + vaesenclast RNDKEYLAST3, V3, GHASHDATA3 + vmovdqu8 GHASHDATA0, 0*VL(DST) + vmovdqu8 GHASHDATA1, 1*VL(DST) + vmovdqu8 GHASHDATA2, 2*VL(DST) + vmovdqu8 GHASHDATA3, 3*VL(DST) + add $4*VL, SRC + add $4*VL, DST + sub $4*VL, DATALEN + jl .Lghash_last_ciphertext_4x\@ +.endif + + // Cache as many additional AES round keys as possible. +.irp i, 9,8,7,6,5 + vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY_M\i +.endr + +.Lcrypt_loop_4x\@: + + // If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. If + // encrypting, GHASHDATA[0-3] already contain the previous ciphertext. +.if !\enc + vmovdqu8 0*VL(SRC), GHASHDATA0 + vmovdqu8 1*VL(SRC), GHASHDATA1 + vmovdqu8 2*VL(SRC), GHASHDATA2 + vmovdqu8 3*VL(SRC), GHASHDATA3 +.endif + + // Start the AES encryption of the counter blocks. + _ctr_begin_4x + cmp $24, AESKEYLEN + jl 128f // AES-128? + je 192f // AES-192? + // AES-256 + vbroadcasti32x4 -13*16(RNDKEYLAST_PTR), RNDKEY + _vaesenc_4x RNDKEY + vbroadcasti32x4 -12*16(RNDKEYLAST_PTR), RNDKEY + _vaesenc_4x RNDKEY +192: + vbroadcasti32x4 -11*16(RNDKEYLAST_PTR), RNDKEY + _vaesenc_4x RNDKEY + vbroadcasti32x4 -10*16(RNDKEYLAST_PTR), RNDKEY + _vaesenc_4x RNDKEY +128: + + // XOR the source data with the last round key, saving the result in + // RNDKEYLAST[0-3]. This reduces latency by taking advantage of the + // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). +.if \enc + vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0 + vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1 + vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2 + vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3 +.else + vpxord GHASHDATA0, RNDKEYLAST, RNDKEYLAST0 + vpxord GHASHDATA1, RNDKEYLAST, RNDKEYLAST1 + vpxord GHASHDATA2, RNDKEYLAST, RNDKEYLAST2 + vpxord GHASHDATA3, RNDKEYLAST, RNDKEYLAST3 +.endif + + // Finish the AES encryption of the counter blocks in V0-V3, interleaved + // with the GHASH update of the ciphertext blocks in GHASHDATA[0-3]. 
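+	// (Schedule: GHASH steps 0-8 are paired one-per-round with the 9
+	// remaining AES rounds, i.e. the 5 cached keys RNDKEY_M9..RNDKEY_M5
+	// followed by 4 more keys loaded from memory, and the final GHASH
+	// step 9 runs by itself just before the vaesenclast round below.)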
+.irp i, 9,8,7,6,5 + _vaesenc_4x RNDKEY_M\i + _ghash_step_4x (9 - \i) +.endr +.irp i, 4,3,2,1 + vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY + _vaesenc_4x RNDKEY + _ghash_step_4x (9 - \i) +.endr + _ghash_step_4x 9 + + // Do the last AES round. This handles the XOR with the source data + // too, as per the optimization described above. + vaesenclast RNDKEYLAST0, V0, GHASHDATA0 + vaesenclast RNDKEYLAST1, V1, GHASHDATA1 + vaesenclast RNDKEYLAST2, V2, GHASHDATA2 + vaesenclast RNDKEYLAST3, V3, GHASHDATA3 + + // Store the en/decrypted data to DST. + vmovdqu8 GHASHDATA0, 0*VL(DST) + vmovdqu8 GHASHDATA1, 1*VL(DST) + vmovdqu8 GHASHDATA2, 2*VL(DST) + vmovdqu8 GHASHDATA3, 3*VL(DST) + + add $4*VL, SRC + add $4*VL, DST + sub $4*VL, DATALEN + jge .Lcrypt_loop_4x\@ + +.if \enc +.Lghash_last_ciphertext_4x\@: + // Update GHASH with the last set of ciphertext blocks. +.irp i, 0,1,2,3,4,5,6,7,8,9 + _ghash_step_4x \i +.endr +.endif + +.Lcrypt_loop_4x_done\@: + + // Undo the extra subtraction by 4*VL and check whether data remains. + add $4*VL, DATALEN + jz .Ldone\@ + + // The data length isn't a multiple of 4*VL. Process the remaining data + // of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time. + // Going one vector at a time may seem inefficient compared to having + // separate code paths for each possible number of vectors remaining. + // However, using a loop keeps the code size down, and it performs + // surprising well; modern CPUs will start executing the next iteration + // before the previous one finishes and also predict the number of loop + // iterations. For a similar reason, we roll up the AES rounds. + // + // On the last iteration, the remaining length may be less than VL. + // Handle this using masking. + // + // Since there are enough key powers available for all remaining data, + // there is no need to do a GHASH reduction after each iteration. + // Instead, multiply each remaining block by its own key power, and only + // do a GHASH reduction at the very end. + + // Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N + // is the number of blocks that remain. + .set POWERS_PTR, LE_CTR_PTR // LE_CTR_PTR is free to be reused. + mov DATALEN, %eax + neg %rax + and $~15, %rax // -round_up(DATALEN, 16) + lea OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR + + // Start collecting the unreduced GHASH intermediate value LO, MI, HI. + .set LO, GHASHDATA0 + .set LO_XMM, GHASHDATA0_XMM + .set MI, GHASHDATA1 + .set MI_XMM, GHASHDATA1_XMM + .set HI, GHASHDATA2 + .set HI_XMM, GHASHDATA2_XMM + vpxor LO_XMM, LO_XMM, LO_XMM + vpxor MI_XMM, MI_XMM, MI_XMM + vpxor HI_XMM, HI_XMM, HI_XMM + +.Lcrypt_loop_1x\@: + + // Select the appropriate mask for this iteration: all 1's if + // DATALEN >= VL, otherwise DATALEN 1's. Do this branchlessly using the + // bzhi instruction from BMI2. (This relies on DATALEN <= 255.) +.if VL < 64 + mov $-1, %eax + bzhi DATALEN, %eax, %eax + kmovd %eax, %k1 +.else + mov $-1, %rax + bzhi DATALEN64, %rax, %rax + kmovq %rax, %k1 +.endif + + // Encrypt a vector of counter blocks. This does not need to be masked. + vpshufb BSWAP_MASK, LE_CTR, V0 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + vpxord RNDKEY0, V0, V0 + lea 16(KEY), %rax +1: + vbroadcasti32x4 (%rax), RNDKEY + vaesenc RNDKEY, V0, V0 + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne 1b + vaesenclast RNDKEYLAST, V0, V0 + + // XOR the data with the appropriate number of keystream bytes. 
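+	// (On iterations where fewer than VL bytes remain, %k1 has only DATALEN
+	// one bits, so the masked load zero-pads the partial vector and the
+	// masked store writes back only the bytes that remain, without touching
+	// memory past the end of the buffers.  Otherwise the mask is all-ones.)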
+ vmovdqu8 (SRC), V1{%k1}{z} + vpxord V1, V0, V0 + vmovdqu8 V0, (DST){%k1} + + // Update GHASH with the ciphertext block(s), without reducing. + // + // In the case of DATALEN < VL, the ciphertext is zero-padded to VL. + // (If decrypting, it's done by the above masked load. If encrypting, + // it's done by the below masked register-to-register move.) Note that + // if DATALEN <= VL - 16, there will be additional padding beyond the + // padding of the last block specified by GHASH itself; i.e., there may + // be whole block(s) that get processed by the GHASH multiplication and + // reduction instructions but should not actually be included in the + // GHASH. However, any such blocks are all-zeroes, and the values that + // they're multiplied with are also all-zeroes. Therefore they just add + // 0 * 0 = 0 to the final GHASH result, which makes no difference. + vmovdqu8 (POWERS_PTR), H_POW1 +.if \enc + vmovdqu8 V0, V1{%k1}{z} +.endif + vpshufb BSWAP_MASK, V1, V0 + vpxord GHASH_ACC, V0, V0 + _ghash_mul_noreduce H_POW1, V0, LO, MI, HI, GHASHDATA3, V1, V2, V3 + vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM + + add $VL, POWERS_PTR + add $VL, SRC + add $VL, DST + sub $VL, DATALEN + jg .Lcrypt_loop_1x\@ + + // Finally, do the GHASH reduction. + _ghash_reduce LO, MI, HI, GFPOLY, V0 + _horizontal_xor HI, HI_XMM, GHASH_ACC_XMM, %xmm0, %xmm1, %xmm2 + +.Ldone\@: + // Store the updated GHASH accumulator back to memory. + vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) + + vzeroupper // This is needed after using ymm or zmm registers. + RET +.endm + +// void aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen); +// bool aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, +// const u32 le_ctr[4], +// const u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen, +// const u8 tag[16], int taglen); +// +// This macro generates one of the above two functions (with \enc selecting +// which one). Both functions finish computing the GCM authentication tag by +// updating GHASH with the lengths block and encrypting the GHASH accumulator. +// |total_aadlen| and |total_datalen| must be the total length of the additional +// authenticated data and the en/decrypted data in bytes, respectively. +// +// The encryption function then stores the full-length (16-byte) computed +// authentication tag to |ghash_acc|. The decryption function instead loads the +// expected authentication tag (the one that was transmitted) from the 16-byte +// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the +// computed tag in constant time, and returns true if and only if they match. +.macro _aes_gcm_final enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi + .set GHASH_ACC_PTR, %rdx + .set TOTAL_AADLEN, %rcx + .set TOTAL_DATALEN, %r8 + .set TAG, %r9 + .set TAGLEN, %r10d // Originally at 8(%rsp) + + // Additional local variables. + // %rax, %xmm0-%xmm3, and %k1 are used as temporary registers. + .set AESKEYLEN, %r11d + .set AESKEYLEN64, %r11 + .set GFPOLY, %xmm4 + .set BSWAP_MASK, %xmm5 + .set LE_CTR, %xmm6 + .set GHASH_ACC, %xmm7 + .set H_POW1, %xmm8 + + // Load some constants. + vmovdqa .Lgfpoly(%rip), GFPOLY + vmovdqa .Lbswap_mask(%rip), BSWAP_MASK + + // Load the AES key length in bytes. + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + + // Set up a counter block with 1 in the low 32-bit word. This is the + // counter that produces the ciphertext needed to encrypt the auth tag. 
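+	// (Per the GCM spec, counter value 1 is reserved for encrypting the
+	// tag; counter values 2 and up are used for the data, which is why the
+	// update functions start with a low counter word of 2.)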
+ // GFPOLY has 1 in the low word, so grab the 1 from there using a blend. + vpblendd $0xe, (LE_CTR_PTR), GFPOLY, LE_CTR + + // Build the lengths block and XOR it with the GHASH accumulator. + // Although the lengths block is defined as the AAD length followed by + // the en/decrypted data length, both in big-endian byte order, a byte + // reflection of the full block is needed because of the way we compute + // GHASH (see _ghash_mul_step). By using little-endian values in the + // opposite order, we avoid having to reflect any bytes here. + vmovq TOTAL_DATALEN, %xmm0 + vpinsrq $1, TOTAL_AADLEN, %xmm0, %xmm0 + vpsllq $3, %xmm0, %xmm0 // Bytes to bits + vpxor (GHASH_ACC_PTR), %xmm0, GHASH_ACC + + // Load the first hash key power (H^1), which is stored last. + vmovdqu8 OFFSETOFEND_H_POWERS-16(KEY), H_POW1 + +.if !\enc + // Prepare a mask of TAGLEN one bits. + movl 8(%rsp), TAGLEN + mov $-1, %eax + bzhi TAGLEN, %eax, %eax + kmovd %eax, %k1 +.endif + + // Make %rax point to the last AES round key for the chosen AES variant. + lea 6*16(KEY,AESKEYLEN64,4), %rax + + // Start the AES encryption of the counter block by swapping the counter + // block to big-endian and XOR-ing it with the zero-th AES round key. + vpshufb BSWAP_MASK, LE_CTR, %xmm0 + vpxor (KEY), %xmm0, %xmm0 + + // Complete the AES encryption and multiply GHASH_ACC by H^1. + // Interleave the AES and GHASH instructions to improve performance. + cmp $24, AESKEYLEN + jl 128f // AES-128? + je 192f // AES-192? + // AES-256 + vaesenc -13*16(%rax), %xmm0, %xmm0 + vaesenc -12*16(%rax), %xmm0, %xmm0 +192: + vaesenc -11*16(%rax), %xmm0, %xmm0 + vaesenc -10*16(%rax), %xmm0, %xmm0 +128: +.irp i, 0,1,2,3,4,5,6,7,8 + _ghash_mul_step \i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + %xmm1, %xmm2, %xmm3 + vaesenc (\i-9)*16(%rax), %xmm0, %xmm0 +.endr + _ghash_mul_step 9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + %xmm1, %xmm2, %xmm3 + + // Undo the byte reflection of the GHASH accumulator. + vpshufb BSWAP_MASK, GHASH_ACC, GHASH_ACC + + // Do the last AES round and XOR the resulting keystream block with the + // GHASH accumulator to produce the full computed authentication tag. + // + // Reduce latency by taking advantage of the property vaesenclast(key, + // a) ^ b == vaesenclast(key ^ b, a). I.e., XOR GHASH_ACC into the last + // round key, instead of XOR'ing the final AES output with GHASH_ACC. + // + // enc_final then returns the computed auth tag, while dec_final + // compares it with the transmitted one and returns a bool. To compare + // the tags, dec_final XORs them together and uses vptest to check + // whether the result is all-zeroes. This should be constant-time. + // dec_final applies the vaesenclast optimization to this additional + // value XOR'd too, using vpternlogd to XOR the last round key, GHASH + // accumulator, and transmitted auth tag together in one instruction. +.if \enc + vpxor (%rax), GHASH_ACC, %xmm1 + vaesenclast %xmm1, %xmm0, GHASH_ACC + vmovdqu GHASH_ACC, (GHASH_ACC_PTR) +.else + vmovdqu (TAG), %xmm1 + vpternlogd $0x96, (%rax), GHASH_ACC, %xmm1 + vaesenclast %xmm1, %xmm0, %xmm0 + xor %eax, %eax + vmovdqu8 %xmm0, %xmm0{%k1}{z} // Truncate to TAGLEN bytes + vptest %xmm0, %xmm0 + sete %al +.endif + // No need for vzeroupper here, since only used xmm registers were used. 
+ RET +.endm + +_set_veclen 32 +SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_256) + _aes_gcm_precompute +SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_256) +SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_256) + _aes_gcm_update 1 +SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_256) +SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_256) + _aes_gcm_update 0 +SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_256) + +_set_veclen 64 +SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_512) + _aes_gcm_precompute +SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_512) +SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_512) + _aes_gcm_update 1 +SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_512) +SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_512) + _aes_gcm_update 0 +SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_512) + +// void aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, +// u8 ghash_acc[16], +// const u8 *aad, int aadlen); +// +// This function processes the AAD (Additional Authenticated Data) in GCM. +// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the +// data given by |aad| and |aadlen|. |key->ghash_key_powers| must have been +// initialized. On the first call, |ghash_acc| must be all zeroes. |aadlen| +// must be a multiple of 16, except on the last call where it can be any length. +// The caller must do any buffering needed to ensure this. +// +// AES-GCM is almost always used with small amounts of AAD, less than 32 bytes. +// Therefore, for AAD processing we currently only provide this implementation +// which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop. This +// keeps the code size down, and it enables some micro-optimizations, e.g. using +// VEX-coded instructions instead of EVEX-coded to save some instruction bytes. +// To optimize for large amounts of AAD, we could implement a 4x-wide loop and +// provide a version using 512-bit vectors, but that doesn't seem to be useful. +SYM_FUNC_START(aes_gcm_aad_update_vaes_avx10) + + // Function arguments + .set KEY, %rdi + .set GHASH_ACC_PTR, %rsi + .set AAD, %rdx + .set AADLEN, %ecx + .set AADLEN64, %rcx // Zero-extend AADLEN before using! + + // Additional local variables. + // %rax, %ymm0-%ymm3, and %k1 are used as temporary registers. + .set BSWAP_MASK, %ymm4 + .set GFPOLY, %ymm5 + .set GHASH_ACC, %ymm6 + .set GHASH_ACC_XMM, %xmm6 + .set H_POW1, %ymm7 + + // Load some constants. + vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK + vbroadcasti128 .Lgfpoly(%rip), GFPOLY + + // Load the GHASH accumulator. + vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM + + // Update GHASH with 32 bytes of AAD at a time. + // + // Pre-subtracting 32 from AADLEN saves an instruction from the loop and + // also ensures that at least one write always occurs to AADLEN, + // zero-extending it and allowing AADLEN64 to be used later. + sub $32, AADLEN + jl .Laad_loop_1x_done + vmovdqu8 OFFSETOFEND_H_POWERS-32(KEY), H_POW1 // [H^2, H^1] +.Laad_loop_1x: + vmovdqu (AAD), %ymm0 + vpshufb BSWAP_MASK, %ymm0, %ymm0 + vpxor %ymm0, GHASH_ACC, GHASH_ACC + _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + %ymm0, %ymm1, %ymm2 + vextracti128 $1, GHASH_ACC, %xmm0 + vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM + add $32, AAD + sub $32, AADLEN + jge .Laad_loop_1x +.Laad_loop_1x_done: + add $32, AADLEN + jz .Laad_done + + // Update GHASH with the remaining 1 <= AADLEN < 32 bytes of AAD. 
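+	// (bzhi builds a mask of AADLEN one bits for the zero-padding masked
+	// load below.  Rounding AADLEN up to a multiple of 16 and negating it
+	// then selects the right key powers: [H^2, H^1] if two blocks remain,
+	// or [H^1, zeroized padding block] if only one does.)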
+ mov $-1, %eax + bzhi AADLEN, %eax, %eax + kmovd %eax, %k1 + vmovdqu8 (AAD), %ymm0{%k1}{z} + neg AADLEN64 + and $~15, AADLEN64 // -round_up(AADLEN, 16) + vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1 + vpshufb BSWAP_MASK, %ymm0, %ymm0 + vpxor %ymm0, GHASH_ACC, GHASH_ACC + _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + %ymm0, %ymm1, %ymm2 + vextracti128 $1, GHASH_ACC, %xmm0 + vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM + +.Laad_done: + // Store the updated GHASH accumulator back to memory. + vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) + + vzeroupper // This is needed after using ymm or zmm registers. + RET +SYM_FUNC_END(aes_gcm_aad_update_vaes_avx10) + +SYM_FUNC_START(aes_gcm_enc_final_vaes_avx10) + _aes_gcm_final 1 +SYM_FUNC_END(aes_gcm_enc_final_vaes_avx10) +SYM_FUNC_START(aes_gcm_dec_final_vaes_avx10) + _aes_gcm_final 0 +SYM_FUNC_END(aes_gcm_dec_final_vaes_avx10) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index ef031655b2d3..c1cc01583103 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * Support for Intel AES-NI instructions. This file contains glue - * code, the real AES implementation is in intel-aes_asm.S. + * Support for AES-NI and VAES instructions. This file contains glue code. + * The real AES implementations are in aesni-intel_asm.S and other .S files. * * Copyright (C) 2008, Intel Corp. * Author: Huang Ying @@ -13,6 +13,8 @@ * Tadeusz Struk (tadeusz.struk@intel.com) * Aidan O'Mahony (aidan.o.mahony@intel.com) * Copyright (c) 2010, Intel Corporation. + * + * Copyright 2024 Google LLC */ #include @@ -1214,13 +1216,505 @@ DEFINE_XTS_ALG(aesni_avx, "xts-aes-aesni-avx", 500); DEFINE_XTS_ALG(vaes_avx2, "xts-aes-vaes-avx2", 600); DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700); DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800); -#endif + +/* The common part of the x86_64 AES-GCM key struct */ +struct aes_gcm_key { + /* Expanded AES key and the AES key length in bytes */ + struct crypto_aes_ctx aes_key; + + /* RFC4106 nonce (used only by the rfc4106 algorithms) */ + u32 rfc4106_nonce; +}; + +/* Key struct used by the VAES + AVX10 implementations of AES-GCM */ +struct aes_gcm_key_avx10 { + /* + * Common part of the key. The assembly code prefers 16-byte alignment + * for the round keys; we get this by them being located at the start of + * the struct and the whole struct being 64-byte aligned. + */ + struct aes_gcm_key base; + + /* + * Powers of the hash key H^16 through H^1. These are 128-bit values. + * They all have an extra factor of x^-1 and are byte-reversed. This + * array is aligned to a 64-byte boundary to make it naturally aligned + * for 512-bit loads, which can improve performance. (The assembly code + * doesn't *need* the alignment; this is just an optimization.) + */ + u64 h_powers[16][2] __aligned(64); + + /* Three padding blocks required by the assembly code */ + u64 padding[3][2]; +}; +#define AES_GCM_KEY_AVX10(key) \ + container_of((key), struct aes_gcm_key_avx10, base) +#define AES_GCM_KEY_AVX10_SIZE \ + (sizeof(struct aes_gcm_key_avx10) + (63 & ~(CRYPTO_MINALIGN - 1))) + +/* + * These flags are passed to the AES-GCM helper functions to specify the + * specific version of AES-GCM (RFC4106 or not), whether it's encryption or + * decryption, and which assembly functions should be called. 
Assembly + * functions are selected using flags instead of function pointers to avoid + * indirect calls (which are very expensive on x86) regardless of inlining. + */ +#define FLAG_RFC4106 BIT(0) +#define FLAG_ENC BIT(1) +#define FLAG_AVX10_512 BIT(2) + +static inline struct aes_gcm_key * +aes_gcm_key_get(struct crypto_aead *tfm, int flags) +{ + return PTR_ALIGN(crypto_aead_ctx(tfm), 64); +} + +asmlinkage void +aes_gcm_precompute_vaes_avx10_256(struct aes_gcm_key_avx10 *key); +asmlinkage void +aes_gcm_precompute_vaes_avx10_512(struct aes_gcm_key_avx10 *key); + +static void aes_gcm_precompute(struct aes_gcm_key *key, int flags) +{ + /* + * To make things a bit easier on the assembly side, the AVX10 + * implementations use the same key format. Therefore, a single + * function using 256-bit vectors would suffice here. However, it's + * straightforward to provide a 512-bit one because of how the assembly + * code is structured, and it works nicely because the total size of the + * key powers is a multiple of 512 bits. So we take advantage of that. + */ + if (flags & FLAG_AVX10_512) + aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key)); + else + aes_gcm_precompute_vaes_avx10_256(AES_GCM_KEY_AVX10(key)); +} + +asmlinkage void +aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, + u8 ghash_acc[16], const u8 *aad, int aadlen); + +static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16], + const u8 *aad, int aadlen, int flags) +{ + aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key), ghash_acc, + aad, aadlen); +} + +asmlinkage void +aes_gcm_enc_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); + +asmlinkage void +aes_gcm_dec_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_dec_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); + +/* __always_inline to optimize out the branches based on @flags */ +static __always_inline void +aes_gcm_update(const struct aes_gcm_key *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen, int flags) +{ + if (flags & FLAG_ENC) { + if (flags & FLAG_AVX10_512) + aes_gcm_enc_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + src, dst, datalen); + else + aes_gcm_enc_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + src, dst, datalen); + } else { + if (flags & FLAG_AVX10_512) + aes_gcm_dec_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + src, dst, datalen); + else + aes_gcm_dec_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + src, dst, datalen); + } +} + +asmlinkage void +aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen); + +/* __always_inline to optimize out the branches based on @flags */ +static __always_inline void +aes_gcm_enc_final(const struct aes_gcm_key *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, int flags) +{ + aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen); +} + 
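+/*
+ * The assembly function compares the expected and computed tags in constant
+ * time and only returns a bool, so callers just need to check the result;
+ * ignoring it would mean skipping authentication entirely.
+ */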
+asmlinkage bool __must_check +aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], const u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, + const u8 tag[16], int taglen); + +/* __always_inline to optimize out the branches based on @flags */ +static __always_inline bool __must_check +aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4], + u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, + u8 tag[16], int taglen, int flags) +{ + return aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen, + tag, taglen); +} + +/* + * This is the setkey function for the x86_64 implementations of AES-GCM. It + * saves the RFC4106 nonce if applicable, expands the AES key, and precomputes + * powers of the hash key. + * + * To comply with the crypto_aead API, this has to be usable in no-SIMD context. + * For that reason, this function includes a portable C implementation of the + * needed logic. However, the portable C implementation is very slow, taking + * about the same time as encrypting 37 KB of data. To be ready for users that + * may set a key even somewhat frequently, we therefore also include a SIMD + * assembly implementation, expanding the AES key using AES-NI and precomputing + * the hash key powers using PCLMULQDQ or VPCLMULQDQ. + */ +static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, + unsigned int keylen, int flags) +{ + struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags); + int err; + + if (flags & FLAG_RFC4106) { + if (keylen < 4) + return -EINVAL; + keylen -= 4; + key->rfc4106_nonce = get_unaligned_be32(raw_key + keylen); + } + + /* The assembly code assumes the following offsets. */ + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_enc) != 0); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_length) != 480); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, h_powers) != 512); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, padding) != 768); + + if (likely(crypto_simd_usable())) { + err = aes_check_keylen(keylen); + if (err) + return err; + kernel_fpu_begin(); + aesni_set_key(&key->aes_key, raw_key, keylen); + aes_gcm_precompute(key, flags); + kernel_fpu_end(); + } else { + static const u8 x_to_the_minus1[16] __aligned(__alignof__(be128)) = { + [0] = 0xc2, [15] = 1 + }; + struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key); + be128 h1 = {}; + be128 h; + int i; + + err = aes_expandkey(&key->aes_key, raw_key, keylen); + if (err) + return err; + + /* Encrypt the all-zeroes block to get the hash key H^1 */ + aes_encrypt(&key->aes_key, (u8 *)&h1, (u8 *)&h1); + + /* Compute H^1 * x^-1 */ + h = h1; + gf128mul_lle(&h, (const be128 *)x_to_the_minus1); + + /* Compute the needed key powers */ + for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { + k->h_powers[i][0] = be64_to_cpu(h.b); + k->h_powers[i][1] = be64_to_cpu(h.a); + gf128mul_lle(&h, &h1); + } + memset(k->padding, 0, sizeof(k->padding)); + } + return 0; +} + +/* + * Initialize @ghash_acc, then pass all @assoclen bytes of associated data + * (a.k.a. additional authenticated data) from @sg_src through the GHASH update + * assembly function. kernel_fpu_begin() must have already been called. 
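+ * Between scatterlist elements this may briefly drop and re-acquire the FPU
+ * (kernel_fpu_end() then kernel_fpu_begin()) if rescheduling is needed.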
+ */ +static void gcm_process_assoc(const struct aes_gcm_key *key, u8 ghash_acc[16], + struct scatterlist *sg_src, unsigned int assoclen, + int flags) +{ + struct scatter_walk walk; + /* + * The assembly function requires that the length of any non-last + * segment of associated data be a multiple of 16 bytes, so this + * function does the buffering needed to achieve that. + */ + unsigned int pos = 0; + u8 buf[16]; + + memset(ghash_acc, 0, 16); + scatterwalk_start(&walk, sg_src); + + while (assoclen) { + unsigned int len_this_page = scatterwalk_clamp(&walk, assoclen); + void *mapped = scatterwalk_map(&walk); + const void *src = mapped; + unsigned int len; + + assoclen -= len_this_page; + scatterwalk_advance(&walk, len_this_page); + if (unlikely(pos)) { + len = min(len_this_page, 16 - pos); + memcpy(&buf[pos], src, len); + pos += len; + src += len; + len_this_page -= len; + if (pos < 16) + goto next; + aes_gcm_aad_update(key, ghash_acc, buf, 16, flags); + pos = 0; + } + len = len_this_page; + if (unlikely(assoclen)) /* Not the last segment yet? */ + len = round_down(len, 16); + aes_gcm_aad_update(key, ghash_acc, src, len, flags); + src += len; + len_this_page -= len; + if (unlikely(len_this_page)) { + memcpy(buf, src, len_this_page); + pos = len_this_page; + } +next: + scatterwalk_unmap(mapped); + scatterwalk_pagedone(&walk, 0, assoclen); + if (need_resched()) { + kernel_fpu_end(); + kernel_fpu_begin(); + } + } + if (unlikely(pos)) + aes_gcm_aad_update(key, ghash_acc, buf, pos, flags); +} + + +/* __always_inline to optimize out the branches based on @flags */ +static __always_inline int +gcm_crypt(struct aead_request *req, int flags) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + const struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags); + unsigned int assoclen = req->assoclen; + struct skcipher_walk walk; + unsigned int nbytes; + u8 ghash_acc[16]; /* GHASH accumulator */ + u32 le_ctr[4]; /* Counter in little-endian format */ + int taglen; + int err; + + /* Initialize the counter and determine the associated data length. */ + le_ctr[0] = 2; + if (flags & FLAG_RFC4106) { + if (unlikely(assoclen != 16 && assoclen != 20)) + return -EINVAL; + assoclen -= 8; + le_ctr[1] = get_unaligned_be32(req->iv + 4); + le_ctr[2] = get_unaligned_be32(req->iv + 0); + le_ctr[3] = key->rfc4106_nonce; /* already byte-swapped */ + } else { + le_ctr[1] = get_unaligned_be32(req->iv + 8); + le_ctr[2] = get_unaligned_be32(req->iv + 4); + le_ctr[3] = get_unaligned_be32(req->iv + 0); + } + + /* Begin walking through the plaintext or ciphertext. */ + if (flags & FLAG_ENC) + err = skcipher_walk_aead_encrypt(&walk, req, false); + else + err = skcipher_walk_aead_decrypt(&walk, req, false); + + /* + * Since the AES-GCM assembly code requires that at least three assembly + * functions be called to process any message (this is needed to support + * incremental updates cleanly), to reduce overhead we try to do all + * three calls in the same kernel FPU section if possible. We close the + * section and start a new one if there are multiple data segments or if + * rescheduling is needed while processing the associated data. + */ + kernel_fpu_begin(); + + /* Pass the associated data through GHASH. */ + gcm_process_assoc(key, ghash_acc, req->src, assoclen, flags); + + /* En/decrypt the data and pass the ciphertext through GHASH. */ + while ((nbytes = walk.nbytes) != 0) { + if (unlikely(nbytes < walk.total)) { + /* + * Non-last segment. 
In this case, the assembly + * function requires that the length be a multiple of 16 + * (AES_BLOCK_SIZE) bytes. The needed buffering of up + * to 16 bytes is handled by the skcipher_walk. Here we + * just need to round down to a multiple of 16. + */ + nbytes = round_down(nbytes, AES_BLOCK_SIZE); + aes_gcm_update(key, le_ctr, ghash_acc, + walk.src.virt.addr, walk.dst.virt.addr, + nbytes, flags); + le_ctr[0] += nbytes / AES_BLOCK_SIZE; + kernel_fpu_end(); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + kernel_fpu_begin(); + } else { + /* Last segment: process all remaining data. */ + aes_gcm_update(key, le_ctr, ghash_acc, + walk.src.virt.addr, walk.dst.virt.addr, + nbytes, flags); + err = skcipher_walk_done(&walk, 0); + /* + * The low word of the counter isn't used by the + * finalize, so there's no need to increment it here. + */ + } + } + if (err) + goto out; + + /* Finalize */ + taglen = crypto_aead_authsize(tfm); + if (flags & FLAG_ENC) { + /* Finish computing the auth tag. */ + aes_gcm_enc_final(key, le_ctr, ghash_acc, assoclen, + req->cryptlen, flags); + + /* Store the computed auth tag in the dst scatterlist. */ + scatterwalk_map_and_copy(ghash_acc, req->dst, req->assoclen + + req->cryptlen, taglen, 1); + } else { + unsigned int datalen = req->cryptlen - taglen; + u8 tag[16]; + + /* Get the transmitted auth tag from the src scatterlist. */ + scatterwalk_map_and_copy(tag, req->src, req->assoclen + datalen, + taglen, 0); + /* + * Finish computing the auth tag and compare it to the + * transmitted one. The assembly function does the actual tag + * comparison. Here, just check the boolean result. + */ + if (!aes_gcm_dec_final(key, le_ctr, ghash_acc, assoclen, + datalen, tag, taglen, flags)) + err = -EBADMSG; + } +out: + kernel_fpu_end(); + return err; +} + +#define DEFINE_GCM_ALGS(suffix, flags, generic_driver_name, rfc_driver_name, \ + ctxsize, priority) \ + \ +static int gcm_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \ + unsigned int keylen) \ +{ \ + return gcm_setkey(tfm, raw_key, keylen, (flags)); \ +} \ + \ +static int gcm_encrypt_##suffix(struct aead_request *req) \ +{ \ + return gcm_crypt(req, (flags) | FLAG_ENC); \ +} \ + \ +static int gcm_decrypt_##suffix(struct aead_request *req) \ +{ \ + return gcm_crypt(req, (flags)); \ +} \ + \ +static int rfc4106_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \ + unsigned int keylen) \ +{ \ + return gcm_setkey(tfm, raw_key, keylen, (flags) | FLAG_RFC4106); \ +} \ + \ +static int rfc4106_encrypt_##suffix(struct aead_request *req) \ +{ \ + return gcm_crypt(req, (flags) | FLAG_RFC4106 | FLAG_ENC); \ +} \ + \ +static int rfc4106_decrypt_##suffix(struct aead_request *req) \ +{ \ + return gcm_crypt(req, (flags) | FLAG_RFC4106); \ +} \ + \ +static struct aead_alg aes_gcm_algs_##suffix[] = { { \ + .setkey = gcm_setkey_##suffix, \ + .setauthsize = generic_gcmaes_set_authsize, \ + .encrypt = gcm_encrypt_##suffix, \ + .decrypt = gcm_decrypt_##suffix, \ + .ivsize = GCM_AES_IV_SIZE, \ + .chunksize = AES_BLOCK_SIZE, \ + .maxauthsize = 16, \ + .base = { \ + .cra_name = "__gcm(aes)", \ + .cra_driver_name = "__" generic_driver_name, \ + .cra_priority = (priority), \ + .cra_flags = CRYPTO_ALG_INTERNAL, \ + .cra_blocksize = 1, \ + .cra_ctxsize = (ctxsize), \ + .cra_module = THIS_MODULE, \ + }, \ +}, { \ + .setkey = rfc4106_setkey_##suffix, \ + .setauthsize = common_rfc4106_set_authsize, \ + .encrypt = rfc4106_encrypt_##suffix, \ + .decrypt = rfc4106_decrypt_##suffix, \ + .ivsize = GCM_RFC4106_IV_SIZE, \ + .chunksize = 
AES_BLOCK_SIZE, \ + .maxauthsize = 16, \ + .base = { \ + .cra_name = "__rfc4106(gcm(aes))", \ + .cra_driver_name = "__" rfc_driver_name, \ + .cra_priority = (priority), \ + .cra_flags = CRYPTO_ALG_INTERNAL, \ + .cra_blocksize = 1, \ + .cra_ctxsize = (ctxsize), \ + .cra_module = THIS_MODULE, \ + }, \ +} }; \ + \ +static struct simd_aead_alg *aes_gcm_simdalgs_##suffix[2] \ + +/* aes_gcm_algs_vaes_avx10_256 */ +DEFINE_GCM_ALGS(vaes_avx10_256, 0, + "generic-gcm-vaes-avx10_256", "rfc4106-gcm-vaes-avx10_256", + AES_GCM_KEY_AVX10_SIZE, 700); + +/* aes_gcm_algs_vaes_avx10_512 */ +DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512, + "generic-gcm-vaes-avx10_512", "rfc4106-gcm-vaes-avx10_512", + AES_GCM_KEY_AVX10_SIZE, 800); +#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ /* * This is a list of CPU models that are known to suffer from downclocking when - * zmm registers (512-bit vectors) are used. On these CPUs, the AES-XTS - * implementation with zmm registers won't be used by default. An - * implementation with ymm registers (256-bit vectors) will be used instead. + * zmm registers (512-bit vectors) are used. On these CPUs, the AES mode + * implementations with zmm registers won't be used by default. Implementations + * with ymm registers (256-bit vectors) will be used by default instead. */ static const struct x86_cpu_id zmm_exclusion_list[] = { X86_MATCH_VFM(INTEL_SKYLAKE_X, 0), @@ -1236,7 +1730,7 @@ static const struct x86_cpu_id zmm_exclusion_list[] = { {}, }; -static int __init register_xts_algs(void) +static int __init register_avx_algs(void) { int err; @@ -1269,19 +1763,34 @@ static int __init register_xts_algs(void) &aes_xts_simdalg_vaes_avx10_256); if (err) return err; + err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_256, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256), + aes_gcm_simdalgs_vaes_avx10_256); + if (err) + return err; + + if (x86_match_cpu(zmm_exclusion_list)) { + int i; - if (x86_match_cpu(zmm_exclusion_list)) aes_xts_alg_vaes_avx10_512.base.cra_priority = 1; + for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++) + aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1; + } err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_512, 1, &aes_xts_simdalg_vaes_avx10_512); if (err) return err; + err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_512, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512), + aes_gcm_simdalgs_vaes_avx10_512); + if (err) + return err; #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ return 0; } -static void unregister_xts_algs(void) +static void unregister_avx_algs(void) { if (aes_xts_simdalg_aesni_avx) simd_unregister_skciphers(&aes_xts_alg_aesni_avx, 1, @@ -1293,18 +1802,26 @@ static void unregister_xts_algs(void) if (aes_xts_simdalg_vaes_avx10_256) simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_256, 1, &aes_xts_simdalg_vaes_avx10_256); + if (aes_gcm_simdalgs_vaes_avx10_256[0]) + simd_unregister_aeads(aes_gcm_algs_vaes_avx10_256, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256), + aes_gcm_simdalgs_vaes_avx10_256); if (aes_xts_simdalg_vaes_avx10_512) simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_512, 1, &aes_xts_simdalg_vaes_avx10_512); + if (aes_gcm_simdalgs_vaes_avx10_512[0]) + simd_unregister_aeads(aes_gcm_algs_vaes_avx10_512, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512), + aes_gcm_simdalgs_vaes_avx10_512); #endif } #else /* CONFIG_X86_64 */ -static int __init register_xts_algs(void) +static int __init register_avx_algs(void) { return 0; } -static void unregister_xts_algs(void) +static void unregister_avx_algs(void) { } #endif /* 
!CONFIG_X86_64 */ @@ -1447,14 +1964,14 @@ static int __init aesni_init(void) goto unregister_aeads; #endif /* CONFIG_X86_64 */ - err = register_xts_algs(); + err = register_avx_algs(); if (err) - goto unregister_xts; + goto unregister_avx; return 0; -unregister_xts: - unregister_xts_algs(); +unregister_avx: + unregister_avx_algs(); #ifdef CONFIG_X86_64 if (aesni_simd_xctr) simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); @@ -1482,7 +1999,7 @@ static void __exit aesni_exit(void) if (boot_cpu_has(X86_FEATURE_AVX)) simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); #endif /* CONFIG_X86_64 */ - unregister_xts_algs(); + unregister_avx_algs(); } late_initcall(aesni_init); -- cgit From e6e758fa64438082e48272db19ad74ed4fb98938 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 2 Jun 2024 15:22:20 -0700 Subject: crypto: x86/aes-gcm - rewrite the AES-NI optimized AES-GCM Rewrite the AES-NI implementations of AES-GCM, taking advantage of things I learned while writing the VAES-AVX10 implementations. This is a complete rewrite that reduces the AES-NI GCM source code size by about 70% and the binary code size by about 95%, while not regressing performance and in fact improving it significantly in many cases. The following summarizes the state before this patch: - The aesni-intel module registered algorithms "generic-gcm-aesni" and "rfc4106-gcm-aesni" with the crypto API that actually delegated to one of three underlying implementations according to the CPU capabilities detected at runtime: AES-NI, AES-NI + AVX, or AES-NI + AVX2. - The AES-NI + AVX and AES-NI + AVX2 assembly code was in aesni-intel_avx-x86_64.S and consisted of 2804 lines of source and 257 KB of binary. This massive binary size was not really appropriate, and depending on the kconfig it could take up over 1% the size of the entire vmlinux. The main loops did 8 blocks per iteration. The AVX code minimized the use of carryless multiplication whereas the AVX2 code did not. The "AVX2" code did not actually use AVX2; the check for AVX2 was really a check for Intel Haswell or later to detect support for fast carryless multiplication. The long source length was caused by factors such as significant code duplication. - The AES-NI only assembly code was in aesni-intel_asm.S and consisted of 1501 lines of source and 15 KB of binary. The main loops did 4 blocks per iteration and minimized the use of carryless multiplication by using Karatsuba multiplication and a multiplication-less reduction. - The assembly code was contributed in 2010-2013. Maintenance has been sporadic and most design choices haven't been revisited. - The assembly function prototypes and the corresponding glue code were separate from and were not consistent with the new VAES-AVX10 code I recently added. The older code had several issues such as not precomputing the GHASH key powers, which hurt performance. This rewrite achieves the following goals: - Much shorter source and binary sizes. The assembly source shrinks from 4300 lines to 1130 lines, and it produces about 9 KB of binary instead of 272 KB. This is achieved via a better designed AES-GCM implementation that doesn't excessively unroll the code and instead prioritizes the parts that really matter. Sharing the C glue code with the VAES-AVX10 implementations also saves 250 lines of C source. - Improve performance on most (possibly all) CPUs on which this code runs, for most (possibly all) message lengths. Benchmark results are given in Tables 1 and 2 below. 
- Use the same function prototypes and glue code as the new VAES-AVX10 algorithms. This fixes some issues with the integration of the assembly and results in some significant performance improvements, primarily on short messages. Also, the AVX and non-AVX implementations are now registered as separate algorithms with the crypto API, which makes them both testable by the self-tests. - Keep support for AES-NI without AVX (for Westmere, Silvermont, Goldmont, and Tremont), but unify the source code with AES-NI + AVX. Since 256-bit vectors cannot be used without VAES anyway, this is made feasible by just using the non-VEX coded form of most instructions. - Use a unified approach where the main loop does 8 blocks per iteration and uses Karatsuba multiplication to save one pclmulqdq per block but does not use the multiplication-less reduction. This strikes a good balance across the range of CPUs on which this code runs. - Don't spam the kernel log with an informational message on every boot. The following tables summarize the improvement in AES-GCM throughput on various CPU microarchitectures as a result of this patch: Table 1: AES-256-GCM encryption throughput improvement, CPU microarchitecture vs. message length in bytes: | 16384 | 4096 | 4095 | 1420 | 512 | 500 | -------------------+-------+-------+-------+-------+-------+-------+ Intel Broadwell | 2% | 8% | 11% | 18% | 31% | 26% | Intel Skylake | 1% | 4% | 7% | 12% | 26% | 19% | Intel Cascade Lake | 3% | 8% | 10% | 18% | 33% | 24% | AMD Zen 1 | 6% | 12% | 6% | 15% | 27% | 24% | AMD Zen 2 | 8% | 13% | 13% | 19% | 26% | 28% | AMD Zen 3 | 8% | 14% | 13% | 19% | 26% | 25% | | 300 | 200 | 64 | 63 | 16 | -------------------+-------+-------+-------+-------+-------+ Intel Broadwell | 35% | 29% | 45% | 55% | 54% | Intel Skylake | 25% | 19% | 28% | 33% | 27% | Intel Cascade Lake | 36% | 28% | 39% | 49% | 54% | AMD Zen 1 | 27% | 22% | 23% | 29% | 26% | AMD Zen 2 | 32% | 24% | 22% | 25% | 31% | AMD Zen 3 | 30% | 24% | 22% | 23% | 26% | Table 2: AES-256-GCM decryption throughput improvement, CPU microarchitecture vs. message length in bytes: | 16384 | 4096 | 4095 | 1420 | 512 | 500 | -------------------+-------+-------+-------+-------+-------+-------+ Intel Broadwell | 3% | 8% | 11% | 19% | 32% | 28% | Intel Skylake | 3% | 4% | 7% | 13% | 28% | 27% | Intel Cascade Lake | 3% | 9% | 11% | 19% | 33% | 28% | AMD Zen 1 | 15% | 18% | 14% | 20% | 36% | 33% | AMD Zen 2 | 9% | 16% | 13% | 21% | 26% | 27% | AMD Zen 3 | 8% | 15% | 12% | 18% | 23% | 23% | | 300 | 200 | 64 | 63 | 16 | -------------------+-------+-------+-------+-------+-------+ Intel Broadwell | 36% | 31% | 40% | 51% | 53% | Intel Skylake | 28% | 21% | 23% | 30% | 30% | Intel Cascade Lake | 36% | 29% | 36% | 47% | 53% | AMD Zen 1 | 35% | 31% | 32% | 35% | 36% | AMD Zen 2 | 31% | 30% | 27% | 38% | 30% | AMD Zen 3 | 27% | 23% | 24% | 32% | 26% | The above numbers are percentage improvements in single-thread throughput, so e.g. an increase from 3000 MB/s to 3300 MB/s would be listed as 10%. They were collected by directly measuring the Linux crypto API performance using a custom kernel module. Note that indirect benchmarks (e.g. 'cryptsetup benchmark' or benchmarking dm-crypt I/O) include more overhead and won't see quite as much of a difference. All these benchmarks used an associated data length of 16 bytes. Note that AES-GCM is almost always used with short associated data lengths. 
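For anyone who wants to reproduce these measurements, the rough shape of
such a module is sketched below.  This is only an illustrative sketch of
driving the kernel's AEAD API ("gcm(aes)") on a preallocated buffer; it
is not the actual benchmark module used for the tables above, and the
module and function names in it are made up for the example:

    #include <crypto/aead.h>
    #include <linux/err.h>
    #include <linux/ktime.h>
    #include <linux/module.h>
    #include <linux/scatterlist.h>
    #include <linux/slab.h>

    /* Illustrative only: error handling and result reporting are minimal. */
    static int __init gcm_bench_init(void)
    {
            const unsigned int len = 16384, assoclen = 16, niter = 10000;
            u8 key[32] = {}, iv[12] = {};
            struct crypto_aead *tfm;
            struct aead_request *req = NULL;
            struct scatterlist sg;
            DECLARE_CRYPTO_WAIT(wait);
            ktime_t start;
            s64 elapsed_us;
            unsigned int i;
            u8 *buf;
            int err;

            tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);
            buf = kzalloc(assoclen + len + 16, GFP_KERNEL); /* AAD + data + tag */
            req = aead_request_alloc(tfm, GFP_KERNEL);
            err = crypto_aead_setkey(tfm, key, sizeof(key));
            if (!err)
                    err = crypto_aead_setauthsize(tfm, 16);
            if (err || !buf || !req)
                    goto out;

            sg_init_one(&sg, buf, assoclen + len + 16);
            aead_request_set_callback(req, 0, crypto_req_done, &wait);
            aead_request_set_ad(req, assoclen);
            aead_request_set_crypt(req, &sg, &sg, len, iv);

            start = ktime_get();
            for (i = 0; i < niter && !err; i++)
                    err = crypto_wait_req(crypto_aead_encrypt(req), &wait);
            elapsed_us = max_t(s64, 1, ktime_us_delta(ktime_get(), start));
            pr_info("AES-256-GCM encrypt (%u bytes): ~%lld MB/s\n",
                    len, (s64)len * niter / elapsed_us);
    out:
            aead_request_free(req);
            crypto_free_aead(tfm);
            kfree(buf);
            return err ?: -EAGAIN; /* never stay loaded; this is just a benchmark */
    }
    module_init(gcm_bench_init);
    MODULE_LICENSE("GPL");

A real benchmark would additionally pin the CPU frequency, average
several runs, and cover decryption and the other message lengths shown
in the tables.
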
I didn't test Intel CPUs before Broadwell, AMD CPUs before Zen 1, or Intel low-power CPUs, as these weren't readily available to me. However, based on the design of the new code and the available information about these other CPU microarchitectures, I wouldn't expect any significant regressions, and there's a good chance performance is improved just as it is above. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 5 +- arch/x86/crypto/aes-gcm-aesni-x86_64.S | 1128 ++++++++++++ arch/x86/crypto/aesni-intel_asm.S | 1503 +--------------- arch/x86/crypto/aesni-intel_avx-x86_64.S | 2804 ------------------------------ arch/x86/crypto/aesni-intel_glue.c | 768 +++----- 5 files changed, 1390 insertions(+), 4818 deletions(-) create mode 100644 arch/x86/crypto/aes-gcm-aesni-x86_64.S delete mode 100644 arch/x86/crypto/aesni-intel_avx-x86_64.S diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index a2a536b690fa..53b4a277809e 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -48,8 +48,9 @@ chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o -aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o \ - aes_ctrby8_avx-x86_64.o aes-xts-avx-x86_64.o +aesni-intel-$(CONFIG_64BIT) += aes_ctrby8_avx-x86_64.o \ + aes-gcm-aesni-x86_64.o \ + aes-xts-avx-x86_64.o ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy) aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o endif diff --git a/arch/x86/crypto/aes-gcm-aesni-x86_64.S b/arch/x86/crypto/aes-gcm-aesni-x86_64.S new file mode 100644 index 000000000000..45940e2883a0 --- /dev/null +++ b/arch/x86/crypto/aes-gcm-aesni-x86_64.S @@ -0,0 +1,1128 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// AES-NI optimized AES-GCM for x86_64 +// +// Copyright 2024 Google LLC +// +// Author: Eric Biggers +// +//------------------------------------------------------------------------------ +// +// This file is dual-licensed, meaning that you can use it under your choice of +// either of the following two licenses: +// +// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// or +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +//------------------------------------------------------------------------------ +// +// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that +// support the original set of AES instructions, i.e. AES-NI. Two +// implementations are provided, one that uses AVX and one that doesn't. They +// are very similar, being generated by the same macros. The only difference is +// that the AVX implementation takes advantage of VEX-coded instructions in some +// places to avoid some 'movdqu' and 'movdqa' instructions. The AVX +// implementation does *not* use 256-bit vectors, as AES is not supported on +// 256-bit vectors until the VAES feature (which this file doesn't target). +// +// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1 +// for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems +// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.) +// +// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is +// more thoroughly commented. This file has the following notable changes: +// +// - The vector length is fixed at 128-bit, i.e. xmm registers. This means +// there is only one AES block (and GHASH block) per register. +// +// - Without AVX512 / AVX10, only 16 SIMD registers are available instead of +// 32. We work around this by being much more careful about using +// registers, relying heavily on loads to load values as they are needed. +// +// - Masking is not available either. We work around this by implementing +// partial block loads and stores using overlapping scalar loads and stores +// combined with shifts and SSE4.1 insertion and extraction instructions. +// +// - The main loop is organized differently due to the different design +// constraints. First, with just one AES block per SIMD register, on some +// CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore +// do an 8-register wide loop. Considering that and the fact that we have +// just 16 SIMD registers to work with, it's not feasible to cache AES +// round keys and GHASH key powers in registers across loop iterations. +// That's not ideal, but also not actually that bad, since loads can run in +// parallel with other instructions. Significantly, this also makes it +// possible to roll up the inner loops, relying on hardware loop unrolling +// instead of software loop unrolling, greatly reducing code size. +// +// - We implement the GHASH multiplications in the main loop using Karatsuba +// multiplication instead of schoolbook multiplication. This saves one +// pclmulqdq instruction per block, at the cost of one 64-bit load, one +// pshufd, and 0.25 pxors per block. (This is without the three-argument +// XOR support that would be provided by AVX512 / AVX10, which would be +// more beneficial to schoolbook than Karatsuba.) 
+// +// As a rough approximation, we can assume that Karatsuba multiplication is +// faster than schoolbook multiplication in this context if one pshufd and +// 0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit +// load is "free" due to running in parallel with arithmetic instructions.) +// This is true on AMD CPUs, including all that support pclmulqdq up to at +// least Zen 3. It's also true on older Intel CPUs: Westmere through +// Haswell on the Core side, and Silvermont through Goldmont Plus on the +// low-power side. On some of these CPUs, pclmulqdq is quite slow, and the +// benefit of Karatsuba should be substantial. On newer Intel CPUs, +// schoolbook multiplication should be faster, but only marginally. +// +// Not all these CPUs were available to be tested. However, benchmarks on +// available CPUs suggest that this approximation is plausible. Switching +// to Karatsuba showed negligible change (< 1%) on Intel Broadwell, +// Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%. +// Considering that and the fact that Karatsuba should be even more +// beneficial on older Intel CPUs, it seems like the right choice here. +// +// An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be +// saved by using a multiplication-less reduction method. We don't do that +// because it would require a large number of shift and xor instructions, +// making it less worthwhile and likely harmful on newer CPUs. +// +// It does make sense to sometimes use a different reduction optimization +// that saves a pclmulqdq, though: precompute the hash key times x^64, and +// multiply the low half of the data block by the hash key with the extra +// factor of x^64. This eliminates one step of the reduction. However, +// this is incompatible with Karatsuba multiplication. Therefore, for +// multi-block processing we use Karatsuba multiplication with a regular +// reduction. For single-block processing, we use the x^64 optimization. + +#include + +.section .rodata +.p2align 4 +.Lbswap_mask: + .octa 0x000102030405060708090a0b0c0d0e0f +.Lgfpoly: + .quad 0xc200000000000000 +.Lone: + .quad 1 +.Lgfpoly_and_internal_carrybit: + .octa 0xc2000000000000010000000000000001 + // Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of + // 'len' 0xff bytes and the rest zeroes. +.Lzeropad_mask: + .octa 0xffffffffffffffffffffffffffffffff + .octa 0 + +// Offsets in struct aes_gcm_key_aesni +#define OFFSETOF_AESKEYLEN 480 +#define OFFSETOF_H_POWERS 496 +#define OFFSETOF_H_POWERS_XORED 624 +#define OFFSETOF_H_TIMES_X64 688 + +.text + +// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback +// assumes that all operands are distinct and that any mem operand is aligned. +.macro _vpclmulqdq imm, src1, src2, dst +.if USE_AVX + vpclmulqdq \imm, \src1, \src2, \dst +.else + movdqa \src2, \dst + pclmulqdq \imm, \src1, \dst +.endif +.endm + +// Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes +// that all operands are distinct and that any mem operand is aligned. +.macro _vpshufb src1, src2, dst +.if USE_AVX + vpshufb \src1, \src2, \dst +.else + movdqa \src2, \dst + pshufb \src1, \dst +.endif +.endm + +// Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that +// all operands are distinct. +.macro _vpand src1, src2, dst +.if USE_AVX + vpand \src1, \src2, \dst +.else + movdqu \src1, \dst + pand \src2, \dst +.endif +.endm + +// XOR the unaligned memory operand \mem into the xmm register \reg. 
\tmp must +// be a temporary xmm register. +.macro _xor_mem_to_reg mem, reg, tmp +.if USE_AVX + vpxor \mem, \reg, \reg +.else + movdqu \mem, \tmp + pxor \tmp, \reg +.endif +.endm + +// Test the unaligned memory operand \mem against the xmm register \reg. \tmp +// must be a temporary xmm register. +.macro _test_mem mem, reg, tmp +.if USE_AVX + vptest \mem, \reg +.else + movdqu \mem, \tmp + ptest \tmp, \reg +.endif +.endm + +// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst +// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}. +.macro _load_partial_block src, dst, tmp64, tmp32 + sub $8, %ecx // LEN - 8 + jle .Lle8\@ + + // Load 9 <= LEN <= 15 bytes. + movq (\src), \dst // Load first 8 bytes + mov (\src, %rcx), %rax // Load last 8 bytes + neg %ecx + shl $3, %ecx + shr %cl, %rax // Discard overlapping bytes + pinsrq $1, %rax, \dst + jmp .Ldone\@ + +.Lle8\@: + add $4, %ecx // LEN - 4 + jl .Llt4\@ + + // Load 4 <= LEN <= 8 bytes. + mov (\src), %eax // Load first 4 bytes + mov (\src, %rcx), \tmp32 // Load last 4 bytes + jmp .Lcombine\@ + +.Llt4\@: + // Load 1 <= LEN <= 3 bytes. + add $2, %ecx // LEN - 2 + movzbl (\src), %eax // Load first byte + jl .Lmovq\@ + movzwl (\src, %rcx), \tmp32 // Load last 2 bytes +.Lcombine\@: + shl $3, %ecx + shl %cl, \tmp64 + or \tmp64, %rax // Combine the two parts +.Lmovq\@: + movq %rax, \dst +.Ldone\@: +.endm + +// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst. +// Clobbers %rax, %rcx, and %rsi. +.macro _store_partial_block src, dst + sub $8, %ecx // LEN - 8 + jl .Llt8\@ + + // Store 8 <= LEN <= 15 bytes. + pextrq $1, \src, %rax + mov %ecx, %esi + shl $3, %ecx + ror %cl, %rax + mov %rax, (\dst, %rsi) // Store last LEN - 8 bytes + movq \src, (\dst) // Store first 8 bytes + jmp .Ldone\@ + +.Llt8\@: + add $4, %ecx // LEN - 4 + jl .Llt4\@ + + // Store 4 <= LEN <= 7 bytes. + pextrd $1, \src, %eax + mov %ecx, %esi + shl $3, %ecx + ror %cl, %eax + mov %eax, (\dst, %rsi) // Store last LEN - 4 bytes + movd \src, (\dst) // Store first 4 bytes + jmp .Ldone\@ + +.Llt4\@: + // Store 1 <= LEN <= 3 bytes. + pextrb $0, \src, 0(\dst) + cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2? + jl .Ldone\@ + pextrb $1, \src, 1(\dst) + je .Ldone\@ + pextrb $2, \src, 2(\dst) +.Ldone\@: +.endm + +// Do one step of GHASH-multiplying \a by \b and storing the reduced product in +// \b. To complete all steps, this must be invoked with \i=0 through \i=9. +// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the +// .Lgfpoly constant, and \t0-\t1 must be temporary registers. +.macro _ghash_mul_step i, a, a_times_x64, b, gfpoly, t0, t1 + + // MI = (a_L * b_H) + ((a*x^64)_L * b_L) +.if \i == 0 + _vpclmulqdq $0x01, \a, \b, \t0 +.elseif \i == 1 + _vpclmulqdq $0x00, \a_times_x64, \b, \t1 +.elseif \i == 2 + pxor \t1, \t0 + + // HI = (a_H * b_H) + ((a*x^64)_H * b_L) +.elseif \i == 3 + _vpclmulqdq $0x11, \a, \b, \t1 +.elseif \i == 4 + pclmulqdq $0x10, \a_times_x64, \b +.elseif \i == 5 + pxor \t1, \b +.elseif \i == 6 + + // Fold MI into HI. + pshufd $0x4e, \t0, \t1 // Swap halves of MI +.elseif \i == 7 + pclmulqdq $0x00, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) +.elseif \i == 8 + pxor \t1, \b +.elseif \i == 9 + pxor \t0, \b +.endif +.endm + +// GHASH-multiply \a by \b and store the reduced product in \b. +// See _ghash_mul_step for details. 
+.macro _ghash_mul a, a_times_x64, b, gfpoly, t0, t1 +.irp i, 0,1,2,3,4,5,6,7,8,9 + _ghash_mul_step \i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1 +.endr +.endm + +// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi. +// This does Karatsuba multiplication and must be paired with _ghash_reduce. On +// the first call, \lo, \mi, and \hi must be zero. \a_xored must contain the +// two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered. +.macro _ghash_mul_noreduce a, a_xored, b, lo, mi, hi, t0 + + // LO += a_L * b_L + _vpclmulqdq $0x00, \a, \b, \t0 + pxor \t0, \lo + + // b_L + b_H + pshufd $0x4e, \b, \t0 + pxor \b, \t0 + + // HI += a_H * b_H + pclmulqdq $0x11, \a, \b + pxor \b, \hi + + // MI += (a_L + a_H) * (b_L + b_H) + pclmulqdq $0x00, \a_xored, \t0 + pxor \t0, \mi +.endm + +// Reduce the product from \lo, \mi, and \hi, and store the result in \dst. +// This assumes that _ghash_mul_noreduce was used. +.macro _ghash_reduce lo, mi, hi, dst, t0 + + movq .Lgfpoly(%rip), \t0 + + // MI += LO + HI (needed because we used Karatsuba multiplication) + pxor \lo, \mi + pxor \hi, \mi + + // Fold LO into MI. + pshufd $0x4e, \lo, \dst + pclmulqdq $0x00, \t0, \lo + pxor \dst, \mi + pxor \lo, \mi + + // Fold MI into HI. + pshufd $0x4e, \mi, \dst + pclmulqdq $0x00, \t0, \mi + pxor \hi, \dst + pxor \mi, \dst +.endm + +// Do the first step of the GHASH update of a set of 8 ciphertext blocks. +// +// The whole GHASH update does: +// +// GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 + +// blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1 +// +// This macro just does the first step: it does the unreduced multiplication +// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm +// registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the +// inner block counter in %rax, which is a value that counts up by 8 for each +// block in the set of 8 and is used later to index by 8*blknum and 16*blknum. +// +// To reduce the number of pclmulqdq instructions required, both this macro and +// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook +// multiplication. See the file comment for more details about this choice. +// +// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if +// encrypting, or SRC if decrypting. They also expect the precomputed hash key +// powers H^i and their XOR'd-together halves to be available in the struct +// pointed to by KEY. Both macros clobber TMP[0-2]. +.macro _ghash_update_begin_8x enc + + // Initialize the inner block counter. + xor %eax, %eax + + // Load the highest hash key power, H^8. + movdqa OFFSETOF_H_POWERS(KEY), TMP0 + + // Load the first ciphertext block and byte-reflect it. +.if \enc + movdqu (DST), TMP1 +.else + movdqu (SRC), TMP1 +.endif + pshufb BSWAP_MASK, TMP1 + + // Add the GHASH accumulator to the ciphertext block to get the block + // 'b' that needs to be multiplied with the hash key power 'a'. + pxor TMP1, GHASH_ACC + + // b_L + b_H + pshufd $0x4e, GHASH_ACC, MI + pxor GHASH_ACC, MI + + // LO = a_L * b_L + _vpclmulqdq $0x00, TMP0, GHASH_ACC, LO + + // HI = a_H * b_H + pclmulqdq $0x11, TMP0, GHASH_ACC + + // MI = (a_L + a_H) * (b_L + b_H) + pclmulqdq $0x00, OFFSETOF_H_POWERS_XORED(KEY), MI +.endm + +// Continue the GHASH update of 8 ciphertext blocks as described above by doing +// an unreduced multiplication of the next ciphertext block by the next lowest +// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI. 
+.macro _ghash_update_continue_8x enc + add $8, %eax + + // Load the next lowest key power. + movdqa OFFSETOF_H_POWERS(KEY,%rax,2), TMP0 + + // Load the next ciphertext block and byte-reflect it. +.if \enc + movdqu (DST,%rax,2), TMP1 +.else + movdqu (SRC,%rax,2), TMP1 +.endif + pshufb BSWAP_MASK, TMP1 + + // LO += a_L * b_L + _vpclmulqdq $0x00, TMP0, TMP1, TMP2 + pxor TMP2, LO + + // b_L + b_H + pshufd $0x4e, TMP1, TMP2 + pxor TMP1, TMP2 + + // HI += a_H * b_H + pclmulqdq $0x11, TMP0, TMP1 + pxor TMP1, GHASH_ACC + + // MI += (a_L + a_H) * (b_L + b_H) + movq OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1 + pclmulqdq $0x00, TMP1, TMP2 + pxor TMP2, MI +.endm + +// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to +// _ghash_reduce, but it's hardcoded to use the registers of the main loop and +// it uses the same register for HI and the destination. It's also divided into +// two steps. TMP1 must be preserved across steps. +// +// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of +// shuffling LO, XOR'ing LO into MI, and shuffling MI. However, this would +// increase the critical path length, and it seems to slightly hurt performance. +.macro _ghash_update_end_8x_step i +.if \i == 0 + movq .Lgfpoly(%rip), TMP1 + pxor LO, MI + pxor GHASH_ACC, MI + pshufd $0x4e, LO, TMP2 + pclmulqdq $0x00, TMP1, LO + pxor TMP2, MI + pxor LO, MI +.elseif \i == 1 + pshufd $0x4e, MI, TMP2 + pclmulqdq $0x00, TMP1, MI + pxor TMP2, GHASH_ACC + pxor MI, GHASH_ACC +.endif +.endm + +// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key); +// +// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH +// related fields in the key struct. +.macro _aes_gcm_precompute + + // Function arguments + .set KEY, %rdi + + // Additional local variables. + // %xmm0-%xmm1 and %rax are used as temporaries. + .set RNDKEYLAST_PTR, %rsi + .set H_CUR, %xmm2 + .set H_POW1, %xmm3 // H^1 + .set H_POW1_X64, %xmm4 // H^1 * x^64 + .set GFPOLY, %xmm5 + + // Encrypt an all-zeroes block to get the raw hash subkey. + movl OFFSETOF_AESKEYLEN(KEY), %eax + lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR + movdqa (KEY), H_POW1 // Zero-th round key XOR all-zeroes block + lea 16(KEY), %rax +1: + aesenc (%rax), H_POW1 + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne 1b + aesenclast (RNDKEYLAST_PTR), H_POW1 + + // Preprocess the raw hash subkey as needed to operate on GHASH's + // bit-reflected values directly: reflect its bytes, then multiply it by + // x^-1 (using the backwards interpretation of polynomial coefficients + // from the GCM spec) or equivalently x^1 (using the alternative, + // natural interpretation of polynomial coefficients). + pshufb .Lbswap_mask(%rip), H_POW1 + movdqa H_POW1, %xmm0 + pshufd $0xd3, %xmm0, %xmm0 + psrad $31, %xmm0 + paddq H_POW1, H_POW1 + pand .Lgfpoly_and_internal_carrybit(%rip), %xmm0 + pxor %xmm0, H_POW1 + + // Store H^1. + movdqa H_POW1, OFFSETOF_H_POWERS+7*16(KEY) + + // Compute and store H^1 * x^64. + movq .Lgfpoly(%rip), GFPOLY + pshufd $0x4e, H_POW1, %xmm0 + _vpclmulqdq $0x00, H_POW1, GFPOLY, H_POW1_X64 + pxor %xmm0, H_POW1_X64 + movdqa H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY) + + // Compute and store the halves of H^1 XOR'd together. + pxor H_POW1, %xmm0 + movq %xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY) + + // Compute and store the remaining key powers H^2 through H^8. + movdqa H_POW1, H_CUR + mov $6*8, %eax +.Lprecompute_next\@: + // Compute H^i = H^{i-1} * H^1. + _ghash_mul H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1 + // Store H^i. 
+ movdqa H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2) + // Compute and store the halves of H^i XOR'd together. + pshufd $0x4e, H_CUR, %xmm0 + pxor H_CUR, %xmm0 + movq %xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax) + sub $8, %eax + jge .Lprecompute_next\@ + + RET +.endm + +// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key, +// u8 ghash_acc[16], const u8 *aad, int aadlen); +// +// This function processes the AAD (Additional Authenticated Data) in GCM. +// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the +// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all +// zeroes. |aadlen| must be a multiple of 16, except on the last call where it +// can be any length. The caller must do any buffering needed to ensure this. +.macro _aes_gcm_aad_update + + // Function arguments + .set KEY, %rdi + .set GHASH_ACC_PTR, %rsi + .set AAD, %rdx + .set AADLEN, %ecx + // Note: _load_partial_block relies on AADLEN being in %ecx. + + // Additional local variables. + // %rax, %r10, and %xmm0-%xmm1 are used as temporary registers. + .set BSWAP_MASK, %xmm2 + .set GHASH_ACC, %xmm3 + .set H_POW1, %xmm4 // H^1 + .set H_POW1_X64, %xmm5 // H^1 * x^64 + .set GFPOLY, %xmm6 + + movdqa .Lbswap_mask(%rip), BSWAP_MASK + movdqu (GHASH_ACC_PTR), GHASH_ACC + movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1 + movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64 + movq .Lgfpoly(%rip), GFPOLY + + // Process the AAD one full block at a time. + sub $16, AADLEN + jl .Laad_loop_1x_done\@ +.Laad_loop_1x\@: + movdqu (AAD), %xmm0 + pshufb BSWAP_MASK, %xmm0 + pxor %xmm0, GHASH_ACC + _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1 + add $16, AAD + sub $16, AADLEN + jge .Laad_loop_1x\@ +.Laad_loop_1x_done\@: + // Check whether there is a partial block at the end. + add $16, AADLEN + jz .Laad_done\@ + + // Process a partial block of length 1 <= AADLEN <= 15. + // _load_partial_block assumes that %ecx contains AADLEN. + _load_partial_block AAD, %xmm0, %r10, %r10d + pshufb BSWAP_MASK, %xmm0 + pxor %xmm0, GHASH_ACC + _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1 + +.Laad_done\@: + movdqu GHASH_ACC, (GHASH_ACC_PTR) + RET +.endm + +// Increment LE_CTR eight times to generate eight little-endian counter blocks, +// swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with +// the zero-th AES round key. Clobbers TMP0 and TMP1. +.macro _ctr_begin_8x + movq .Lone(%rip), TMP0 + movdqa (KEY), TMP1 // zero-th round key +.irp i, 0,1,2,3,4,5,6,7 + _vpshufb BSWAP_MASK, LE_CTR, AESDATA\i + pxor TMP1, AESDATA\i + paddd TMP0, LE_CTR +.endr +.endm + +// Do a non-last round of AES on AESDATA[0-7] using \round_key. +.macro _aesenc_8x round_key +.irp i, 0,1,2,3,4,5,6,7 + aesenc \round_key, AESDATA\i +.endr +.endm + +// Do the last round of AES on AESDATA[0-7] using \round_key. +.macro _aesenclast_8x round_key +.irp i, 0,1,2,3,4,5,6,7 + aesenclast \round_key, AESDATA\i +.endr +.endm + +// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and +// store the result to DST. Clobbers TMP0. +.macro _xor_data_8x +.irp i, 0,1,2,3,4,5,6,7 + _xor_mem_to_reg \i*16(SRC), AESDATA\i, tmp=TMP0 +.endr +.irp i, 0,1,2,3,4,5,6,7 + movdqu AESDATA\i, \i*16(DST) +.endr +.endm + +// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// const u8 *src, u8 *dst, int datalen); +// +// This macro generates a GCM encryption or decryption update function with the +// above prototype (with \enc selecting which one). 
+// +// This function computes the next portion of the CTR keystream, XOR's it with +// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted +// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the +// next |datalen| ciphertext bytes. +// +// |datalen| must be a multiple of 16, except on the last call where it can be +// any length. The caller must do any buffering needed to ensure this. Both +// in-place and out-of-place en/decryption are supported. +// +// |le_ctr| must give the current counter in little-endian format. For a new +// message, the low word of the counter must be 2. This function loads the +// counter from |le_ctr| and increments the loaded counter as needed, but it +// does *not* store the updated counter back to |le_ctr|. The caller must +// update |le_ctr| if any more data segments follow. Internally, only the low +// 32-bit word of the counter is incremented, following the GCM standard. +.macro _aes_gcm_update enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi // Note: overlaps with usage as temp reg + .set GHASH_ACC_PTR, %rdx + .set SRC, %rcx + .set DST, %r8 + .set DATALEN, %r9d + .set DATALEN64, %r9 // Zero-extend DATALEN before using! + // Note: the code setting up for _load_partial_block assumes that SRC is + // in %rcx (and that DATALEN is *not* in %rcx). + + // Additional local variables + + // %rax and %rsi are used as temporary registers. Note: %rsi overlaps + // with LE_CTR_PTR, which is used only at the beginning. + + .set AESKEYLEN, %r10d // AES key length in bytes + .set AESKEYLEN64, %r10 + .set RNDKEYLAST_PTR, %r11 // Pointer to last AES round key + + // Put the most frequently used values in %xmm0-%xmm7 to reduce code + // size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.) + .set TMP0, %xmm0 + .set TMP1, %xmm1 + .set TMP2, %xmm2 + .set LO, %xmm3 // Low part of unreduced product + .set MI, %xmm4 // Middle part of unreduced product + .set GHASH_ACC, %xmm5 // GHASH accumulator; in main loop also + // the high part of unreduced product + .set BSWAP_MASK, %xmm6 // Shuffle mask for reflecting bytes + .set LE_CTR, %xmm7 // Little-endian counter value + .set AESDATA0, %xmm8 + .set AESDATA1, %xmm9 + .set AESDATA2, %xmm10 + .set AESDATA3, %xmm11 + .set AESDATA4, %xmm12 + .set AESDATA5, %xmm13 + .set AESDATA6, %xmm14 + .set AESDATA7, %xmm15 + + movdqa .Lbswap_mask(%rip), BSWAP_MASK + movdqu (GHASH_ACC_PTR), GHASH_ACC + movdqu (LE_CTR_PTR), LE_CTR + + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR + + // If there are at least 8*16 bytes of data, then continue into the main + // loop, which processes 8*16 bytes of data per iteration. + // + // The main loop interleaves AES and GHASH to improve performance on + // CPUs that can execute these instructions in parallel. When + // decrypting, the GHASH input (the ciphertext) is immediately + // available. When encrypting, we instead encrypt a set of 8 blocks + // first and then GHASH those blocks while encrypting the next set of 8, + // repeat that as needed, and finally GHASH the last set of 8 blocks. + // + // Code size optimization: Prefer adding or subtracting -8*16 over 8*16, + // as this makes the immediate fit in a signed byte, saving 3 bytes. + add $-8*16, DATALEN + jl .Lcrypt_loop_8x_done\@ +.if \enc + // Encrypt the first 8 plaintext blocks. 
+ _ctr_begin_8x + lea 16(KEY), %rsi + .p2align 4 +1: + movdqa (%rsi), TMP0 + _aesenc_8x TMP0 + add $16, %rsi + cmp %rsi, RNDKEYLAST_PTR + jne 1b + movdqa (%rsi), TMP0 + _aesenclast_8x TMP0 + _xor_data_8x + // Don't increment DST until the ciphertext blocks have been hashed. + sub $-8*16, SRC + add $-8*16, DATALEN + jl .Lghash_last_ciphertext_8x\@ +.endif + + .p2align 4 +.Lcrypt_loop_8x\@: + + // Generate the next set of 8 counter blocks and start encrypting them. + _ctr_begin_8x + lea 16(KEY), %rsi + + // Do a round of AES, and start the GHASH update of 8 ciphertext blocks + // by doing the unreduced multiplication for the first ciphertext block. + movdqa (%rsi), TMP0 + add $16, %rsi + _aesenc_8x TMP0 + _ghash_update_begin_8x \enc + + // Do 7 more rounds of AES, and continue the GHASH update by doing the + // unreduced multiplication for the remaining ciphertext blocks. + .p2align 4 +1: + movdqa (%rsi), TMP0 + add $16, %rsi + _aesenc_8x TMP0 + _ghash_update_continue_8x \enc + cmp $7*8, %eax + jne 1b + + // Do the remaining AES rounds. + .p2align 4 +1: + movdqa (%rsi), TMP0 + add $16, %rsi + _aesenc_8x TMP0 + cmp %rsi, RNDKEYLAST_PTR + jne 1b + + // Do the GHASH reduction and the last round of AES. + movdqa (RNDKEYLAST_PTR), TMP0 + _ghash_update_end_8x_step 0 + _aesenclast_8x TMP0 + _ghash_update_end_8x_step 1 + + // XOR the data with the AES-CTR keystream blocks. +.if \enc + sub $-8*16, DST +.endif + _xor_data_8x + sub $-8*16, SRC +.if !\enc + sub $-8*16, DST +.endif + add $-8*16, DATALEN + jge .Lcrypt_loop_8x\@ + +.if \enc +.Lghash_last_ciphertext_8x\@: + // Update GHASH with the last set of 8 ciphertext blocks. + _ghash_update_begin_8x \enc + .p2align 4 +1: + _ghash_update_continue_8x \enc + cmp $7*8, %eax + jne 1b + _ghash_update_end_8x_step 0 + _ghash_update_end_8x_step 1 + sub $-8*16, DST +.endif + +.Lcrypt_loop_8x_done\@: + + sub $-8*16, DATALEN + jz .Ldone\@ + + // Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep + // things simple and keep the code size down by just going one block at + // a time, again taking advantage of hardware loop unrolling. Since + // there are enough key powers available for all remaining data, we do + // the GHASH multiplications unreduced, and only reduce at the very end. + + .set HI, TMP2 + .set H_POW, AESDATA0 + .set H_POW_XORED, AESDATA1 + .set ONE, AESDATA2 + + movq .Lone(%rip), ONE + + // Start collecting the unreduced GHASH intermediate value LO, MI, HI. + pxor LO, LO + pxor MI, MI + pxor HI, HI + + // Set up a block counter %rax to contain 8*(8-n), where n is the number + // of blocks that remain, counting any partial block. This will be used + // to access the key powers H^n through H^1. + mov DATALEN, %eax + neg %eax + and $~15, %eax + sar $1, %eax + add $64, %eax + + sub $16, DATALEN + jl .Lcrypt_loop_1x_done\@ + + // Process the data one full block at a time. +.Lcrypt_loop_1x\@: + + // Encrypt the next counter block. + _vpshufb BSWAP_MASK, LE_CTR, TMP0 + paddd ONE, LE_CTR + pxor (KEY), TMP0 + lea -6*16(RNDKEYLAST_PTR), %rsi // Reduce code size + cmp $24, AESKEYLEN + jl 128f // AES-128? + je 192f // AES-192? + // AES-256 + aesenc -7*16(%rsi), TMP0 + aesenc -6*16(%rsi), TMP0 +192: + aesenc -5*16(%rsi), TMP0 + aesenc -4*16(%rsi), TMP0 +128: +.irp i, -3,-2,-1,0,1,2,3,4,5 + aesenc \i*16(%rsi), TMP0 +.endr + aesenclast (RNDKEYLAST_PTR), TMP0 + + // Load the next key power H^i. 
+ movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW + movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED + + // XOR the keystream block that was just generated in TMP0 with the next + // source data block and store the resulting en/decrypted data to DST. +.if \enc + _xor_mem_to_reg (SRC), TMP0, tmp=TMP1 + movdqu TMP0, (DST) +.else + movdqu (SRC), TMP1 + pxor TMP1, TMP0 + movdqu TMP0, (DST) +.endif + + // Update GHASH with the ciphertext block. +.if \enc + pshufb BSWAP_MASK, TMP0 + pxor TMP0, GHASH_ACC +.else + pshufb BSWAP_MASK, TMP1 + pxor TMP1, GHASH_ACC +.endif + _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0 + pxor GHASH_ACC, GHASH_ACC + + add $8, %eax + add $16, SRC + add $16, DST + sub $16, DATALEN + jge .Lcrypt_loop_1x\@ +.Lcrypt_loop_1x_done\@: + // Check whether there is a partial block at the end. + add $16, DATALEN + jz .Lghash_reduce\@ + + // Process a partial block of length 1 <= DATALEN <= 15. + + // Encrypt a counter block for the last time. + pshufb BSWAP_MASK, LE_CTR + pxor (KEY), LE_CTR + lea 16(KEY), %rsi +1: + aesenc (%rsi), LE_CTR + add $16, %rsi + cmp %rsi, RNDKEYLAST_PTR + jne 1b + aesenclast (RNDKEYLAST_PTR), LE_CTR + + // Load the lowest key power, H^1. + movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW + movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED + + // Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is + // in %rcx, but _load_partial_block needs DATALEN in %rcx instead. + // RNDKEYLAST_PTR is no longer needed, so reuse it for SRC. + mov SRC, RNDKEYLAST_PTR + mov DATALEN, %ecx + _load_partial_block RNDKEYLAST_PTR, TMP0, %rsi, %esi + + // XOR the keystream block that was just generated in LE_CTR with the + // source data block and store the resulting en/decrypted data to DST. + pxor TMP0, LE_CTR + mov DATALEN, %ecx + _store_partial_block LE_CTR, DST + + // If encrypting, zero-pad the final ciphertext block for GHASH. (If + // decrypting, this was already done by _load_partial_block.) +.if \enc + lea .Lzeropad_mask+16(%rip), %rax + sub DATALEN64, %rax + _vpand (%rax), LE_CTR, TMP0 +.endif + + // Update GHASH with the final ciphertext block. + pshufb BSWAP_MASK, TMP0 + pxor TMP0, GHASH_ACC + _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0 + +.Lghash_reduce\@: + // Finally, do the GHASH reduction. + _ghash_reduce LO, MI, HI, GHASH_ACC, TMP0 + +.Ldone\@: + // Store the updated GHASH accumulator back to memory. + movdqu GHASH_ACC, (GHASH_ACC_PTR) + + RET +.endm + +// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen); +// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key, +// const u32 le_ctr[4], const u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen, +// const u8 tag[16], int taglen); +// +// This macro generates one of the above two functions (with \enc selecting +// which one). Both functions finish computing the GCM authentication tag by +// updating GHASH with the lengths block and encrypting the GHASH accumulator. +// |total_aadlen| and |total_datalen| must be the total length of the additional +// authenticated data and the en/decrypted data in bytes, respectively. +// +// The encryption function then stores the full-length (16-byte) computed +// authentication tag to |ghash_acc|. 
The decryption function instead loads the +// expected authentication tag (the one that was transmitted) from the 16-byte +// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the +// computed tag in constant time, and returns true if and only if they match. +.macro _aes_gcm_final enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi + .set GHASH_ACC_PTR, %rdx + .set TOTAL_AADLEN, %rcx + .set TOTAL_DATALEN, %r8 + .set TAG, %r9 + .set TAGLEN, %r10d // Originally at 8(%rsp) + .set TAGLEN64, %r10 + + // Additional local variables. + // %rax and %xmm0-%xmm2 are used as temporary registers. + .set AESKEYLEN, %r11d + .set AESKEYLEN64, %r11 + .set BSWAP_MASK, %xmm3 + .set GHASH_ACC, %xmm4 + .set H_POW1, %xmm5 // H^1 + .set H_POW1_X64, %xmm6 // H^1 * x^64 + .set GFPOLY, %xmm7 + + movdqa .Lbswap_mask(%rip), BSWAP_MASK + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + + // Set up a counter block with 1 in the low 32-bit word. This is the + // counter that produces the ciphertext needed to encrypt the auth tag. + movdqu (LE_CTR_PTR), %xmm0 + mov $1, %eax + pinsrd $0, %eax, %xmm0 + + // Build the lengths block and XOR it into the GHASH accumulator. + movq TOTAL_DATALEN, GHASH_ACC + pinsrq $1, TOTAL_AADLEN, GHASH_ACC + psllq $3, GHASH_ACC // Bytes to bits + _xor_mem_to_reg (GHASH_ACC_PTR), GHASH_ACC, %xmm1 + + movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1 + movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64 + movq .Lgfpoly(%rip), GFPOLY + + // Make %rax point to the 6th from last AES round key. (Using signed + // byte offsets -7*16 through 6*16 decreases code size.) + lea (KEY,AESKEYLEN64,4), %rax + + // AES-encrypt the counter block and also multiply GHASH_ACC by H^1. + // Interleave the AES and GHASH instructions to improve performance. + pshufb BSWAP_MASK, %xmm0 + pxor (KEY), %xmm0 + cmp $24, AESKEYLEN + jl 128f // AES-128? + je 192f // AES-192? + // AES-256 + aesenc -7*16(%rax), %xmm0 + aesenc -6*16(%rax), %xmm0 +192: + aesenc -5*16(%rax), %xmm0 + aesenc -4*16(%rax), %xmm0 +128: +.irp i, 0,1,2,3,4,5,6,7,8 + aesenc (\i-3)*16(%rax), %xmm0 + _ghash_mul_step \i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2 +.endr + aesenclast 6*16(%rax), %xmm0 + _ghash_mul_step 9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2 + + // Undo the byte reflection of the GHASH accumulator. + pshufb BSWAP_MASK, GHASH_ACC + + // Encrypt the GHASH accumulator. + pxor %xmm0, GHASH_ACC + +.if \enc + // Return the computed auth tag. + movdqu GHASH_ACC, (GHASH_ACC_PTR) +.else + .set ZEROPAD_MASK_PTR, TOTAL_AADLEN // Reusing TOTAL_AADLEN! + + // Verify the auth tag in constant time by XOR'ing the transmitted and + // computed auth tags together and using the ptest instruction to check + // whether the first TAGLEN bytes of the result are zero. 
+ _xor_mem_to_reg (TAG), GHASH_ACC, tmp=%xmm0 + movl 8(%rsp), TAGLEN + lea .Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR + sub TAGLEN64, ZEROPAD_MASK_PTR + xor %eax, %eax + _test_mem (ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0 + sete %al +.endif + RET +.endm + +.set USE_AVX, 0 +SYM_FUNC_START(aes_gcm_precompute_aesni) + _aes_gcm_precompute +SYM_FUNC_END(aes_gcm_precompute_aesni) +SYM_FUNC_START(aes_gcm_aad_update_aesni) + _aes_gcm_aad_update +SYM_FUNC_END(aes_gcm_aad_update_aesni) +SYM_FUNC_START(aes_gcm_enc_update_aesni) + _aes_gcm_update 1 +SYM_FUNC_END(aes_gcm_enc_update_aesni) +SYM_FUNC_START(aes_gcm_dec_update_aesni) + _aes_gcm_update 0 +SYM_FUNC_END(aes_gcm_dec_update_aesni) +SYM_FUNC_START(aes_gcm_enc_final_aesni) + _aes_gcm_final 1 +SYM_FUNC_END(aes_gcm_enc_final_aesni) +SYM_FUNC_START(aes_gcm_dec_final_aesni) + _aes_gcm_final 0 +SYM_FUNC_END(aes_gcm_dec_final_aesni) + +.set USE_AVX, 1 +SYM_FUNC_START(aes_gcm_precompute_aesni_avx) + _aes_gcm_precompute +SYM_FUNC_END(aes_gcm_precompute_aesni_avx) +SYM_FUNC_START(aes_gcm_aad_update_aesni_avx) + _aes_gcm_aad_update +SYM_FUNC_END(aes_gcm_aad_update_aesni_avx) +SYM_FUNC_START(aes_gcm_enc_update_aesni_avx) + _aes_gcm_update 1 +SYM_FUNC_END(aes_gcm_enc_update_aesni_avx) +SYM_FUNC_START(aes_gcm_dec_update_aesni_avx) + _aes_gcm_update 0 +SYM_FUNC_END(aes_gcm_dec_update_aesni_avx) +SYM_FUNC_START(aes_gcm_enc_final_aesni_avx) + _aes_gcm_final 1 +SYM_FUNC_END(aes_gcm_enc_final_aesni_avx) +SYM_FUNC_START(aes_gcm_dec_final_aesni_avx) + _aes_gcm_final 0 +SYM_FUNC_END(aes_gcm_dec_final_aesni_avx) diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 39066b57a70e..eb153eff9331 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -10,16 +10,7 @@ * Vinodh Gopal * Kahraman Akdemir * - * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD - * interface for 64-bit kernels. - * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com) - * Aidan O'Mahony (aidan.o.mahony@intel.com) - * Adrian Hoban - * James Guilford (james.guilford@intel.com) - * Gabriele Paoloni - * Tadeusz Struk (tadeusz.struk@intel.com) - * Wajdi Feghali (wajdi.k.feghali@intel.com) - * Copyright (c) 2010, Intel Corporation. + * Copyright (c) 2010, Intel Corporation. * * Ported x86_64 version to x86: * Author: Mathias Krause @@ -27,95 +18,6 @@ #include #include -#include - -/* - * The following macros are used to move an (un)aligned 16 byte value to/from - * an XMM register. This can done for either FP or integer values, for FP use - * movaps (move aligned packed single) or integer use movdqa (move double quad - * aligned). It doesn't make a performance difference which instruction is used - * since Nehalem (original Core i7) was released. However, the movaps is a byte - * shorter, so that is the one we'll use for now. (same for unaligned). 
- */ -#define MOVADQ movaps -#define MOVUDQ movups - -#ifdef __x86_64__ - -# constants in mergeable sections, linker can reorder and merge -.section .rodata.cst16.POLY, "aM", @progbits, 16 -.align 16 -POLY: .octa 0xC2000000000000000000000000000001 -.section .rodata.cst16.TWOONE, "aM", @progbits, 16 -.align 16 -TWOONE: .octa 0x00000001000000000000000000000001 - -.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 -.align 16 -SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F -.section .rodata.cst16.MASK1, "aM", @progbits, 16 -.align 16 -MASK1: .octa 0x0000000000000000ffffffffffffffff -.section .rodata.cst16.MASK2, "aM", @progbits, 16 -.align 16 -MASK2: .octa 0xffffffffffffffff0000000000000000 -.section .rodata.cst16.ONE, "aM", @progbits, 16 -.align 16 -ONE: .octa 0x00000000000000000000000000000001 -.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16 -.align 16 -F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 -.section .rodata.cst16.dec, "aM", @progbits, 16 -.align 16 -dec: .octa 0x1 -.section .rodata.cst16.enc, "aM", @progbits, 16 -.align 16 -enc: .octa 0x2 - -# order of these constants should not change. -# more specifically, ALL_F should follow SHIFT_MASK, -# and zero should follow ALL_F -.section .rodata, "a", @progbits -.align 16 -SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 -ALL_F: .octa 0xffffffffffffffffffffffffffffffff - .octa 0x00000000000000000000000000000000 - -.text - -#define AadHash 16*0 -#define AadLen 16*1 -#define InLen (16*1)+8 -#define PBlockEncKey 16*2 -#define OrigIV 16*3 -#define CurCount 16*4 -#define PBlockLen 16*5 -#define HashKey 16*6 // store HashKey <<1 mod poly here -#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here -#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here -#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here -#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64 - // bits of HashKey <<1 mod poly here - //(for Karatsuba purposes) -#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64 - // bits of HashKey^2 <<1 mod poly here - // (for Karatsuba purposes) -#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64 - // bits of HashKey^3 <<1 mod poly here - // (for Karatsuba purposes) -#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64 - // bits of HashKey^4 <<1 mod poly here - // (for Karatsuba purposes) - -#define arg1 rdi -#define arg2 rsi -#define arg3 rdx -#define arg4 rcx -#define arg5 r8 -#define arg6 r9 -#define keysize 2*15*16(%arg1) -#endif - #define STATE1 %xmm0 #define STATE2 %xmm4 @@ -162,1409 +64,6 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff #define TKEYP T1 #endif -.macro FUNC_SAVE - push %r12 - push %r13 - push %r14 -# -# states of %xmm registers %xmm6:%xmm15 not saved -# all %xmm registers are clobbered -# -.endm - - -.macro FUNC_RESTORE - pop %r14 - pop %r13 - pop %r12 -.endm - -# Precompute hashkeys. -# Input: Hash subkey. -# Output: HashKeys stored in gcm_context_data. Only needs to be called -# once per key. -# clobbers r12, and tmp xmm registers. 
-.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7 - mov \SUBKEY, %r12 - movdqu (%r12), \TMP3 - movdqa SHUF_MASK(%rip), \TMP2 - pshufb \TMP2, \TMP3 - - # precompute HashKey<<1 mod poly from the HashKey (required for GHASH) - - movdqa \TMP3, \TMP2 - psllq $1, \TMP3 - psrlq $63, \TMP2 - movdqa \TMP2, \TMP1 - pslldq $8, \TMP2 - psrldq $8, \TMP1 - por \TMP2, \TMP3 - - # reduce HashKey<<1 - - pshufd $0x24, \TMP1, \TMP2 - pcmpeqd TWOONE(%rip), \TMP2 - pand POLY(%rip), \TMP2 - pxor \TMP2, \TMP3 - movdqu \TMP3, HashKey(%arg2) - - movdqa \TMP3, \TMP5 - pshufd $78, \TMP3, \TMP1 - pxor \TMP3, \TMP1 - movdqu \TMP1, HashKey_k(%arg2) - - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^2<<1 (mod poly) - movdqu \TMP5, HashKey_2(%arg2) -# HashKey_2 = HashKey^2<<1 (mod poly) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqu \TMP1, HashKey_2_k(%arg2) - - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^3<<1 (mod poly) - movdqu \TMP5, HashKey_3(%arg2) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqu \TMP1, HashKey_3_k(%arg2) - - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^3<<1 (mod poly) - movdqu \TMP5, HashKey_4(%arg2) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqu \TMP1, HashKey_4_k(%arg2) -.endm - -# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding. -# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13 -.macro GCM_INIT Iv SUBKEY AAD AADLEN - mov \AADLEN, %r11 - mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length - xor %r11d, %r11d - mov %r11, InLen(%arg2) # ctx_data.in_length = 0 - mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0 - mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0 - mov \Iv, %rax - movdqu (%rax), %xmm0 - movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv - - movdqa SHUF_MASK(%rip), %xmm2 - pshufb %xmm2, %xmm0 - movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv - - PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7 - movdqu HashKey(%arg2), %xmm13 - - CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \ - %xmm4, %xmm5, %xmm6 -.endm - -# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context -# struct has been initialized by GCM_INIT. 
-# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK -# Clobbers rax, r10-r13, and xmm0-xmm15 -.macro GCM_ENC_DEC operation - movdqu AadHash(%arg2), %xmm8 - movdqu HashKey(%arg2), %xmm13 - add %arg5, InLen(%arg2) - - xor %r11d, %r11d # initialise the data pointer offset as zero - PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation - - sub %r11, %arg5 # sub partial block data used - mov %arg5, %r13 # save the number of bytes - - and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) - mov %r13, %r12 - # Encrypt/Decrypt first few blocks - - and $(3<<4), %r12 - jz .L_initial_num_blocks_is_0_\@ - cmp $(2<<4), %r12 - jb .L_initial_num_blocks_is_1_\@ - je .L_initial_num_blocks_is_2_\@ -.L_initial_num_blocks_is_3_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation - sub $48, %r13 - jmp .L_initial_blocks_\@ -.L_initial_num_blocks_is_2_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation - sub $32, %r13 - jmp .L_initial_blocks_\@ -.L_initial_num_blocks_is_1_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation - sub $16, %r13 - jmp .L_initial_blocks_\@ -.L_initial_num_blocks_is_0_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation -.L_initial_blocks_\@: - - # Main loop - Encrypt/Decrypt remaining blocks - - test %r13, %r13 - je .L_zero_cipher_left_\@ - sub $64, %r13 - je .L_four_cipher_left_\@ -.L_crypt_by_4_\@: - GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \ - %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \ - %xmm7, %xmm8, enc - add $64, %r11 - sub $64, %r13 - jne .L_crypt_by_4_\@ -.L_four_cipher_left_\@: - GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ -%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 -.L_zero_cipher_left_\@: - movdqu %xmm8, AadHash(%arg2) - movdqu %xmm0, CurCount(%arg2) - - mov %arg5, %r13 - and $15, %r13 # %r13 = arg5 (mod 16) - je .L_multiple_of_16_bytes_\@ - - mov %r13, PBlockLen(%arg2) - - # Handle the last <16 Byte block separately - paddd ONE(%rip), %xmm0 # INCR CNT to get Yn - movdqu %xmm0, CurCount(%arg2) - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10, %xmm0 - - ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) - movdqu %xmm0, PBlockEncKey(%arg2) - - cmp $16, %arg5 - jge .L_large_enough_update_\@ - - lea (%arg4,%r11,1), %r10 - mov %r13, %r12 - READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 - jmp .L_data_read_\@ - -.L_large_enough_update_\@: - sub $16, %r11 - add %r13, %r11 - - # receive the last <16 Byte block - movdqu (%arg4, %r11, 1), %xmm1 - - sub %r13, %r11 - add $16, %r11 - - lea SHIFT_MASK+16(%rip), %r12 - # adjust the shuffle mask pointer to be able to shift 16-r13 bytes - # (r13 is the number of bytes in plaintext mod 16) - sub %r13, %r12 - # get the appropriate shuffle mask - movdqu (%r12), %xmm2 - # shift right 16-r13 bytes - pshufb %xmm2, %xmm1 - -.L_data_read_\@: - lea ALL_F+16(%rip), %r12 - sub %r13, %r12 - -.ifc \operation, dec - movdqa %xmm1, %xmm2 -.endif - pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn) - movdqu (%r12), %xmm1 - # get the appropriate mask to mask out top 16-r13 bytes of xmm0 - pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 -.ifc \operation, dec - pand %xmm1, %xmm2 - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10 
,%xmm2 - - pxor %xmm2, %xmm8 -.else - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10,%xmm0 - - pxor %xmm0, %xmm8 -.endif - - movdqu %xmm8, AadHash(%arg2) -.ifc \operation, enc - # GHASH computation for the last <16 byte block - movdqa SHUF_MASK(%rip), %xmm10 - # shuffle xmm0 back to output as ciphertext - pshufb %xmm10, %xmm0 -.endif - - # Output %r13 bytes - movq %xmm0, %rax - cmp $8, %r13 - jle .L_less_than_8_bytes_left_\@ - mov %rax, (%arg3 , %r11, 1) - add $8, %r11 - psrldq $8, %xmm0 - movq %xmm0, %rax - sub $8, %r13 -.L_less_than_8_bytes_left_\@: - mov %al, (%arg3, %r11, 1) - add $1, %r11 - shr $8, %rax - sub $1, %r13 - jne .L_less_than_8_bytes_left_\@ -.L_multiple_of_16_bytes_\@: -.endm - -# GCM_COMPLETE Finishes update of tag of last partial block -# Output: Authorization Tag (AUTH_TAG) -# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 -.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN - movdqu AadHash(%arg2), %xmm8 - movdqu HashKey(%arg2), %xmm13 - - mov PBlockLen(%arg2), %r12 - - test %r12, %r12 - je .L_partial_done\@ - - GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 - -.L_partial_done\@: - mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes) - shl $3, %r12 # convert into number of bits - movd %r12d, %xmm15 # len(A) in %xmm15 - mov InLen(%arg2), %r12 - shl $3, %r12 # len(C) in bits (*128) - movq %r12, %xmm1 - - pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 - pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) - pxor %xmm15, %xmm8 - GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 - # final GHASH computation - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10, %xmm8 - - movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0 - ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) - pxor %xmm8, %xmm0 -.L_return_T_\@: - mov \AUTHTAG, %r10 # %r10 = authTag - mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len - cmp $16, %r11 - je .L_T_16_\@ - cmp $8, %r11 - jl .L_T_4_\@ -.L_T_8_\@: - movq %xmm0, %rax - mov %rax, (%r10) - add $8, %r10 - sub $8, %r11 - psrldq $8, %xmm0 - test %r11, %r11 - je .L_return_T_done_\@ -.L_T_4_\@: - movd %xmm0, %eax - mov %eax, (%r10) - add $4, %r10 - sub $4, %r11 - psrldq $4, %xmm0 - test %r11, %r11 - je .L_return_T_done_\@ -.L_T_123_\@: - movd %xmm0, %eax - cmp $2, %r11 - jl .L_T_1_\@ - mov %ax, (%r10) - cmp $2, %r11 - je .L_return_T_done_\@ - add $2, %r10 - sar $16, %eax -.L_T_1_\@: - mov %al, (%r10) - jmp .L_return_T_done_\@ -.L_T_16_\@: - movdqu %xmm0, (%r10) -.L_return_T_done_\@: -.endm - -#ifdef __x86_64__ -/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) -* -* -* Input: A and B (128-bits each, bit-reflected) -* Output: C = A*B*x mod poly, (i.e. >>1 ) -* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input -* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 
-* -*/ -.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5 - movdqa \GH, \TMP1 - pshufd $78, \GH, \TMP2 - pshufd $78, \HK, \TMP3 - pxor \GH, \TMP2 # TMP2 = a1+a0 - pxor \HK, \TMP3 # TMP3 = b1+b0 - pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \HK, \GH # GH = a0*b0 - pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) - pxor \GH, \TMP2 - pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0) - movdqa \TMP2, \TMP3 - pslldq $8, \TMP3 # left shift TMP3 2 DWs - psrldq $8, \TMP2 # right shift TMP2 2 DWs - pxor \TMP3, \GH - pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK - - # first phase of the reduction - - movdqa \GH, \TMP2 - movdqa \GH, \TMP3 - movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4 - # in in order to perform - # independent shifts - pslld $31, \TMP2 # packed right shift <<31 - pslld $30, \TMP3 # packed right shift <<30 - pslld $25, \TMP4 # packed right shift <<25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP5 - psrldq $4, \TMP5 # right shift TMP5 1 DW - pslldq $12, \TMP2 # left shift TMP2 3 DWs - pxor \TMP2, \GH - - # second phase of the reduction - - movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4 - # in in order to perform - # independent shifts - movdqa \GH,\TMP3 - movdqa \GH,\TMP4 - psrld $1,\TMP2 # packed left shift >>1 - psrld $2,\TMP3 # packed left shift >>2 - psrld $7,\TMP4 # packed left shift >>7 - pxor \TMP3,\TMP2 # xor the shifted versions - pxor \TMP4,\TMP2 - pxor \TMP5, \TMP2 - pxor \TMP2, \GH - pxor \TMP1, \GH # result is in TMP1 -.endm - -# Reads DLEN bytes starting at DPTR and stores in XMMDst -# where 0 < DLEN < 16 -# Clobbers %rax, DLEN and XMM1 -.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst - cmp $8, \DLEN - jl .L_read_lt8_\@ - mov (\DPTR), %rax - movq %rax, \XMMDst - sub $8, \DLEN - jz .L_done_read_partial_block_\@ - xor %eax, %eax -.L_read_next_byte_\@: - shl $8, %rax - mov 7(\DPTR, \DLEN, 1), %al - dec \DLEN - jnz .L_read_next_byte_\@ - movq %rax, \XMM1 - pslldq $8, \XMM1 - por \XMM1, \XMMDst - jmp .L_done_read_partial_block_\@ -.L_read_lt8_\@: - xor %eax, %eax -.L_read_next_byte_lt8_\@: - shl $8, %rax - mov -1(\DPTR, \DLEN, 1), %al - dec \DLEN - jnz .L_read_next_byte_lt8_\@ - movq %rax, \XMMDst -.L_done_read_partial_block_\@: -.endm - -# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. -# clobbers r10-11, xmm14 -.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \ - TMP6 TMP7 - MOVADQ SHUF_MASK(%rip), %xmm14 - mov \AAD, %r10 # %r10 = AAD - mov \AADLEN, %r11 # %r11 = aadLen - pxor \TMP7, \TMP7 - pxor \TMP6, \TMP6 - - cmp $16, %r11 - jl .L_get_AAD_rest\@ -.L_get_AAD_blocks\@: - movdqu (%r10), \TMP7 - pshufb %xmm14, \TMP7 # byte-reflect the AAD data - pxor \TMP7, \TMP6 - GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 - add $16, %r10 - sub $16, %r11 - cmp $16, %r11 - jge .L_get_AAD_blocks\@ - - movdqu \TMP6, \TMP7 - - /* read the last <16B of AAD */ -.L_get_AAD_rest\@: - test %r11, %r11 - je .L_get_AAD_done\@ - - READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7 - pshufb %xmm14, \TMP7 # byte-reflect the AAD data - pxor \TMP6, \TMP7 - GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 - movdqu \TMP7, \TMP6 - -.L_get_AAD_done\@: - movdqu \TMP6, AadHash(%arg2) -.endm - -# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks -# between update calls. 
-# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK -# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context -# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 -.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ - AAD_HASH operation - mov PBlockLen(%arg2), %r13 - test %r13, %r13 - je .L_partial_block_done_\@ # Leave Macro if no partial blocks - # Read in input data without over reading - cmp $16, \PLAIN_CYPH_LEN - jl .L_fewer_than_16_bytes_\@ - movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm - jmp .L_data_read_\@ - -.L_fewer_than_16_bytes_\@: - lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 - mov \PLAIN_CYPH_LEN, %r12 - READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1 - - mov PBlockLen(%arg2), %r13 - -.L_data_read_\@: # Finished reading in data - - movdqu PBlockEncKey(%arg2), %xmm9 - movdqu HashKey(%arg2), %xmm13 - - lea SHIFT_MASK(%rip), %r12 - - # adjust the shuffle mask pointer to be able to shift r13 bytes - # r16-r13 is the number of bytes in plaintext mod 16) - add %r13, %r12 - movdqu (%r12), %xmm2 # get the appropriate shuffle mask - pshufb %xmm2, %xmm9 # shift right r13 bytes - -.ifc \operation, dec - movdqa %xmm1, %xmm3 - pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn) - - mov \PLAIN_CYPH_LEN, %r10 - add %r13, %r10 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling - sub $16, %r10 - # Determine if partial block is not being filled and - # shift mask accordingly - jge .L_no_extra_mask_1_\@ - sub %r10, %r12 -.L_no_extra_mask_1_\@: - - movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 - pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9 - - pand %xmm1, %xmm3 - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10, %xmm3 - pshufb %xmm2, %xmm3 - pxor %xmm3, \AAD_HASH - - test %r10, %r10 - jl .L_partial_incomplete_1_\@ - - # GHASH computation for the last <16 Byte block - GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %eax, %eax - - mov %rax, PBlockLen(%arg2) - jmp .L_dec_done_\@ -.L_partial_incomplete_1_\@: - add \PLAIN_CYPH_LEN, PBlockLen(%arg2) -.L_dec_done_\@: - movdqu \AAD_HASH, AadHash(%arg2) -.else - pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn) - - mov \PLAIN_CYPH_LEN, %r10 - add %r13, %r10 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling - sub $16, %r10 - # Determine if partial block is not being filled and - # shift mask accordingly - jge .L_no_extra_mask_2_\@ - sub %r10, %r12 -.L_no_extra_mask_2_\@: - - movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 - pand %xmm1, %xmm9 - - movdqa SHUF_MASK(%rip), %xmm1 - pshufb %xmm1, %xmm9 - pshufb %xmm2, %xmm9 - pxor %xmm9, \AAD_HASH - - test %r10, %r10 - jl .L_partial_incomplete_2_\@ - - # GHASH computation for the last <16 Byte block - GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %eax, %eax - - mov %rax, PBlockLen(%arg2) - jmp .L_encode_done_\@ -.L_partial_incomplete_2_\@: - add \PLAIN_CYPH_LEN, PBlockLen(%arg2) -.L_encode_done_\@: - movdqu \AAD_HASH, AadHash(%arg2) - - movdqa SHUF_MASK(%rip), %xmm10 - # shuffle xmm9 back to output as ciphertext - pshufb %xmm10, %xmm9 - pshufb %xmm2, %xmm9 -.endif - # output encrypted Bytes - test %r10, %r10 - jl .L_partial_fill_\@ - mov %r13, %r12 - mov $16, %r13 - # Set r13 to be the number of bytes to write out - sub %r12, %r13 - jmp .L_count_set_\@ -.L_partial_fill_\@: - mov \PLAIN_CYPH_LEN, %r13 -.L_count_set_\@: - movdqa 
%xmm9, %xmm0 - movq %xmm0, %rax - cmp $8, %r13 - jle .L_less_than_8_bytes_left_\@ - - mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) - add $8, \DATA_OFFSET - psrldq $8, %xmm0 - movq %xmm0, %rax - sub $8, %r13 -.L_less_than_8_bytes_left_\@: - movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) - add $1, \DATA_OFFSET - shr $8, %rax - sub $1, %r13 - jne .L_less_than_8_bytes_left_\@ -.L_partial_block_done_\@: -.endm # PARTIAL_BLOCK - -/* -* if a = number of total plaintext bytes -* b = floor(a/16) -* num_initial_blocks = b mod 4 -* encrypt the initial num_initial_blocks blocks and apply ghash on -* the ciphertext -* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers -* are clobbered -* arg1, %arg2, %arg3 are used as a pointer only, not modified -*/ - - -.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ - XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation - MOVADQ SHUF_MASK(%rip), %xmm14 - - movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0 - - # start AES for num_initial_blocks blocks - - movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0 - -.if (\i == 5) || (\i == 6) || (\i == 7) - - MOVADQ ONE(%RIP),\TMP1 - MOVADQ 0(%arg1),\TMP2 -.irpc index, \i_seq - paddd \TMP1, \XMM0 # INCR Y0 -.ifc \operation, dec - movdqa \XMM0, %xmm\index -.else - MOVADQ \XMM0, %xmm\index -.endif - pshufb %xmm14, %xmm\index # perform a 16 byte swap - pxor \TMP2, %xmm\index -.endr - lea 0x10(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - add $5,%eax # 128->9, 192->11, 256->13 - -.Laes_loop_initial_\@: - MOVADQ (%r10),\TMP1 -.irpc index, \i_seq - aesenc \TMP1, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_initial_\@ - - MOVADQ (%r10), \TMP1 -.irpc index, \i_seq - aesenclast \TMP1, %xmm\index # Last Round -.endr -.irpc index, \i_seq - movdqu (%arg4 , %r11, 1), \TMP1 - pxor \TMP1, %xmm\index - movdqu %xmm\index, (%arg3 , %r11, 1) - # write back plaintext/ciphertext for num_initial_blocks - add $16, %r11 - -.ifc \operation, dec - movdqa \TMP1, %xmm\index -.endif - pshufb %xmm14, %xmm\index - - # prepare plaintext/ciphertext for GHASH computation -.endr -.endif - - # apply GHASH on num_initial_blocks blocks - -.if \i == 5 - pxor %xmm5, %xmm6 - GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - pxor %xmm6, %xmm7 - GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - pxor %xmm7, %xmm8 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 -.elseif \i == 6 - pxor %xmm6, %xmm7 - GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - pxor %xmm7, %xmm8 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 -.elseif \i == 7 - pxor %xmm7, %xmm8 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 -.endif - cmp $64, %r13 - jl .L_initial_blocks_done\@ - # no need for precomputed values -/* -* -* Precomputations for HashKey parallel with encryption of first 4 blocks. 
-* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i -*/ - MOVADQ ONE(%RIP),\TMP1 - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM1 - pshufb %xmm14, \XMM1 # perform a 16 byte swap - - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM2 - pshufb %xmm14, \XMM2 # perform a 16 byte swap - - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM3 - pshufb %xmm14, \XMM3 # perform a 16 byte swap - - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM4 - pshufb %xmm14, \XMM4 # perform a 16 byte swap - - MOVADQ 0(%arg1),\TMP1 - pxor \TMP1, \XMM1 - pxor \TMP1, \XMM2 - pxor \TMP1, \XMM3 - pxor \TMP1, \XMM4 -.irpc index, 1234 # do 4 rounds - movaps 0x10*\index(%arg1), \TMP1 - aesenc \TMP1, \XMM1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 -.endr -.irpc index, 56789 # do next 5 rounds - movaps 0x10*\index(%arg1), \TMP1 - aesenc \TMP1, \XMM1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 -.endr - lea 0xa0(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - sub $4,%eax # 128->0, 192->2, 256->4 - jz .Laes_loop_pre_done\@ - -.Laes_loop_pre_\@: - MOVADQ (%r10),\TMP2 -.irpc index, 1234 - aesenc \TMP2, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_pre_\@ - -.Laes_loop_pre_done\@: - MOVADQ (%r10), \TMP2 - aesenclast \TMP2, \XMM1 - aesenclast \TMP2, \XMM2 - aesenclast \TMP2, \XMM3 - aesenclast \TMP2, \XMM4 - movdqu 16*0(%arg4 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM1 -.ifc \operation, dec - movdqu \XMM1, 16*0(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM1 -.endif - movdqu 16*1(%arg4 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM2 -.ifc \operation, dec - movdqu \XMM2, 16*1(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM2 -.endif - movdqu 16*2(%arg4 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM3 -.ifc \operation, dec - movdqu \XMM3, 16*2(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM3 -.endif - movdqu 16*3(%arg4 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM4 -.ifc \operation, dec - movdqu \XMM4, 16*3(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM4 -.else - movdqu \XMM1, 16*0(%arg3 , %r11 , 1) - movdqu \XMM2, 16*1(%arg3 , %r11 , 1) - movdqu \XMM3, 16*2(%arg3 , %r11 , 1) - movdqu \XMM4, 16*3(%arg3 , %r11 , 1) -.endif - - add $64, %r11 - pshufb %xmm14, \XMM1 # perform a 16 byte swap - pxor \XMMDst, \XMM1 -# combine GHASHed value with the corresponding ciphertext - pshufb %xmm14, \XMM2 # perform a 16 byte swap - pshufb %xmm14, \XMM3 # perform a 16 byte swap - pshufb %xmm14, \XMM4 # perform a 16 byte swap - -.L_initial_blocks_done\@: - -.endm - -/* -* encrypt 4 blocks at a time -* ghash the 4 previously encrypted ciphertext blocks -* arg1, %arg3, %arg4 are used as pointers only, not modified -* %r11 is the data offset value -*/ -.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \ -TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation - - movdqa \XMM1, \XMM5 - movdqa \XMM2, \XMM6 - movdqa \XMM3, \XMM7 - movdqa \XMM4, \XMM8 - - movdqa SHUF_MASK(%rip), %xmm15 - # multiply TMP5 * HashKey using karatsuba - - movdqa \XMM5, \TMP4 - pshufd $78, \XMM5, \TMP6 - pxor \XMM5, \TMP6 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqu HashKey_4(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 - movdqa \XMM0, \XMM1 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM2 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM3 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM4 - pshufb %xmm15, \XMM1 # perform a 16 byte swap - pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 - pshufb %xmm15, \XMM2 # perform a 16 byte swap - pshufb %xmm15, \XMM3 # perform a 16 byte swap - 
pshufb %xmm15, \XMM4 # perform a 16 byte swap - - pxor (%arg1), \XMM1 - pxor (%arg1), \XMM2 - pxor (%arg1), \XMM3 - pxor (%arg1), \XMM4 - movdqu HashKey_4_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) - movaps 0x10(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movaps 0x20(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 2 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movdqa \XMM6, \TMP1 - pshufd $78, \XMM6, \TMP2 - pxor \XMM6, \TMP2 - movdqu HashKey_3(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 - movaps 0x30(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 3 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 - movaps 0x40(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 4 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - movdqu HashKey_3_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movaps 0x50(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 5 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM6, \XMM5 - pxor \TMP2, \TMP6 - movdqa \XMM7, \TMP1 - pshufd $78, \XMM7, \TMP2 - pxor \XMM7, \TMP2 - movdqu HashKey_2(%arg2), \TMP5 - - # Multiply TMP5 * HashKey using karatsuba - - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x60(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 6 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 - movaps 0x70(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 7 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - movdqu HashKey_2_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movaps 0x80(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 8 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM7, \XMM5 - pxor \TMP2, \TMP6 - - # Multiply XMM8 * HashKey - # XMM8 and TMP5 hold the values for the two operands - - movdqa \XMM8, \TMP1 - pshufd $78, \XMM8, \TMP2 - pxor \XMM8, \TMP2 - movdqu HashKey(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x90(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 9 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 - lea 0xa0(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - sub $4,%eax # 128->0, 192->2, 256->4 - jz .Laes_loop_par_enc_done\@ - -.Laes_loop_par_enc\@: - MOVADQ (%r10),\TMP3 -.irpc index, 1234 - aesenc \TMP3, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_par_enc\@ - -.Laes_loop_par_enc_done\@: - MOVADQ (%r10), \TMP3 - aesenclast \TMP3, \XMM1 # Round 10 - aesenclast \TMP3, \XMM2 - aesenclast \TMP3, \XMM3 - aesenclast \TMP3, \XMM4 - movdqu HashKey_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movdqu (%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK - movdqu 16(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK - movdqu 32(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK - movdqu 48(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK - movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer - movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer - 
movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer - movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer - pshufb %xmm15, \XMM1 # perform a 16 byte swap - pshufb %xmm15, \XMM2 # perform a 16 byte swap - pshufb %xmm15, \XMM3 # perform a 16 byte swap - pshufb %xmm15, \XMM4 # perform a 16 byte swap - - pxor \TMP4, \TMP1 - pxor \XMM8, \XMM5 - pxor \TMP6, \TMP2 - pxor \TMP1, \TMP2 - pxor \XMM5, \TMP2 - movdqa \TMP2, \TMP3 - pslldq $8, \TMP3 # left shift TMP3 2 DWs - psrldq $8, \TMP2 # right shift TMP2 2 DWs - pxor \TMP3, \XMM5 - pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 - - # first phase of reduction - - movdqa \XMM5, \TMP2 - movdqa \XMM5, \TMP3 - movdqa \XMM5, \TMP4 -# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently - pslld $31, \TMP2 # packed right shift << 31 - pslld $30, \TMP3 # packed right shift << 30 - pslld $25, \TMP4 # packed right shift << 25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP5 - psrldq $4, \TMP5 # right shift T5 1 DW - pslldq $12, \TMP2 # left shift T2 3 DWs - pxor \TMP2, \XMM5 - - # second phase of reduction - - movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 - movdqa \XMM5,\TMP3 - movdqa \XMM5,\TMP4 - psrld $1, \TMP2 # packed left shift >>1 - psrld $2, \TMP3 # packed left shift >>2 - psrld $7, \TMP4 # packed left shift >>7 - pxor \TMP3,\TMP2 # xor the shifted versions - pxor \TMP4,\TMP2 - pxor \TMP5, \TMP2 - pxor \TMP2, \XMM5 - pxor \TMP1, \XMM5 # result is in TMP1 - - pxor \XMM5, \XMM1 -.endm - -/* -* decrypt 4 blocks at a time -* ghash the 4 previously decrypted ciphertext blocks -* arg1, %arg3, %arg4 are used as pointers only, not modified -* %r11 is the data offset value -*/ -.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \ -TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation - - movdqa \XMM1, \XMM5 - movdqa \XMM2, \XMM6 - movdqa \XMM3, \XMM7 - movdqa \XMM4, \XMM8 - - movdqa SHUF_MASK(%rip), %xmm15 - # multiply TMP5 * HashKey using karatsuba - - movdqa \XMM5, \TMP4 - pshufd $78, \XMM5, \TMP6 - pxor \XMM5, \TMP6 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqu HashKey_4(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 - movdqa \XMM0, \XMM1 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM2 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM3 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM4 - pshufb %xmm15, \XMM1 # perform a 16 byte swap - pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 - pshufb %xmm15, \XMM2 # perform a 16 byte swap - pshufb %xmm15, \XMM3 # perform a 16 byte swap - pshufb %xmm15, \XMM4 # perform a 16 byte swap - - pxor (%arg1), \XMM1 - pxor (%arg1), \XMM2 - pxor (%arg1), \XMM3 - pxor (%arg1), \XMM4 - movdqu HashKey_4_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) - movaps 0x10(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movaps 0x20(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 2 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movdqa \XMM6, \TMP1 - pshufd $78, \XMM6, \TMP2 - pxor \XMM6, \TMP2 - movdqu HashKey_3(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 - movaps 0x30(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 3 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 - movaps 0x40(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 4 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - 
movdqu HashKey_3_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movaps 0x50(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 5 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM6, \XMM5 - pxor \TMP2, \TMP6 - movdqa \XMM7, \TMP1 - pshufd $78, \XMM7, \TMP2 - pxor \XMM7, \TMP2 - movdqu HashKey_2(%arg2), \TMP5 - - # Multiply TMP5 * HashKey using karatsuba - - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x60(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 6 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 - movaps 0x70(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 7 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - movdqu HashKey_2_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movaps 0x80(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 8 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM7, \XMM5 - pxor \TMP2, \TMP6 - - # Multiply XMM8 * HashKey - # XMM8 and TMP5 hold the values for the two operands - - movdqa \XMM8, \TMP1 - pshufd $78, \XMM8, \TMP2 - pxor \XMM8, \TMP2 - movdqu HashKey(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x90(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 9 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 - lea 0xa0(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - sub $4,%eax # 128->0, 192->2, 256->4 - jz .Laes_loop_par_dec_done\@ - -.Laes_loop_par_dec\@: - MOVADQ (%r10),\TMP3 -.irpc index, 1234 - aesenc \TMP3, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_par_dec\@ - -.Laes_loop_par_dec_done\@: - MOVADQ (%r10), \TMP3 - aesenclast \TMP3, \XMM1 # last round - aesenclast \TMP3, \XMM2 - aesenclast \TMP3, \XMM3 - aesenclast \TMP3, \XMM4 - movdqu HashKey_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movdqu (%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK - movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM1 - movdqu 16(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK - movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM2 - movdqu 32(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK - movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM3 - movdqu 48(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK - movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM4 - pshufb %xmm15, \XMM1 # perform a 16 byte swap - pshufb %xmm15, \XMM2 # perform a 16 byte swap - pshufb %xmm15, \XMM3 # perform a 16 byte swap - pshufb %xmm15, \XMM4 # perform a 16 byte swap - - pxor \TMP4, \TMP1 - pxor \XMM8, \XMM5 - pxor \TMP6, \TMP2 - pxor \TMP1, \TMP2 - pxor \XMM5, \TMP2 - movdqa \TMP2, \TMP3 - pslldq $8, \TMP3 # left shift TMP3 2 DWs - psrldq $8, \TMP2 # right shift TMP2 2 DWs - pxor \TMP3, \XMM5 - pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 - - # first phase of reduction - - movdqa \XMM5, \TMP2 - movdqa \XMM5, \TMP3 - movdqa \XMM5, \TMP4 -# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently - pslld $31, \TMP2 # packed right shift << 31 - pslld $30, \TMP3 # packed right 
shift << 30 - pslld $25, \TMP4 # packed right shift << 25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP5 - psrldq $4, \TMP5 # right shift T5 1 DW - pslldq $12, \TMP2 # left shift T2 3 DWs - pxor \TMP2, \XMM5 - - # second phase of reduction - - movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 - movdqa \XMM5,\TMP3 - movdqa \XMM5,\TMP4 - psrld $1, \TMP2 # packed left shift >>1 - psrld $2, \TMP3 # packed left shift >>2 - psrld $7, \TMP4 # packed left shift >>7 - pxor \TMP3,\TMP2 # xor the shifted versions - pxor \TMP4,\TMP2 - pxor \TMP5, \TMP2 - pxor \TMP2, \XMM5 - pxor \TMP1, \XMM5 # result is in TMP1 - - pxor \XMM5, \XMM1 -.endm - -/* GHASH the last 4 ciphertext blocks. */ -.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ -TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst - - # Multiply TMP6 * HashKey (using Karatsuba) - - movdqa \XMM1, \TMP6 - pshufd $78, \XMM1, \TMP2 - pxor \XMM1, \TMP2 - movdqu HashKey_4(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0 - movdqu HashKey_4_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movdqa \XMM1, \XMMDst - movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 - - # Multiply TMP1 * HashKey (using Karatsuba) - - movdqa \XMM2, \TMP1 - pshufd $78, \XMM2, \TMP2 - pxor \XMM2, \TMP2 - movdqu HashKey_3(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0 - movdqu HashKey_3_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - pxor \TMP1, \TMP6 - pxor \XMM2, \XMMDst - pxor \TMP2, \XMM1 -# results accumulated in TMP6, XMMDst, XMM1 - - # Multiply TMP1 * HashKey (using Karatsuba) - - movdqa \XMM3, \TMP1 - pshufd $78, \XMM3, \TMP2 - pxor \XMM3, \TMP2 - movdqu HashKey_2(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0 - movdqu HashKey_2_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - pxor \TMP1, \TMP6 - pxor \XMM3, \XMMDst - pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 - - # Multiply TMP1 * HashKey (using Karatsuba) - movdqa \XMM4, \TMP1 - pshufd $78, \XMM4, \TMP2 - pxor \XMM4, \TMP2 - movdqu HashKey(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0 - movdqu HashKey_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - pxor \TMP1, \TMP6 - pxor \XMM4, \XMMDst - pxor \XMM1, \TMP2 - pxor \TMP6, \TMP2 - pxor \XMMDst, \TMP2 - # middle section of the temp results combined as in karatsuba algorithm - movdqa \TMP2, \TMP4 - pslldq $8, \TMP4 # left shift TMP4 2 DWs - psrldq $8, \TMP2 # right shift TMP2 2 DWs - pxor \TMP4, \XMMDst - pxor \TMP2, \TMP6 -# TMP6:XMMDst holds the result of the accumulated carry-less multiplications - # first phase of the reduction - movdqa \XMMDst, \TMP2 - movdqa \XMMDst, \TMP3 - movdqa \XMMDst, \TMP4 -# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently - pslld $31, \TMP2 # packed right shifting << 31 - pslld $30, \TMP3 # packed right shifting << 30 - pslld $25, \TMP4 # packed right shifting << 25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP7 - psrldq $4, \TMP7 # right shift TMP7 1 DW - pslldq $12, \TMP2 # left shift TMP2 3 DWs - pxor \TMP2, \XMMDst - - # second phase of the reduction - movdqa \XMMDst, \TMP2 - # make 3 copies of XMMDst for doing 3 shift operations - movdqa \XMMDst, \TMP3 - movdqa 
\XMMDst, \TMP4 - psrld $1, \TMP2 # packed left shift >> 1 - psrld $2, \TMP3 # packed left shift >> 2 - psrld $7, \TMP4 # packed left shift >> 7 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - pxor \TMP7, \TMP2 - pxor \TMP2, \XMMDst - pxor \TMP6, \XMMDst # reduced result is in XMMDst -.endm - - -/* Encryption of a single block -* uses eax & r10 -*/ - -.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 - - pxor (%arg1), \XMM0 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - add $5,%eax # 128->9, 192->11, 256->13 - lea 16(%arg1), %r10 # get first expanded key address - -_esb_loop_\@: - MOVADQ (%r10),\TMP1 - aesenc \TMP1,\XMM0 - add $16,%r10 - sub $1,%eax - jnz _esb_loop_\@ - - MOVADQ (%r10),\TMP1 - aesenclast \TMP1,\XMM0 -.endm - -/***************************************************************************** -* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) -* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) -* // concatenated with 0x00000001. 16-byte aligned pointer. -* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. -* const u8 *aad, // Additional Authentication Data (AAD) -* u64 aad_len) // Length of AAD in bytes. -*/ -SYM_FUNC_START(aesni_gcm_init) - FUNC_SAVE - GCM_INIT %arg3, %arg4,%arg5, %arg6 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_init) - -/***************************************************************************** -* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *out, // Ciphertext output. Encrypt in-place is allowed. -* const u8 *in, // Plaintext input -* u64 plaintext_len, // Length of data in bytes for encryption. -*/ -SYM_FUNC_START(aesni_gcm_enc_update) - FUNC_SAVE - GCM_ENC_DEC enc - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_enc_update) - -/***************************************************************************** -* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *out, // Ciphertext output. Encrypt in-place is allowed. -* const u8 *in, // Plaintext input -* u64 plaintext_len, // Length of data in bytes for encryption. -*/ -SYM_FUNC_START(aesni_gcm_dec_update) - FUNC_SAVE - GCM_ENC_DEC dec - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_dec_update) - -/***************************************************************************** -* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *auth_tag, // Authenticated Tag output. -* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), -* // 12 or 8. 
-*/ -SYM_FUNC_START(aesni_gcm_finalize) - FUNC_SAVE - GCM_COMPLETE %arg3 %arg4 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_finalize) - -#endif - SYM_FUNC_START_LOCAL(_key_expansion_256a) pshufd $0b11111111, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S deleted file mode 100644 index 8c9749ed0651..000000000000 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ /dev/null @@ -1,2804 +0,0 @@ -######################################################################## -# Copyright (c) 2013, Intel Corporation -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# * Neither the name of the Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# -# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR -# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -######################################################################## -## -## Authors: -## Erdinc Ozturk -## Vinodh Gopal -## James Guilford -## Tim Chen -## -## References: -## This code was derived and highly optimized from the code described in paper: -## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation -## on Intel Architecture Processors. August, 2010 -## The details of the implementation is explained in: -## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode -## on Intel Architecture Processors. October, 2012. 
-## -## Assumptions: -## -## -## -## iv: -## 0 1 2 3 -## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | Salt (From the SA) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | Initialization Vector | -## | (This is the sequence number from IPSec header) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 0x1 | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## -## -## -## AAD: -## AAD padded to 128 bits with 0 -## for example, assume AAD is a u32 vector -## -## if AAD is 8 bytes: -## AAD[3] = {A0, A1}# -## padded AAD in xmm register = {A1 A0 0 0} -## -## 0 1 2 3 -## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | SPI (A1) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 32-bit Sequence Number (A0) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 0x0 | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## -## AAD Format with 32-bit Sequence Number -## -## if AAD is 12 bytes: -## AAD[3] = {A0, A1, A2}# -## padded AAD in xmm register = {A2 A1 A0 0} -## -## 0 1 2 3 -## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | SPI (A2) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 64-bit Extended Sequence Number {A1,A0} | -## | | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 0x0 | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## -## AAD Format with 64-bit Extended Sequence Number -## -## -## aadLen: -## from the definition of the spec, aadLen can only be 8 or 12 bytes. -## The code additionally supports aadLen of length 16 bytes. -## -## TLen: -## from the definition of the spec, TLen can only be 8, 12 or 16 bytes. -## -## poly = x^128 + x^127 + x^126 + x^121 + 1 -## throughout the code, one tab and two tab indentations are used. one tab is -## for GHASH part, two tabs is for AES part. -## - -#include - -# constants in mergeable sections, linker can reorder and merge -.section .rodata.cst16.POLY, "aM", @progbits, 16 -.align 16 -POLY: .octa 0xC2000000000000000000000000000001 - -.section .rodata.cst16.POLY2, "aM", @progbits, 16 -.align 16 -POLY2: .octa 0xC20000000000000000000001C2000000 - -.section .rodata.cst16.TWOONE, "aM", @progbits, 16 -.align 16 -TWOONE: .octa 0x00000001000000000000000000000001 - -.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 -.align 16 -SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F - -.section .rodata.cst16.ONE, "aM", @progbits, 16 -.align 16 -ONE: .octa 0x00000000000000000000000000000001 - -.section .rodata.cst16.ONEf, "aM", @progbits, 16 -.align 16 -ONEf: .octa 0x01000000000000000000000000000000 - -# order of these constants should not change. 
-# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F -.section .rodata, "a", @progbits -.align 16 -SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 -ALL_F: .octa 0xffffffffffffffffffffffffffffffff - .octa 0x00000000000000000000000000000000 - -.text - - -#define AadHash 16*0 -#define AadLen 16*1 -#define InLen (16*1)+8 -#define PBlockEncKey 16*2 -#define OrigIV 16*3 -#define CurCount 16*4 -#define PBlockLen 16*5 - -HashKey = 16*6 # store HashKey <<1 mod poly here -HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here -HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here -HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here -HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here -HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here -HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here -HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here -HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) -HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) -HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) -HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) -HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) -HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) -HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) -HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) - -#define arg1 %rdi -#define arg2 %rsi -#define arg3 %rdx -#define arg4 %rcx -#define arg5 %r8 -#define arg6 %r9 -#define keysize 2*15*16(arg1) - -i = 0 -j = 0 - -out_order = 0 -in_order = 1 -DEC = 0 -ENC = 1 - -.macro define_reg r n -reg_\r = %xmm\n -.endm - -.macro setreg -.altmacro -define_reg i %i -define_reg j %j -.noaltmacro -.endm - -TMP1 = 16*0 # Temporary storage for AAD -TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register) -TMP3 = 16*2 # Temporary storage for AES State 3 -TMP4 = 16*3 # Temporary storage for AES State 4 -TMP5 = 16*4 # Temporary storage for AES State 5 -TMP6 = 16*5 # Temporary storage for AES State 6 -TMP7 = 16*6 # Temporary storage for AES State 7 -TMP8 = 16*7 # Temporary storage for AES State 8 - -VARIABLE_OFFSET = 16*8 - -################################ -# Utility Macros -################################ - -.macro FUNC_SAVE - push %r12 - push %r13 - push %r15 - - push %rbp - mov %rsp, %rbp - - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp # align rsp to 64 bytes -.endm - -.macro FUNC_RESTORE - mov %rbp, %rsp - pop %rbp - - pop %r15 - pop %r13 - pop %r12 -.endm - -# Encryption of a single block -.macro ENCRYPT_SINGLE_BLOCK REP XMM0 - vpxor (arg1), \XMM0, \XMM0 - i = 1 - setreg -.rep \REP - vaesenc 16*i(arg1), \XMM0, \XMM0 - i = (i+1) - setreg -.endr - vaesenclast 16*i(arg1), \XMM0, \XMM0 -.endm - -# combined for GCM encrypt and decrypt functions -# clobbering all xmm registers -# clobbering r10, r11, r12, r13, r15, rax -.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP - vmovdqu AadHash(arg2), %xmm8 - vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey - add arg5, InLen(arg2) - - # initialize the data pointer offset as zero - xor %r11d, %r11d - - PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC - sub %r11, arg5 - - mov arg5, %r13 # save the number of bytes of plaintext/ciphertext - and $-16, %r13 # r13 = r13 - 
(r13 mod 16) - - mov %r13, %r12 - shr $4, %r12 - and $7, %r12 - jz .L_initial_num_blocks_is_0\@ - - cmp $7, %r12 - je .L_initial_num_blocks_is_7\@ - cmp $6, %r12 - je .L_initial_num_blocks_is_6\@ - cmp $5, %r12 - je .L_initial_num_blocks_is_5\@ - cmp $4, %r12 - je .L_initial_num_blocks_is_4\@ - cmp $3, %r12 - je .L_initial_num_blocks_is_3\@ - cmp $2, %r12 - je .L_initial_num_blocks_is_2\@ - - jmp .L_initial_num_blocks_is_1\@ - -.L_initial_num_blocks_is_7\@: - \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*7, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_6\@: - \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*6, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_5\@: - \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*5, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_4\@: - \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*4, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_3\@: - \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*3, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_2\@: - \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*2, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_1\@: - \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*1, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_0\@: - \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - - -.L_initial_blocks_encrypted\@: - test %r13, %r13 - je .L_zero_cipher_left\@ - - sub $128, %r13 - je .L_eight_cipher_left\@ - - - - - vmovd %xmm9, %r15d - and $255, %r15d - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - - -.L_encrypt_by_8_new\@: - cmp $(255-8), %r15d - jg .L_encrypt_by_8\@ - - - - add $8, %r15b - \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC - add $128, %r11 - sub $128, %r13 - jne .L_encrypt_by_8_new\@ - - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - jmp .L_eight_cipher_left\@ - -.L_encrypt_by_8\@: - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $8, %r15b - \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $128, %r11 - sub $128, %r13 - jne .L_encrypt_by_8_new\@ - - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - - - - -.L_eight_cipher_left\@: - \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 - - -.L_zero_cipher_left\@: - vmovdqu %xmm14, AadHash(arg2) - vmovdqu %xmm9, 
CurCount(arg2) - - # check for 0 length - mov arg5, %r13 - and $15, %r13 # r13 = (arg5 mod 16) - - je .L_multiple_of_16_bytes\@ - - # handle the last <16 Byte block separately - - mov %r13, PBlockLen(arg2) - - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn - vmovdqu %xmm9, CurCount(arg2) - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - - ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) - vmovdqu %xmm9, PBlockEncKey(arg2) - - cmp $16, arg5 - jge .L_large_enough_update\@ - - lea (arg4,%r11,1), %r10 - mov %r13, %r12 - - READ_PARTIAL_BLOCK %r10 %r12 %xmm1 - - lea SHIFT_MASK+16(%rip), %r12 - sub %r13, %r12 # adjust the shuffle mask pointer to be - # able to shift 16-r13 bytes (r13 is the - # number of bytes in plaintext mod 16) - - jmp .L_final_ghash_mul\@ - -.L_large_enough_update\@: - sub $16, %r11 - add %r13, %r11 - - # receive the last <16 Byte block - vmovdqu (arg4, %r11, 1), %xmm1 - - sub %r13, %r11 - add $16, %r11 - - lea SHIFT_MASK+16(%rip), %r12 - # adjust the shuffle mask pointer to be able to shift 16-r13 bytes - # (r13 is the number of bytes in plaintext mod 16) - sub %r13, %r12 - # get the appropriate shuffle mask - vmovdqu (%r12), %xmm2 - # shift right 16-r13 bytes - vpshufb %xmm2, %xmm1, %xmm1 - -.L_final_ghash_mul\@: - .if \ENC_DEC == DEC - vmovdqa %xmm1, %xmm2 - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to - # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm2, %xmm2 - vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 - vpxor %xmm2, %xmm14, %xmm14 - - vmovdqu %xmm14, AadHash(arg2) - .else - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to - # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - vpxor %xmm9, %xmm14, %xmm14 - - vmovdqu %xmm14, AadHash(arg2) - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext - .endif - - - ############################# - # output r13 Bytes - vmovq %xmm9, %rax - cmp $8, %r13 - jle .L_less_than_8_bytes_left\@ - - mov %rax, (arg3 , %r11) - add $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - vmovq %xmm9, %rax - sub $8, %r13 - -.L_less_than_8_bytes_left\@: - movb %al, (arg3 , %r11) - add $1, %r11 - shr $8, %rax - sub $1, %r13 - jne .L_less_than_8_bytes_left\@ - ############################# - -.L_multiple_of_16_bytes\@: -.endm - - -# GCM_COMPLETE Finishes update of tag of last partial block -# Output: Authorization Tag (AUTH_TAG) -# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 -.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN - vmovdqu AadHash(arg2), %xmm14 - vmovdqu HashKey(arg2), %xmm13 - - mov PBlockLen(arg2), %r12 - test %r12, %r12 - je .L_partial_done\@ - - #GHASH computation for the last <16 Byte block - \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - -.L_partial_done\@: - mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes) - shl $3, %r12 # convert into number of bits - vmovd %r12d, %xmm15 # len(A) in xmm15 - - mov InLen(arg2), %r12 - shl $3, %r12 # len(C) in bits (*128) - vmovq %r12, %xmm1 - vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 - vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) - - vpxor %xmm15, %xmm14, %xmm14 - \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation - vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap - - vmovdqu 
OrigIV(arg2), %xmm9 - - ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0) - - vpxor %xmm14, %xmm9, %xmm9 - - - -.L_return_T\@: - mov \AUTH_TAG, %r10 # r10 = authTag - mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len - - cmp $16, %r11 - je .L_T_16\@ - - cmp $8, %r11 - jl .L_T_4\@ - -.L_T_8\@: - vmovq %xmm9, %rax - mov %rax, (%r10) - add $8, %r10 - sub $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - test %r11, %r11 - je .L_return_T_done\@ -.L_T_4\@: - vmovd %xmm9, %eax - mov %eax, (%r10) - add $4, %r10 - sub $4, %r11 - vpsrldq $4, %xmm9, %xmm9 - test %r11, %r11 - je .L_return_T_done\@ -.L_T_123\@: - vmovd %xmm9, %eax - cmp $2, %r11 - jl .L_T_1\@ - mov %ax, (%r10) - cmp $2, %r11 - je .L_return_T_done\@ - add $2, %r10 - sar $16, %eax -.L_T_1\@: - mov %al, (%r10) - jmp .L_return_T_done\@ - -.L_T_16\@: - vmovdqu %xmm9, (%r10) - -.L_return_T_done\@: -.endm - -.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8 - - mov \AAD, %r10 # r10 = AAD - mov \AADLEN, %r12 # r12 = aadLen - - - mov %r12, %r11 - - vpxor \T8, \T8, \T8 - vpxor \T7, \T7, \T7 - cmp $16, %r11 - jl .L_get_AAD_rest8\@ -.L_get_AAD_blocks\@: - vmovdqu (%r10), \T7 - vpshufb SHUF_MASK(%rip), \T7, \T7 - vpxor \T7, \T8, \T8 - \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6 - add $16, %r10 - sub $16, %r12 - sub $16, %r11 - cmp $16, %r11 - jge .L_get_AAD_blocks\@ - vmovdqu \T8, \T7 - test %r11, %r11 - je .L_get_AAD_done\@ - - vpxor \T7, \T7, \T7 - - /* read the last <16B of AAD. since we have at least 4B of - data right after the AAD (the ICV, and maybe some CT), we can - read 4B/8B blocks safely, and then get rid of the extra stuff */ -.L_get_AAD_rest8\@: - cmp $4, %r11 - jle .L_get_AAD_rest4\@ - movq (%r10), \T1 - add $8, %r10 - sub $8, %r11 - vpslldq $8, \T1, \T1 - vpsrldq $8, \T7, \T7 - vpxor \T1, \T7, \T7 - jmp .L_get_AAD_rest8\@ -.L_get_AAD_rest4\@: - test %r11, %r11 - jle .L_get_AAD_rest0\@ - mov (%r10), %eax - movq %rax, \T1 - add $4, %r10 - sub $4, %r11 - vpslldq $12, \T1, \T1 - vpsrldq $4, \T7, \T7 - vpxor \T1, \T7, \T7 -.L_get_AAD_rest0\@: - /* finalize: shift out the extra bytes we read, and align - left. 
since pslldq can only shift by an immediate, we use - vpshufb and a pair of shuffle masks */ - leaq ALL_F(%rip), %r11 - subq %r12, %r11 - vmovdqu 16(%r11), \T1 - andq $~3, %r11 - vpshufb (%r11), \T7, \T7 - vpand \T1, \T7, \T7 -.L_get_AAD_rest_final\@: - vpshufb SHUF_MASK(%rip), \T7, \T7 - vpxor \T8, \T7, \T7 - \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6 - -.L_get_AAD_done\@: - vmovdqu \T7, AadHash(arg2) -.endm - -.macro INIT GHASH_MUL PRECOMPUTE - mov arg6, %r11 - mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length - xor %r11d, %r11d - mov %r11, InLen(arg2) # ctx_data.in_length = 0 - - mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0 - mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0 - mov arg3, %rax - movdqu (%rax), %xmm0 - movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv - - vpshufb SHUF_MASK(%rip), %xmm0, %xmm0 - movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv - - vmovdqu (arg4), %xmm6 # xmm6 = HashKey - - vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 - ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey - vmovdqa %xmm6, %xmm2 - vpsllq $1, %xmm6, %xmm6 - vpsrlq $63, %xmm2, %xmm2 - vmovdqa %xmm2, %xmm1 - vpslldq $8, %xmm2, %xmm2 - vpsrldq $8, %xmm1, %xmm1 - vpor %xmm2, %xmm6, %xmm6 - #reduction - vpshufd $0b00100100, %xmm1, %xmm2 - vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 - vpand POLY(%rip), %xmm2, %xmm2 - vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly - ####################################################################### - vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly - - CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 - - \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 -.endm - - -# Reads DLEN bytes starting at DPTR and stores in XMMDst -# where 0 < DLEN < 16 -# Clobbers %rax, DLEN -.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst - vpxor \XMMDst, \XMMDst, \XMMDst - - cmp $8, \DLEN - jl .L_read_lt8_\@ - mov (\DPTR), %rax - vpinsrq $0, %rax, \XMMDst, \XMMDst - sub $8, \DLEN - jz .L_done_read_partial_block_\@ - xor %eax, %eax -.L_read_next_byte_\@: - shl $8, %rax - mov 7(\DPTR, \DLEN, 1), %al - dec \DLEN - jnz .L_read_next_byte_\@ - vpinsrq $1, %rax, \XMMDst, \XMMDst - jmp .L_done_read_partial_block_\@ -.L_read_lt8_\@: - xor %eax, %eax -.L_read_next_byte_lt8_\@: - shl $8, %rax - mov -1(\DPTR, \DLEN, 1), %al - dec \DLEN - jnz .L_read_next_byte_lt8_\@ - vpinsrq $0, %rax, \XMMDst, \XMMDst -.L_done_read_partial_block_\@: -.endm - -# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks -# between update calls. 
-# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK -# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context -# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 -.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ - AAD_HASH ENC_DEC - mov PBlockLen(arg2), %r13 - test %r13, %r13 - je .L_partial_block_done_\@ # Leave Macro if no partial blocks - # Read in input data without over reading - cmp $16, \PLAIN_CYPH_LEN - jl .L_fewer_than_16_bytes_\@ - vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm - jmp .L_data_read_\@ - -.L_fewer_than_16_bytes_\@: - lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 - mov \PLAIN_CYPH_LEN, %r12 - READ_PARTIAL_BLOCK %r10 %r12 %xmm1 - - mov PBlockLen(arg2), %r13 - -.L_data_read_\@: # Finished reading in data - - vmovdqu PBlockEncKey(arg2), %xmm9 - vmovdqu HashKey(arg2), %xmm13 - - lea SHIFT_MASK(%rip), %r12 - - # adjust the shuffle mask pointer to be able to shift r13 bytes - # r16-r13 is the number of bytes in plaintext mod 16) - add %r13, %r12 - vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask - vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes - -.if \ENC_DEC == DEC - vmovdqa %xmm1, %xmm3 - pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn) - - mov \PLAIN_CYPH_LEN, %r10 - add %r13, %r10 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling - sub $16, %r10 - # Determine if partial block is not being filled and - # shift mask accordingly - jge .L_no_extra_mask_1_\@ - sub %r10, %r12 -.L_no_extra_mask_1_\@: - - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9 - - vpand %xmm1, %xmm3, %xmm3 - vmovdqa SHUF_MASK(%rip), %xmm10 - vpshufb %xmm10, %xmm3, %xmm3 - vpshufb %xmm2, %xmm3, %xmm3 - vpxor %xmm3, \AAD_HASH, \AAD_HASH - - test %r10, %r10 - jl .L_partial_incomplete_1_\@ - - # GHASH computation for the last <16 Byte block - \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %eax,%eax - - mov %rax, PBlockLen(arg2) - jmp .L_dec_done_\@ -.L_partial_incomplete_1_\@: - add \PLAIN_CYPH_LEN, PBlockLen(arg2) -.L_dec_done_\@: - vmovdqu \AAD_HASH, AadHash(arg2) -.else - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - - mov \PLAIN_CYPH_LEN, %r10 - add %r13, %r10 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling - sub $16, %r10 - # Determine if partial block is not being filled and - # shift mask accordingly - jge .L_no_extra_mask_2_\@ - sub %r10, %r12 -.L_no_extra_mask_2_\@: - - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 - - vmovdqa SHUF_MASK(%rip), %xmm1 - vpshufb %xmm1, %xmm9, %xmm9 - vpshufb %xmm2, %xmm9, %xmm9 - vpxor %xmm9, \AAD_HASH, \AAD_HASH - - test %r10, %r10 - jl .L_partial_incomplete_2_\@ - - # GHASH computation for the last <16 Byte block - \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %eax,%eax - - mov %rax, PBlockLen(arg2) - jmp .L_encode_done_\@ -.L_partial_incomplete_2_\@: - add \PLAIN_CYPH_LEN, PBlockLen(arg2) -.L_encode_done_\@: - vmovdqu \AAD_HASH, AadHash(arg2) - - vmovdqa SHUF_MASK(%rip), %xmm10 - # shuffle xmm9 back to output as ciphertext - vpshufb %xmm10, %xmm9, %xmm9 - vpshufb %xmm2, %xmm9, %xmm9 -.endif - # output encrypted Bytes - test %r10, %r10 - jl .L_partial_fill_\@ - mov %r13, %r12 - mov $16, %r13 - # Set r13 to be the number of bytes to write out - 
sub %r12, %r13 - jmp .L_count_set_\@ -.L_partial_fill_\@: - mov \PLAIN_CYPH_LEN, %r13 -.L_count_set_\@: - vmovdqa %xmm9, %xmm0 - vmovq %xmm0, %rax - cmp $8, %r13 - jle .L_less_than_8_bytes_left_\@ - - mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) - add $8, \DATA_OFFSET - psrldq $8, %xmm0 - vmovq %xmm0, %rax - sub $8, %r13 -.L_less_than_8_bytes_left_\@: - movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) - add $1, \DATA_OFFSET - shr $8, %rax - sub $1, %r13 - jne .L_less_than_8_bytes_left_\@ -.L_partial_block_done_\@: -.endm # PARTIAL_BLOCK - -############################################################################### -# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) -# Input: A and B (128-bits each, bit-reflected) -# Output: C = A*B*x mod poly, (i.e. >>1 ) -# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input -# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. -############################################################################### -.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 - - vpshufd $0b01001110, \GH, \T2 - vpshufd $0b01001110, \HK, \T3 - vpxor \GH , \T2, \T2 # T2 = (a1+a0) - vpxor \HK , \T3, \T3 # T3 = (b1+b0) - - vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 - vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 - vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) - vpxor \GH, \T2,\T2 - vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 - - vpslldq $8, \T2,\T3 # shift-L T3 2 DWs - vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs - vpxor \T3, \GH, \GH - vpxor \T2, \T1, \T1 # = GH x HK - - #first phase of the reduction - vpslld $31, \GH, \T2 # packed right shifting << 31 - vpslld $30, \GH, \T3 # packed right shifting shift << 30 - vpslld $25, \GH, \T4 # packed right shifting shift << 25 - - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpsrldq $4, \T2, \T5 # shift-R T5 1 DW - - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \GH, \GH # first phase of the reduction complete - - #second phase of the reduction - - vpsrld $1,\GH, \T2 # packed left shifting >> 1 - vpsrld $2,\GH, \T3 # packed left shifting >> 2 - vpsrld $7,\GH, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpxor \T5, \T2, \T2 - vpxor \T2, \GH, \GH - vpxor \T1, \GH, \GH # the result is in GH - - -.endm - -.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 - - # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vmovdqa \HK, \T5 - - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly - vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_2_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly - vmovdqu \T5, HashKey_3(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_3_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly - vmovdqu \T5, HashKey_4(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_4_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly - vmovdqu \T5, HashKey_5(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_5_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly - vmovdqu \T5, HashKey_6(arg2) - vpshufd $0b01001110, \T5, \T1 - 
vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_6_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly - vmovdqu \T5, HashKey_7(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_7_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly - vmovdqu \T5, HashKey_8(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_8_k(arg2) - -.endm - -## if a = number of total plaintext bytes -## b = floor(a/16) -## num_initial_blocks = b mod 4# -## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext -## r10, r11, r12, rax are clobbered -## arg1, arg2, arg3, arg4 are used as pointers only, not modified - -.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC - i = (8-\num_initial_blocks) - setreg - vmovdqu AadHash(arg2), reg_i - - # start AES for num_initial_blocks blocks - vmovdqu CurCount(arg2), \CTR - - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, reg_i - vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap - i = (i+1) - setreg -.endr - - vmovdqa (arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpxor \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - j = 1 - setreg -.rep \REP - vmovdqa 16*j(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenc \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - j = (j+1) - setreg -.endr - - vmovdqa 16*j(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenclast \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vmovdqu (arg4, %r11), \T1 - vpxor \T1, reg_i, reg_i - vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks - add $16, %r11 -.if \ENC_DEC == DEC - vmovdqa \T1, reg_i -.endif - vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations - i = (i+1) - setreg -.endr - - - i = (8-\num_initial_blocks) - j = (9-\num_initial_blocks) - setreg - -.rep \num_initial_blocks - vpxor reg_i, reg_j, reg_j - GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks - i = (i+1) - j = (j+1) - setreg -.endr - # XMM8 has the combined result here - - vmovdqa \XMM8, TMP1(%rsp) - vmovdqa \XMM8, \T3 - - cmp $128, %r13 - jl .L_initial_blocks_done\@ # no need for precomputed constants - -############################################################################### -# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM1 - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM2 - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM3 - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM4 - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM5 - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM6 - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # 
INCR Y0 - vmovdqa \CTR, \XMM7 - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM8 - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - - vmovdqa (arg1), \T_key - vpxor \T_key, \XMM1, \XMM1 - vpxor \T_key, \XMM2, \XMM2 - vpxor \T_key, \XMM3, \XMM3 - vpxor \T_key, \XMM4, \XMM4 - vpxor \T_key, \XMM5, \XMM5 - vpxor \T_key, \XMM6, \XMM6 - vpxor \T_key, \XMM7, \XMM7 - vpxor \T_key, \XMM8, \XMM8 - - i = 1 - setreg -.rep \REP # do REP rounds - vmovdqa 16*i(arg1), \T_key - vaesenc \T_key, \XMM1, \XMM1 - vaesenc \T_key, \XMM2, \XMM2 - vaesenc \T_key, \XMM3, \XMM3 - vaesenc \T_key, \XMM4, \XMM4 - vaesenc \T_key, \XMM5, \XMM5 - vaesenc \T_key, \XMM6, \XMM6 - vaesenc \T_key, \XMM7, \XMM7 - vaesenc \T_key, \XMM8, \XMM8 - i = (i+1) - setreg -.endr - - vmovdqa 16*i(arg1), \T_key - vaesenclast \T_key, \XMM1, \XMM1 - vaesenclast \T_key, \XMM2, \XMM2 - vaesenclast \T_key, \XMM3, \XMM3 - vaesenclast \T_key, \XMM4, \XMM4 - vaesenclast \T_key, \XMM5, \XMM5 - vaesenclast \T_key, \XMM6, \XMM6 - vaesenclast \T_key, \XMM7, \XMM7 - vaesenclast \T_key, \XMM8, \XMM8 - - vmovdqu (arg4, %r11), \T1 - vpxor \T1, \XMM1, \XMM1 - vmovdqu \XMM1, (arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM1 - .endif - - vmovdqu 16*1(arg4, %r11), \T1 - vpxor \T1, \XMM2, \XMM2 - vmovdqu \XMM2, 16*1(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM2 - .endif - - vmovdqu 16*2(arg4, %r11), \T1 - vpxor \T1, \XMM3, \XMM3 - vmovdqu \XMM3, 16*2(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM3 - .endif - - vmovdqu 16*3(arg4, %r11), \T1 - vpxor \T1, \XMM4, \XMM4 - vmovdqu \XMM4, 16*3(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM4 - .endif - - vmovdqu 16*4(arg4, %r11), \T1 - vpxor \T1, \XMM5, \XMM5 - vmovdqu \XMM5, 16*4(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM5 - .endif - - vmovdqu 16*5(arg4, %r11), \T1 - vpxor \T1, \XMM6, \XMM6 - vmovdqu \XMM6, 16*5(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM6 - .endif - - vmovdqu 16*6(arg4, %r11), \T1 - vpxor \T1, \XMM7, \XMM7 - vmovdqu \XMM7, 16*6(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM7 - .endif - - vmovdqu 16*7(arg4, %r11), \T1 - vpxor \T1, \XMM8, \XMM8 - vmovdqu \XMM8, 16*7(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM8 - .endif - - add $128, %r11 - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - -############################################################################### - -.L_initial_blocks_done\@: - -.endm - -# encrypt 8 blocks at a time -# ghash the 8 previously encrypted ciphertext blocks -# arg1, arg2, arg3, arg4 are used as pointers only, not modified -# r11 is the data offset value -.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC - - vmovdqa \XMM1, \T2 - vmovdqa \XMM2, TMP2(%rsp) - vmovdqa \XMM3, TMP3(%rsp) - vmovdqa \XMM4, TMP4(%rsp) - vmovdqa \XMM5, TMP5(%rsp) - vmovdqa \XMM6, TMP6(%rsp) - vmovdqa \XMM7, 
TMP7(%rsp) - vmovdqa \XMM8, TMP8(%rsp) - -.if \loop_idx == in_order - vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONE(%rip), \XMM1, \XMM2 - vpaddd ONE(%rip), \XMM2, \XMM3 - vpaddd ONE(%rip), \XMM3, \XMM4 - vpaddd ONE(%rip), \XMM4, \XMM5 - vpaddd ONE(%rip), \XMM5, \XMM6 - vpaddd ONE(%rip), \XMM6, \XMM7 - vpaddd ONE(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap -.else - vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONEf(%rip), \XMM1, \XMM2 - vpaddd ONEf(%rip), \XMM2, \XMM3 - vpaddd ONEf(%rip), \XMM3, \XMM4 - vpaddd ONEf(%rip), \XMM4, \XMM5 - vpaddd ONEf(%rip), \XMM5, \XMM6 - vpaddd ONEf(%rip), \XMM6, \XMM7 - vpaddd ONEf(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR -.endif - - - ####################################################################### - - vmovdqu (arg1), \T1 - vpxor \T1, \XMM1, \XMM1 - vpxor \T1, \XMM2, \XMM2 - vpxor \T1, \XMM3, \XMM3 - vpxor \T1, \XMM4, \XMM4 - vpxor \T1, \XMM5, \XMM5 - vpxor \T1, \XMM6, \XMM6 - vpxor \T1, \XMM7, \XMM7 - vpxor \T1, \XMM8, \XMM8 - - ####################################################################### - - - - - - vmovdqu 16*1(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqu 16*2(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - - ####################################################################### - - vmovdqu HashKey_8(arg2), \T5 - vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 - vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 - - vpshufd $0b01001110, \T2, \T6 - vpxor \T2, \T6, \T6 - - vmovdqu HashKey_8_k(arg2), \T5 - vpclmulqdq $0x00, \T5, \T6, \T6 - - vmovdqu 16*3(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP2(%rsp), \T1 - vmovdqu HashKey_7(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_7_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*4(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - ####################################################################### - - vmovdqa TMP3(%rsp), \T1 - vmovdqu HashKey_6(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - 
vmovdqu HashKey_6_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*5(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP4(%rsp), \T1 - vmovdqu HashKey_5(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_5_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*6(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - - vmovdqa TMP5(%rsp), \T1 - vmovdqu HashKey_4(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_4_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*7(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP6(%rsp), \T1 - vmovdqu HashKey_3(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_3_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - - vmovdqu 16*8(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP7(%rsp), \T1 - vmovdqu HashKey_2(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_2_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - ####################################################################### - - vmovdqu 16*9(arg1), \T5 - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 - - vmovdqa TMP8(%rsp), \T1 - vmovdqu HashKey(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vpxor \T4, \T6, \T6 - vpxor \T7, \T6, \T6 - - vmovdqu 16*10(arg1), \T5 - - i = 11 - setreg -.rep (\REP-9) - - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 - - vmovdqu 16*i(arg1), \T5 - i = i + 1 - setreg -.endr - - i = 0 - j = 1 - setreg -.rep 8 - vpxor 16*i(arg4, %r11), \T5, \T2 - .if \ENC_DEC == ENC - vaesenclast \T2, reg_j, reg_j - .else - vaesenclast \T2, reg_j, 
\T3 - vmovdqu 16*i(arg4, %r11), reg_j - vmovdqu \T3, 16*i(arg3, %r11) - .endif - i = (i+1) - j = (j+1) - setreg -.endr - ####################################################################### - - - vpslldq $8, \T6, \T3 # shift-L T3 2 DWs - vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs - vpxor \T3, \T7, \T7 - vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 - - - - ####################################################################### - #first phase of the reduction - ####################################################################### - vpslld $31, \T7, \T2 # packed right shifting << 31 - vpslld $30, \T7, \T3 # packed right shifting shift << 30 - vpslld $25, \T7, \T4 # packed right shifting shift << 25 - - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpsrldq $4, \T2, \T1 # shift-R T1 1 DW - - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - .if \ENC_DEC == ENC - vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer - .endif - - ####################################################################### - #second phase of the reduction - vpsrld $1, \T7, \T2 # packed left shifting >> 1 - vpsrld $2, \T7, \T3 # packed left shifting >> 2 - vpsrld $7, \T7, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpxor \T1, \T2, \T2 - vpxor \T2, \T7, \T7 - vpxor \T7, \T6, \T6 # the result is in T6 - ####################################################################### - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - - - vpxor \T6, \XMM1, \XMM1 - - - -.endm - - -# GHASH the last 4 ciphertext blocks. 
-.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 - - ## Karatsuba Method - - - vpshufd $0b01001110, \XMM1, \T2 - vpxor \XMM1, \T2, \T2 - vmovdqu HashKey_8(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM1, \T6 - vpclmulqdq $0x00, \T5, \XMM1, \T7 - - vmovdqu HashKey_8_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM2, \T2 - vpxor \XMM2, \T2, \T2 - vmovdqu HashKey_7(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM2, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM2, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_7_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM3, \T2 - vpxor \XMM3, \T2, \T2 - vmovdqu HashKey_6(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM3, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM3, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_6_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM4, \T2 - vpxor \XMM4, \T2, \T2 - vmovdqu HashKey_5(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM4, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM4, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_5_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM5, \T2 - vpxor \XMM5, \T2, \T2 - vmovdqu HashKey_4(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM5, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM5, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_4_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM6, \T2 - vpxor \XMM6, \T2, \T2 - vmovdqu HashKey_3(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM6, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM6, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_3_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM7, \T2 - vpxor \XMM7, \T2, \T2 - vmovdqu HashKey_2(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM7, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM7, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_2_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM8, \T2 - vpxor \XMM8, \T2, \T2 - vmovdqu HashKey(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM8, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM8, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - vpxor \T6, \XMM1, \XMM1 - vpxor \T7, \XMM1, \T2 - - - - - vpslldq $8, \T2, \T4 - vpsrldq $8, \T2, \T2 - - vpxor \T4, \T7, \T7 - vpxor \T2, \T6, \T6 # holds the result of - # the accumulated carry-less multiplications - - ####################################################################### - #first phase of the reduction - vpslld $31, \T7, \T2 # packed right shifting << 31 - vpslld $30, \T7, \T3 # packed right shifting shift << 30 - vpslld $25, \T7, \T4 # packed right shifting shift << 25 - - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpsrldq $4, \T2, \T1 # shift-R T1 1 DW - - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - - - #second phase of the reduction - vpsrld $1, \T7, \T2 # packed left shifting >> 1 - vpsrld 
$2, \T7, \T3 # packed left shifting >> 2 - vpsrld $7, \T7, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpxor \T1, \T2, \T2 - vpxor \T2, \T7, \T7 - vpxor \T7, \T6, \T6 # the result is in T6 - -.endm - -############################################################# -#void aesni_gcm_precomp_avx_gen2 -# (gcm_data *my_ctx_data, -# gcm_context_data *data, -# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ -# u8 *iv, /* Pre-counter block j0: 4 byte salt -# (from Security Association) concatenated with 8 byte -# Initialisation Vector (from IPSec ESP Payload) -# concatenated with 0x00000001. 16-byte aligned pointer. */ -# const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ -############################################################# -SYM_FUNC_START(aesni_gcm_init_avx_gen2) - FUNC_SAVE - INIT GHASH_MUL_AVX, PRECOMPUTE_AVX - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_init_avx_gen2) - -############################################################################### -#void aesni_gcm_enc_update_avx_gen2( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ -# const u8 *in, /* Plaintext input */ -# u64 plaintext_len) /* Length of data in Bytes for encryption. */ -############################################################################### -SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2) - FUNC_SAVE - mov keysize, %eax - cmp $32, %eax - je key_256_enc_update - cmp $16, %eax - je key_128_enc_update - # must be 192 - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11 - FUNC_RESTORE - RET -key_128_enc_update: - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9 - FUNC_RESTORE - RET -key_256_enc_update: - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2) - -############################################################################### -#void aesni_gcm_dec_update_avx_gen2( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ -# const u8 *in, /* Ciphertext input */ -# u64 plaintext_len) /* Length of data in Bytes for encryption. */ -############################################################################### -SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_dec_update - cmp $16, %eax - je key_128_dec_update - # must be 192 - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11 - FUNC_RESTORE - RET -key_128_dec_update: - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9 - FUNC_RESTORE - RET -key_256_dec_update: - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2) - -############################################################################### -#void aesni_gcm_finalize_avx_gen2( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *auth_tag, /* Authenticated Tag output. 
*/ -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. -# Valid values are 16 (most likely), 12 or 8. */ -############################################################################### -SYM_FUNC_START(aesni_gcm_finalize_avx_gen2) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_finalize - cmp $16, %eax - je key_128_finalize - # must be 192 - GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4 - FUNC_RESTORE - RET -key_128_finalize: - GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4 - FUNC_RESTORE - RET -key_256_finalize: - GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) - -############################################################################### -# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) -# Input: A and B (128-bits each, bit-reflected) -# Output: C = A*B*x mod poly, (i.e. >>1 ) -# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input -# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. -############################################################################### -.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5 - - vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1 - vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0 - vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0 - vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1 - vpxor \T3, \GH, \GH - - - vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs - vpslldq $8 , \GH, \GH # shift-L GH 2 DWs - - vpxor \T3, \T1, \T1 - vpxor \T2, \GH, \GH - - ####################################################################### - #first phase of the reduction - vmovdqa POLY2(%rip), \T3 - - vpclmulqdq $0x01, \GH, \T3, \T2 - vpslldq $8, \T2, \T2 # shift-L T2 2 DWs - - vpxor \T2, \GH, \GH # first phase of the reduction complete - ####################################################################### - #second phase of the reduction - vpclmulqdq $0x00, \GH, \T3, \T2 - vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) - - vpclmulqdq $0x10, \GH, \T3, \GH - vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts) - - vpxor \T2, \GH, \GH # second phase of the reduction complete - ####################################################################### - vpxor \T1, \GH, \GH # the result is in GH - - -.endm - -.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6 - - # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vmovdqa \HK, \T5 - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly - vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly - vmovdqu \T5, HashKey_3(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly - vmovdqu \T5, HashKey_4(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly - vmovdqu \T5, HashKey_5(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly - vmovdqu \T5, HashKey_6(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly - vmovdqu \T5, HashKey_7(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly - vmovdqu \T5, HashKey_8(arg2) - -.endm - -## if a = number of total plaintext bytes -## b = floor(a/16) -## num_initial_blocks = b mod 4# -## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext -## r10, r11, r12, rax are clobbered -## arg1, arg2, arg3, arg4 are used 
as pointers only, not modified - -.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER - i = (8-\num_initial_blocks) - setreg - vmovdqu AadHash(arg2), reg_i - - # start AES for num_initial_blocks blocks - vmovdqu CurCount(arg2), \CTR - - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, reg_i - vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap - i = (i+1) - setreg -.endr - - vmovdqa (arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpxor \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - j = 1 - setreg -.rep \REP - vmovdqa 16*j(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenc \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - j = (j+1) - setreg -.endr - - - vmovdqa 16*j(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenclast \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vmovdqu (arg4, %r11), \T1 - vpxor \T1, reg_i, reg_i - vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for - # num_initial_blocks blocks - add $16, %r11 -.if \ENC_DEC == DEC - vmovdqa \T1, reg_i -.endif - vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations - i = (i+1) - setreg -.endr - - - i = (8-\num_initial_blocks) - j = (9-\num_initial_blocks) - setreg - -.rep \num_initial_blocks - vpxor reg_i, reg_j, reg_j - GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks - i = (i+1) - j = (j+1) - setreg -.endr - # XMM8 has the combined result here - - vmovdqa \XMM8, TMP1(%rsp) - vmovdqa \XMM8, \T3 - - cmp $128, %r13 - jl .L_initial_blocks_done\@ # no need for precomputed constants - -############################################################################### -# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM1 - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM2 - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM3 - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM4 - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM5 - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM6 - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM7 - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM8 - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - - vmovdqa (arg1), \T_key - vpxor \T_key, \XMM1, \XMM1 - vpxor \T_key, \XMM2, \XMM2 - vpxor \T_key, \XMM3, \XMM3 - vpxor \T_key, \XMM4, \XMM4 - vpxor \T_key, \XMM5, \XMM5 - vpxor \T_key, \XMM6, \XMM6 - vpxor \T_key, \XMM7, \XMM7 - vpxor \T_key, \XMM8, \XMM8 - - i = 1 - setreg -.rep \REP # do REP rounds - vmovdqa 16*i(arg1), \T_key - vaesenc \T_key, \XMM1, \XMM1 - vaesenc \T_key, \XMM2, \XMM2 - vaesenc \T_key, \XMM3, \XMM3 - vaesenc \T_key, \XMM4, \XMM4 - vaesenc \T_key, \XMM5, 
\XMM5 - vaesenc \T_key, \XMM6, \XMM6 - vaesenc \T_key, \XMM7, \XMM7 - vaesenc \T_key, \XMM8, \XMM8 - i = (i+1) - setreg -.endr - - - vmovdqa 16*i(arg1), \T_key - vaesenclast \T_key, \XMM1, \XMM1 - vaesenclast \T_key, \XMM2, \XMM2 - vaesenclast \T_key, \XMM3, \XMM3 - vaesenclast \T_key, \XMM4, \XMM4 - vaesenclast \T_key, \XMM5, \XMM5 - vaesenclast \T_key, \XMM6, \XMM6 - vaesenclast \T_key, \XMM7, \XMM7 - vaesenclast \T_key, \XMM8, \XMM8 - - vmovdqu (arg4, %r11), \T1 - vpxor \T1, \XMM1, \XMM1 - vmovdqu \XMM1, (arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM1 - .endif - - vmovdqu 16*1(arg4, %r11), \T1 - vpxor \T1, \XMM2, \XMM2 - vmovdqu \XMM2, 16*1(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM2 - .endif - - vmovdqu 16*2(arg4, %r11), \T1 - vpxor \T1, \XMM3, \XMM3 - vmovdqu \XMM3, 16*2(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM3 - .endif - - vmovdqu 16*3(arg4, %r11), \T1 - vpxor \T1, \XMM4, \XMM4 - vmovdqu \XMM4, 16*3(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM4 - .endif - - vmovdqu 16*4(arg4, %r11), \T1 - vpxor \T1, \XMM5, \XMM5 - vmovdqu \XMM5, 16*4(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM5 - .endif - - vmovdqu 16*5(arg4, %r11), \T1 - vpxor \T1, \XMM6, \XMM6 - vmovdqu \XMM6, 16*5(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM6 - .endif - - vmovdqu 16*6(arg4, %r11), \T1 - vpxor \T1, \XMM7, \XMM7 - vmovdqu \XMM7, 16*6(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM7 - .endif - - vmovdqu 16*7(arg4, %r11), \T1 - vpxor \T1, \XMM8, \XMM8 - vmovdqu \XMM8, 16*7(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM8 - .endif - - add $128, %r11 - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with - # the corresponding ciphertext - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - -############################################################################### - -.L_initial_blocks_done\@: - - -.endm - - - -# encrypt 8 blocks at a time -# ghash the 8 previously encrypted ciphertext blocks -# arg1, arg2, arg3, arg4 are used as pointers only, not modified -# r11 is the data offset value -.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC - - vmovdqa \XMM1, \T2 - vmovdqa \XMM2, TMP2(%rsp) - vmovdqa \XMM3, TMP3(%rsp) - vmovdqa \XMM4, TMP4(%rsp) - vmovdqa \XMM5, TMP5(%rsp) - vmovdqa \XMM6, TMP6(%rsp) - vmovdqa \XMM7, TMP7(%rsp) - vmovdqa \XMM8, TMP8(%rsp) - -.if \loop_idx == in_order - vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONE(%rip), \XMM1, \XMM2 - vpaddd ONE(%rip), \XMM2, \XMM3 - vpaddd ONE(%rip), \XMM3, \XMM4 - vpaddd ONE(%rip), \XMM4, \XMM5 - vpaddd ONE(%rip), \XMM5, \XMM6 - vpaddd ONE(%rip), \XMM6, \XMM7 - vpaddd ONE(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb 
SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap -.else - vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONEf(%rip), \XMM1, \XMM2 - vpaddd ONEf(%rip), \XMM2, \XMM3 - vpaddd ONEf(%rip), \XMM3, \XMM4 - vpaddd ONEf(%rip), \XMM4, \XMM5 - vpaddd ONEf(%rip), \XMM5, \XMM6 - vpaddd ONEf(%rip), \XMM6, \XMM7 - vpaddd ONEf(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR -.endif - - - ####################################################################### - - vmovdqu (arg1), \T1 - vpxor \T1, \XMM1, \XMM1 - vpxor \T1, \XMM2, \XMM2 - vpxor \T1, \XMM3, \XMM3 - vpxor \T1, \XMM4, \XMM4 - vpxor \T1, \XMM5, \XMM5 - vpxor \T1, \XMM6, \XMM6 - vpxor \T1, \XMM7, \XMM7 - vpxor \T1, \XMM8, \XMM8 - - ####################################################################### - - - - - - vmovdqu 16*1(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqu 16*2(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - - ####################################################################### - - vmovdqu HashKey_8(arg2), \T5 - vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 - vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 - vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 - vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1 - vpxor \T5, \T6, \T6 - - vmovdqu 16*3(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP2(%rsp), \T1 - vmovdqu HashKey_7(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*4(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - ####################################################################### - - vmovdqa TMP3(%rsp), \T1 - vmovdqu HashKey_6(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*5(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP4(%rsp), \T1 - vmovdqu HashKey_5(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*6(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, 
\XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - - vmovdqa TMP5(%rsp), \T1 - vmovdqu HashKey_4(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*7(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP6(%rsp), \T1 - vmovdqu HashKey_3(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*8(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP7(%rsp), \T1 - vmovdqu HashKey_2(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - - ####################################################################### - - vmovdqu 16*9(arg1), \T5 - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 - - vmovdqa TMP8(%rsp), \T1 - vmovdqu HashKey(arg2), \T5 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T1 - - - vmovdqu 16*10(arg1), \T5 - - i = 11 - setreg -.rep (\REP-9) - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 - - vmovdqu 16*i(arg1), \T5 - i = i + 1 - setreg -.endr - - i = 0 - j = 1 - setreg -.rep 8 - vpxor 16*i(arg4, %r11), \T5, \T2 - .if \ENC_DEC == ENC - vaesenclast \T2, reg_j, reg_j - .else - vaesenclast \T2, reg_j, \T3 - vmovdqu 16*i(arg4, %r11), reg_j - vmovdqu \T3, 16*i(arg3, %r11) - .endif - i = (i+1) - j = (j+1) - setreg -.endr - ####################################################################### - - - vpslldq $8, \T6, \T3 # shift-L T3 2 DWs - vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs - vpxor \T3, \T7, \T7 - vpxor \T6, \T1, \T1 # accumulate the results in T1:T7 - - - - ####################################################################### - #first phase of the reduction - vmovdqa POLY2(%rip), \T3 - - vpclmulqdq $0x01, \T7, \T3, \T2 - vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs - - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - .if \ENC_DEC == ENC - vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer - 
vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer - .endif - - ####################################################################### - #second phase of the reduction - vpclmulqdq $0x00, \T7, \T3, \T2 - vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) - - vpclmulqdq $0x10, \T7, \T3, \T4 - vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) - - vpxor \T2, \T4, \T4 # second phase of the reduction complete - ####################################################################### - vpxor \T4, \T1, \T1 # the result is in T1 - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - - - vpxor \T1, \XMM1, \XMM1 - - - -.endm - - -# GHASH the last 4 ciphertext blocks. -.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 - - ## Karatsuba Method - - vmovdqu HashKey_8(arg2), \T5 - - vpshufd $0b01001110, \XMM1, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM1, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM1, \T6 - vpclmulqdq $0x00, \T5, \XMM1, \T7 - - vpclmulqdq $0x00, \T3, \T2, \XMM1 - - ###################### - - vmovdqu HashKey_7(arg2), \T5 - vpshufd $0b01001110, \XMM2, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM2, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM2, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM2, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_6(arg2), \T5 - vpshufd $0b01001110, \XMM3, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM3, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM3, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM3, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_5(arg2), \T5 - vpshufd $0b01001110, \XMM4, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM4, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM4, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM4, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_4(arg2), \T5 - vpshufd $0b01001110, \XMM5, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM5, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM5, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM5, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_3(arg2), \T5 - vpshufd $0b01001110, \XMM6, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM6, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM6, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM6, \T4 - vpxor \T4, 
\T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_2(arg2), \T5 - vpshufd $0b01001110, \XMM7, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM7, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM7, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM7, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey(arg2), \T5 - vpshufd $0b01001110, \XMM8, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM8, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM8, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM8, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - vpxor \T6, \XMM1, \XMM1 - vpxor \T7, \XMM1, \T2 - - - - - vpslldq $8, \T2, \T4 - vpsrldq $8, \T2, \T2 - - vpxor \T4, \T7, \T7 - vpxor \T2, \T6, \T6 # holds the result of the - # accumulated carry-less multiplications - - ####################################################################### - #first phase of the reduction - vmovdqa POLY2(%rip), \T3 - - vpclmulqdq $0x01, \T7, \T3, \T2 - vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs - - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - - - #second phase of the reduction - vpclmulqdq $0x00, \T7, \T3, \T2 - vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) - - vpclmulqdq $0x10, \T7, \T3, \T4 - vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) - - vpxor \T2, \T4, \T4 # second phase of the reduction complete - ####################################################################### - vpxor \T4, \T6, \T6 # the result is in T6 -.endm - - - -############################################################# -#void aesni_gcm_init_avx_gen4 -# (gcm_data *my_ctx_data, -# gcm_context_data *data, -# u8 *iv, /* Pre-counter block j0: 4 byte salt -# (from Security Association) concatenated with 8 byte -# Initialisation Vector (from IPSec ESP Payload) -# concatenated with 0x00000001. 16-byte aligned pointer. */ -# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ -# const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ -############################################################# -SYM_FUNC_START(aesni_gcm_init_avx_gen4) - FUNC_SAVE - INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_init_avx_gen4) - -############################################################################### -#void aesni_gcm_enc_avx_gen4( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ -# const u8 *in, /* Plaintext input */ -# u64 plaintext_len) /* Length of data in Bytes for encryption. 
*/ -############################################################################### -SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_enc_update4 - cmp $16, %eax - je key_128_enc_update4 - # must be 192 - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11 - FUNC_RESTORE - RET -key_128_enc_update4: - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9 - FUNC_RESTORE - RET -key_256_enc_update4: - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4) - -############################################################################### -#void aesni_gcm_dec_update_avx_gen4( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ -# const u8 *in, /* Ciphertext input */ -# u64 plaintext_len) /* Length of data in Bytes for encryption. */ -############################################################################### -SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_dec_update4 - cmp $16, %eax - je key_128_dec_update4 - # must be 192 - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11 - FUNC_RESTORE - RET -key_128_dec_update4: - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9 - FUNC_RESTORE - RET -key_256_dec_update4: - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4) - -############################################################################### -#void aesni_gcm_finalize_avx_gen4( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *auth_tag, /* Authenticated Tag output. */ -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. -# Valid values are 16 (most likely), 12 or 8. */ -############################################################################### -SYM_FUNC_START(aesni_gcm_finalize_avx_gen4) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_finalize4 - cmp $16, %eax - je key_128_finalize4 - # must be 192 - GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4 - FUNC_RESTORE - RET -key_128_finalize4: - GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4 - FUNC_RESTORE - RET -key_256_finalize4: - GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_finalize_avx_gen4) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index c1cc01583103..cd37de5ec404 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -46,41 +46,11 @@ #define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA) #define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA) -/* This data is stored at the end of the crypto_tfm struct. - * It's a type of per "session" data storage location. - * This needs to be 16 byte aligned. 
- */ -struct aesni_rfc4106_gcm_ctx { - u8 hash_subkey[16] AESNI_ALIGN_ATTR; - struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR; - u8 nonce[4]; -}; - -struct generic_gcmaes_ctx { - u8 hash_subkey[16] AESNI_ALIGN_ATTR; - struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR; -}; - struct aesni_xts_ctx { struct crypto_aes_ctx tweak_ctx AESNI_ALIGN_ATTR; struct crypto_aes_ctx crypt_ctx AESNI_ALIGN_ATTR; }; -#define GCM_BLOCK_LEN 16 - -struct gcm_context_data { - /* init, update and finalize context data */ - u8 aad_hash[GCM_BLOCK_LEN]; - u64 aad_length; - u64 in_length; - u8 partial_block_enc_key[GCM_BLOCK_LEN]; - u8 orig_IV[GCM_BLOCK_LEN]; - u8 current_counter[GCM_BLOCK_LEN]; - u64 partial_block_len; - u64 unused; - u8 hash_keys[GCM_BLOCK_LEN * 16]; -}; - static inline void *aes_align_addr(void *addr) { if (crypto_tfm_ctx_alignment() >= AESNI_ALIGN) @@ -105,9 +75,6 @@ asmlinkage void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); -#define AVX_GEN2_OPTSIZE 640 -#define AVX_GEN4_OPTSIZE 4096 - asmlinkage void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); @@ -120,23 +87,6 @@ asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc); -/* Scatter / Gather routines, with args similar to above */ -asmlinkage void aesni_gcm_init(void *ctx, - struct gcm_context_data *gdata, - u8 *iv, - u8 *hash_subkey, const u8 *aad, - unsigned long aad_len); -asmlinkage void aesni_gcm_enc_update(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len); -asmlinkage void aesni_gcm_dec_update(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, - unsigned long ciphertext_len); -asmlinkage void aesni_gcm_finalize(void *ctx, - struct gcm_context_data *gdata, - u8 *auth_tag, unsigned long auth_tag_len); - asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv, void *keys, u8 *out, unsigned int num_bytes); asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv, @@ -156,67 +106,6 @@ asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys, u8 *out, unsigned int num_bytes, unsigned int byte_ctr); - -/* - * asmlinkage void aesni_gcm_init_avx_gen2() - * gcm_data *my_ctx_data, context data - * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. - */ -asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data, - struct gcm_context_data *gdata, - u8 *iv, - u8 *hash_subkey, - const u8 *aad, - unsigned long aad_len); - -asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len); -asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, - unsigned long ciphertext_len); -asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx, - struct gcm_context_data *gdata, - u8 *auth_tag, unsigned long auth_tag_len); - -/* - * asmlinkage void aesni_gcm_init_avx_gen4() - * gcm_data *my_ctx_data, context data - * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. 
- */ -asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data, - struct gcm_context_data *gdata, - u8 *iv, - u8 *hash_subkey, - const u8 *aad, - unsigned long aad_len); - -asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len); -asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, - unsigned long ciphertext_len); -asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx, - struct gcm_context_data *gdata, - u8 *auth_tag, unsigned long auth_tag_len); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx2); - -static inline struct -aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) -{ - return aes_align_addr(crypto_aead_ctx(tfm)); -} - -static inline struct -generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm) -{ - return aes_align_addr(crypto_aead_ctx(tfm)); -} #endif static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) @@ -590,280 +479,6 @@ static int xctr_crypt(struct skcipher_request *req) } return err; } - -static int aes_gcm_derive_hash_subkey(const struct crypto_aes_ctx *aes_key, - u8 hash_subkey[AES_BLOCK_SIZE]) -{ - static const u8 zeroes[AES_BLOCK_SIZE]; - - aes_encrypt(aes_key, hash_subkey, zeroes); - return 0; -} - -static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, - unsigned int key_len) -{ - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead); - - if (key_len < 4) - return -EINVAL; - - /*Account for 4 byte nonce at the end.*/ - key_len -= 4; - - memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce)); - - return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?: - aes_gcm_derive_hash_subkey(&ctx->aes_key_expanded, - ctx->hash_subkey); -} - -/* This is the Integrity Check Value (aka the authentication tag) length and can - * be 8, 12 or 16 bytes long. */ -static int common_rfc4106_set_authsize(struct crypto_aead *aead, - unsigned int authsize) -{ - switch (authsize) { - case 8: - case 12: - case 16: - break; - default: - return -EINVAL; - } - - return 0; -} - -static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, - unsigned int authsize) -{ - switch (authsize) { - case 4: - case 8: - case 12: - case 13: - case 14: - case 15: - case 16: - break; - default: - return -EINVAL; - } - - return 0; -} - -static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, - unsigned int assoclen, u8 *hash_subkey, - u8 *iv, void *aes_ctx, u8 *auth_tag, - unsigned long auth_tag_len) -{ - u8 databuf[sizeof(struct gcm_context_data) + (AESNI_ALIGN - 8)] __aligned(8); - struct gcm_context_data *data = PTR_ALIGN((void *)databuf, AESNI_ALIGN); - unsigned long left = req->cryptlen; - struct scatter_walk assoc_sg_walk; - struct skcipher_walk walk; - bool do_avx, do_avx2; - u8 *assocmem = NULL; - u8 *assoc; - int err; - - if (!enc) - left -= auth_tag_len; - - do_avx = (left >= AVX_GEN2_OPTSIZE); - do_avx2 = (left >= AVX_GEN4_OPTSIZE); - - /* Linearize assoc, if not already linear */ - if (req->src->length >= assoclen && req->src->length) { - scatterwalk_start(&assoc_sg_walk, req->src); - assoc = scatterwalk_map(&assoc_sg_walk); - } else { - gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? 
- GFP_KERNEL : GFP_ATOMIC; - - /* assoc can be any length, so must be on heap */ - assocmem = kmalloc(assoclen, flags); - if (unlikely(!assocmem)) - return -ENOMEM; - assoc = assocmem; - - scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0); - } - - kernel_fpu_begin(); - if (static_branch_likely(&gcm_use_avx2) && do_avx2) - aesni_gcm_init_avx_gen4(aes_ctx, data, iv, hash_subkey, assoc, - assoclen); - else if (static_branch_likely(&gcm_use_avx) && do_avx) - aesni_gcm_init_avx_gen2(aes_ctx, data, iv, hash_subkey, assoc, - assoclen); - else - aesni_gcm_init(aes_ctx, data, iv, hash_subkey, assoc, assoclen); - kernel_fpu_end(); - - if (!assocmem) - scatterwalk_unmap(assoc); - else - kfree(assocmem); - - err = enc ? skcipher_walk_aead_encrypt(&walk, req, false) - : skcipher_walk_aead_decrypt(&walk, req, false); - - while (walk.nbytes > 0) { - kernel_fpu_begin(); - if (static_branch_likely(&gcm_use_avx2) && do_avx2) { - if (enc) - aesni_gcm_enc_update_avx_gen4(aes_ctx, data, - walk.dst.virt.addr, - walk.src.virt.addr, - walk.nbytes); - else - aesni_gcm_dec_update_avx_gen4(aes_ctx, data, - walk.dst.virt.addr, - walk.src.virt.addr, - walk.nbytes); - } else if (static_branch_likely(&gcm_use_avx) && do_avx) { - if (enc) - aesni_gcm_enc_update_avx_gen2(aes_ctx, data, - walk.dst.virt.addr, - walk.src.virt.addr, - walk.nbytes); - else - aesni_gcm_dec_update_avx_gen2(aes_ctx, data, - walk.dst.virt.addr, - walk.src.virt.addr, - walk.nbytes); - } else if (enc) { - aesni_gcm_enc_update(aes_ctx, data, walk.dst.virt.addr, - walk.src.virt.addr, walk.nbytes); - } else { - aesni_gcm_dec_update(aes_ctx, data, walk.dst.virt.addr, - walk.src.virt.addr, walk.nbytes); - } - kernel_fpu_end(); - - err = skcipher_walk_done(&walk, 0); - } - - if (err) - return err; - - kernel_fpu_begin(); - if (static_branch_likely(&gcm_use_avx2) && do_avx2) - aesni_gcm_finalize_avx_gen4(aes_ctx, data, auth_tag, - auth_tag_len); - else if (static_branch_likely(&gcm_use_avx) && do_avx) - aesni_gcm_finalize_avx_gen2(aes_ctx, data, auth_tag, - auth_tag_len); - else - aesni_gcm_finalize(aes_ctx, data, auth_tag, auth_tag_len); - kernel_fpu_end(); - - return 0; -} - -static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, - u8 *hash_subkey, u8 *iv, void *aes_ctx) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 auth_tag[16]; - int err; - - err = gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, aes_ctx, - auth_tag, auth_tag_len); - if (err) - return err; - - scatterwalk_map_and_copy(auth_tag, req->dst, - req->assoclen + req->cryptlen, - auth_tag_len, 1); - return 0; -} - -static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, - u8 *hash_subkey, u8 *iv, void *aes_ctx) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 auth_tag_msg[16]; - u8 auth_tag[16]; - int err; - - err = gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, aes_ctx, - auth_tag, auth_tag_len); - if (err) - return err; - - /* Copy out original auth_tag */ - scatterwalk_map_and_copy(auth_tag_msg, req->src, - req->assoclen + req->cryptlen - auth_tag_len, - auth_tag_len, 0); - - /* Compare generated tag with passed in tag. 
*/ - if (crypto_memneq(auth_tag_msg, auth_tag, auth_tag_len)) { - memzero_explicit(auth_tag, sizeof(auth_tag)); - return -EBADMSG; - } - return 0; -} - -static int helper_rfc4106_encrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); - unsigned int i; - __be32 counter = cpu_to_be32(1); - - /* Assuming we are supporting rfc4106 64-bit extended */ - /* sequence numbers We need to have the AAD length equal */ - /* to 16 or 20 bytes */ - if (unlikely(req->assoclen != 16 && req->assoclen != 20)) - return -EINVAL; - - /* IV below built */ - for (i = 0; i < 4; i++) - *(iv+i) = ctx->nonce[i]; - for (i = 0; i < 8; i++) - *(iv+4+i) = req->iv[i]; - *((__be32 *)(iv+12)) = counter; - - return gcmaes_encrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, - aes_ctx); -} - -static int helper_rfc4106_decrypt(struct aead_request *req) -{ - __be32 counter = cpu_to_be32(1); - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); - unsigned int i; - - if (unlikely(req->assoclen != 16 && req->assoclen != 20)) - return -EINVAL; - - /* Assuming we are supporting rfc4106 64-bit extended */ - /* sequence numbers We need to have the AAD length */ - /* equal to 16 or 20 bytes */ - - /* IV below built */ - for (i = 0; i < 4; i++) - *(iv+i) = ctx->nonce[i]; - for (i = 0; i < 8; i++) - *(iv+4+i) = req->iv[i]; - *((__be32 *)(iv+12)) = counter; - - return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, - aes_ctx); -} #endif static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key, @@ -1216,6 +831,7 @@ DEFINE_XTS_ALG(aesni_avx, "xts-aes-aesni-avx", 500); DEFINE_XTS_ALG(vaes_avx2, "xts-aes-vaes-avx2", 600); DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700); DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800); +#endif /* The common part of the x86_64 AES-GCM key struct */ struct aes_gcm_key { @@ -1226,6 +842,40 @@ struct aes_gcm_key { u32 rfc4106_nonce; }; +/* Key struct used by the AES-NI implementations of AES-GCM */ +struct aes_gcm_key_aesni { + /* + * Common part of the key. The assembly code requires 16-byte alignment + * for the round keys; we get this by them being located at the start of + * the struct and the whole struct being 16-byte aligned. + */ + struct aes_gcm_key base; + + /* + * Powers of the hash key H^8 through H^1. These are 128-bit values. + * They all have an extra factor of x^-1 and are byte-reversed. 16-byte + * alignment is required by the assembly code. + */ + u64 h_powers[8][2] __aligned(16); + + /* + * h_powers_xored[i] contains the two 64-bit halves of h_powers[i] XOR'd + * together. It's used for Karatsuba multiplication. 16-byte alignment + * is required by the assembly code. + */ + u64 h_powers_xored[8] __aligned(16); + + /* + * H^1 times x^64 (and also the usual extra factor of x^-1). 16-byte + * alignment is required by the assembly code. 
+ */ + u64 h_times_x64[2] __aligned(16); +}; +#define AES_GCM_KEY_AESNI(key) \ + container_of((key), struct aes_gcm_key_aesni, base) +#define AES_GCM_KEY_AESNI_SIZE \ + (sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1))) + /* Key struct used by the VAES + AVX10 implementations of AES-GCM */ struct aes_gcm_key_avx10 { /* @@ -1261,14 +911,32 @@ struct aes_gcm_key_avx10 { */ #define FLAG_RFC4106 BIT(0) #define FLAG_ENC BIT(1) -#define FLAG_AVX10_512 BIT(2) +#define FLAG_AVX BIT(2) +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) +# define FLAG_AVX10_256 BIT(3) +# define FLAG_AVX10_512 BIT(4) +#else + /* + * This should cause all calls to the AVX10 assembly functions to be + * optimized out, avoiding the need to ifdef each call individually. + */ +# define FLAG_AVX10_256 0 +# define FLAG_AVX10_512 0 +#endif static inline struct aes_gcm_key * aes_gcm_key_get(struct crypto_aead *tfm, int flags) { - return PTR_ALIGN(crypto_aead_ctx(tfm), 64); + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) + return PTR_ALIGN(crypto_aead_ctx(tfm), 64); + else + return PTR_ALIGN(crypto_aead_ctx(tfm), 16); } +asmlinkage void +aes_gcm_precompute_aesni(struct aes_gcm_key_aesni *key); +asmlinkage void +aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key); asmlinkage void aes_gcm_precompute_vaes_avx10_256(struct aes_gcm_key_avx10 *key); asmlinkage void @@ -1283,13 +951,25 @@ static void aes_gcm_precompute(struct aes_gcm_key *key, int flags) * straightforward to provide a 512-bit one because of how the assembly * code is structured, and it works nicely because the total size of the * key powers is a multiple of 512 bits. So we take advantage of that. + * + * A similar situation applies to the AES-NI implementations. */ if (flags & FLAG_AVX10_512) aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key)); - else + else if (flags & FLAG_AVX10_256) aes_gcm_precompute_vaes_avx10_256(AES_GCM_KEY_AVX10(key)); + else if (flags & FLAG_AVX) + aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key)); + else + aes_gcm_precompute_aesni(AES_GCM_KEY_AESNI(key)); } +asmlinkage void +aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key, + u8 ghash_acc[16], const u8 *aad, int aadlen); +asmlinkage void +aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key, + u8 ghash_acc[16], const u8 *aad, int aadlen); asmlinkage void aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, u8 ghash_acc[16], const u8 *aad, int aadlen); @@ -1297,10 +977,25 @@ aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16], const u8 *aad, int aadlen, int flags) { - aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key), ghash_acc, - aad, aadlen); + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) + aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key), ghash_acc, + aad, aadlen); + else if (flags & FLAG_AVX) + aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key), ghash_acc, + aad, aadlen); + else + aes_gcm_aad_update_aesni(AES_GCM_KEY_AESNI(key), ghash_acc, + aad, aadlen); } +asmlinkage void +aes_gcm_enc_update_aesni(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_enc_update_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); asmlinkage void aes_gcm_enc_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, const u32 le_ctr[4], u8 ghash_acc[16], @@ 
-1311,6 +1006,14 @@ aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, const u8 *src, u8 *dst, int datalen); asmlinkage void +aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_dec_update_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void aes_gcm_dec_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, const u32 le_ctr[4], u8 ghash_acc[16], const u8 *src, u8 *dst, int datalen); @@ -1330,22 +1033,45 @@ aes_gcm_update(const struct aes_gcm_key *key, aes_gcm_enc_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), le_ctr, ghash_acc, src, dst, datalen); - else + else if (flags & FLAG_AVX10_256) aes_gcm_enc_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), le_ctr, ghash_acc, src, dst, datalen); + else if (flags & FLAG_AVX) + aes_gcm_enc_update_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + src, dst, datalen); + else + aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr, + ghash_acc, src, dst, datalen); } else { if (flags & FLAG_AVX10_512) aes_gcm_dec_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), le_ctr, ghash_acc, src, dst, datalen); - else + else if (flags & FLAG_AVX10_256) aes_gcm_dec_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), le_ctr, ghash_acc, src, dst, datalen); + else if (flags & FLAG_AVX) + aes_gcm_dec_update_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + src, dst, datalen); + else + aes_gcm_dec_update_aesni(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + src, dst, datalen); } } +asmlinkage void +aes_gcm_enc_final_aesni(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen); +asmlinkage void +aes_gcm_enc_final_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen); asmlinkage void aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, const u32 le_ctr[4], u8 ghash_acc[16], @@ -1357,11 +1083,30 @@ aes_gcm_enc_final(const struct aes_gcm_key *key, const u32 le_ctr[4], u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, int flags) { - aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key), - le_ctr, ghash_acc, - total_aadlen, total_datalen); + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) + aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen); + else if (flags & FLAG_AVX) + aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen); + else + aes_gcm_enc_final_aesni(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen); } +asmlinkage bool __must_check +aes_gcm_dec_final_aesni(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], const u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, + const u8 tag[16], int taglen); +asmlinkage bool __must_check +aes_gcm_dec_final_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], const u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, + const u8 tag[16], int taglen); asmlinkage bool __must_check aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, const u32 le_ctr[4], const u8 ghash_acc[16], @@ -1374,10 +1119,59 @@ aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4], u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, u8 tag[16], int taglen, int flags) { - return 
aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key), - le_ctr, ghash_acc, - total_aadlen, total_datalen, - tag, taglen); + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) + return aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen, + tag, taglen); + else if (flags & FLAG_AVX) + return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen, + tag, taglen); + else + return aes_gcm_dec_final_aesni(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen, + tag, taglen); +} + +/* + * This is the Integrity Check Value (aka the authentication tag) length and can + * be 8, 12 or 16 bytes long. + */ +static int common_rfc4106_set_authsize(struct crypto_aead *aead, + unsigned int authsize) +{ + switch (authsize) { + case 8: + case 12: + case 16: + break; + default: + return -EINVAL; + } + + return 0; +} + +static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, + unsigned int authsize) +{ + switch (authsize) { + case 4: + case 8: + case 12: + case 13: + case 14: + case 15: + case 16: + break; + default: + return -EINVAL; + } + + return 0; } /* @@ -1407,6 +1201,11 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, } /* The assembly code assumes the following offsets. */ + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_enc) != 0); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_length) != 480); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers) != 496); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers_xored) != 624); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_times_x64) != 688); BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_enc) != 0); BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_length) != 480); BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, h_powers) != 512); @@ -1424,7 +1223,9 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, static const u8 x_to_the_minus1[16] __aligned(__alignof__(be128)) = { [0] = 0xc2, [15] = 1 }; - struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key); + static const u8 x_to_the_63[16] __aligned(__alignof__(be128)) = { + [7] = 1, + }; be128 h1 = {}; be128 h; int i; @@ -1441,12 +1242,29 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, gf128mul_lle(&h, (const be128 *)x_to_the_minus1); /* Compute the needed key powers */ - for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { - k->h_powers[i][0] = be64_to_cpu(h.b); - k->h_powers[i][1] = be64_to_cpu(h.a); - gf128mul_lle(&h, &h1); + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) { + struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key); + + for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { + k->h_powers[i][0] = be64_to_cpu(h.b); + k->h_powers[i][1] = be64_to_cpu(h.a); + gf128mul_lle(&h, &h1); + } + memset(k->padding, 0, sizeof(k->padding)); + } else { + struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key); + + for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { + k->h_powers[i][0] = be64_to_cpu(h.b); + k->h_powers[i][1] = be64_to_cpu(h.a); + k->h_powers_xored[i] = k->h_powers[i][0] ^ + k->h_powers[i][1]; + gf128mul_lle(&h, &h1); + } + gf128mul_lle(&h1, (const be128 *)x_to_the_63); + k->h_times_x64[0] = be64_to_cpu(h1.b); + k->h_times_x64[1] = be64_to_cpu(h1.a); } - memset(k->padding, 0, sizeof(k->padding)); } return 0; } @@ -1630,7 +1448,7 @@ out: ctxsize, priority) \ \ static int gcm_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \ - 
unsigned int keylen) \ + unsigned int keylen) \ { \ return gcm_setkey(tfm, raw_key, keylen, (flags)); \ } \ @@ -1646,7 +1464,7 @@ static int gcm_decrypt_##suffix(struct aead_request *req) \ } \ \ static int rfc4106_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \ - unsigned int keylen) \ + unsigned int keylen) \ { \ return gcm_setkey(tfm, raw_key, keylen, (flags) | FLAG_RFC4106); \ } \ @@ -1699,8 +1517,19 @@ static struct aead_alg aes_gcm_algs_##suffix[] = { { \ \ static struct simd_aead_alg *aes_gcm_simdalgs_##suffix[2] \ +/* aes_gcm_algs_aesni */ +DEFINE_GCM_ALGS(aesni, /* no flags */ 0, + "generic-gcm-aesni", "rfc4106-gcm-aesni", + AES_GCM_KEY_AESNI_SIZE, 400); + +/* aes_gcm_algs_aesni_avx */ +DEFINE_GCM_ALGS(aesni_avx, FLAG_AVX, + "generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx", + AES_GCM_KEY_AESNI_SIZE, 500); + +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) /* aes_gcm_algs_vaes_avx10_256 */ -DEFINE_GCM_ALGS(vaes_avx10_256, 0, +DEFINE_GCM_ALGS(vaes_avx10_256, FLAG_AVX10_256, "generic-gcm-vaes-avx10_256", "rfc4106-gcm-vaes-avx10_256", AES_GCM_KEY_AVX10_SIZE, 700); @@ -1740,6 +1569,11 @@ static int __init register_avx_algs(void) &aes_xts_simdalg_aesni_avx); if (err) return err; + err = simd_register_aeads_compat(aes_gcm_algs_aesni_avx, + ARRAY_SIZE(aes_gcm_algs_aesni_avx), + aes_gcm_simdalgs_aesni_avx); + if (err) + return err; #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_VAES) || @@ -1795,6 +1629,10 @@ static void unregister_avx_algs(void) if (aes_xts_simdalg_aesni_avx) simd_unregister_skciphers(&aes_xts_alg_aesni_avx, 1, &aes_xts_simdalg_aesni_avx); + if (aes_gcm_simdalgs_aesni_avx[0]) + simd_unregister_aeads(aes_gcm_algs_aesni_avx, + ARRAY_SIZE(aes_gcm_algs_aesni_avx), + aes_gcm_simdalgs_aesni_avx); #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) if (aes_xts_simdalg_vaes_avx2) simd_unregister_skciphers(&aes_xts_alg_vaes_avx2, 1, @@ -1816,6 +1654,9 @@ static void unregister_avx_algs(void) #endif } #else /* CONFIG_X86_64 */ +static struct aead_alg aes_gcm_algs_aesni[0]; +static struct simd_aead_alg *aes_gcm_simdalgs_aesni[0]; + static int __init register_avx_algs(void) { return 0; @@ -1826,90 +1667,6 @@ static void unregister_avx_algs(void) } #endif /* !CONFIG_X86_64 */ -#ifdef CONFIG_X86_64 -static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key, - unsigned int key_len) -{ - struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead); - - return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?: - aes_gcm_derive_hash_subkey(&ctx->aes_key_expanded, - ctx->hash_subkey); -} - -static int generic_gcmaes_encrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); - __be32 counter = cpu_to_be32(1); - - memcpy(iv, req->iv, 12); - *((__be32 *)(iv+12)) = counter; - - return gcmaes_encrypt(req, req->assoclen, ctx->hash_subkey, iv, - aes_ctx); -} - -static int generic_gcmaes_decrypt(struct aead_request *req) -{ - __be32 counter = cpu_to_be32(1); - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); - - memcpy(iv, 
req->iv, 12); - *((__be32 *)(iv+12)) = counter; - - return gcmaes_decrypt(req, req->assoclen, ctx->hash_subkey, iv, - aes_ctx); -} - -static struct aead_alg aesni_aeads[] = { { - .setkey = common_rfc4106_set_key, - .setauthsize = common_rfc4106_set_authsize, - .encrypt = helper_rfc4106_encrypt, - .decrypt = helper_rfc4106_decrypt, - .ivsize = GCM_RFC4106_IV_SIZE, - .maxauthsize = 16, - .base = { - .cra_name = "__rfc4106(gcm(aes))", - .cra_driver_name = "__rfc4106-gcm-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx), - .cra_alignmask = 0, - .cra_module = THIS_MODULE, - }, -}, { - .setkey = generic_gcmaes_set_key, - .setauthsize = generic_gcmaes_set_authsize, - .encrypt = generic_gcmaes_encrypt, - .decrypt = generic_gcmaes_decrypt, - .ivsize = GCM_AES_IV_SIZE, - .maxauthsize = 16, - .base = { - .cra_name = "__gcm(aes)", - .cra_driver_name = "__generic-gcm-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct generic_gcmaes_ctx), - .cra_alignmask = 0, - .cra_module = THIS_MODULE, - }, -} }; -#else -static struct aead_alg aesni_aeads[0]; -#endif - -static struct simd_aead_alg *aesni_simd_aeads[ARRAY_SIZE(aesni_aeads)]; - static const struct x86_cpu_id aesni_cpu_id[] = { X86_MATCH_FEATURE(X86_FEATURE_AES, NULL), {} @@ -1923,17 +1680,6 @@ static int __init aesni_init(void) if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; #ifdef CONFIG_X86_64 - if (boot_cpu_has(X86_FEATURE_AVX2)) { - pr_info("AVX2 version of gcm_enc/dec engaged.\n"); - static_branch_enable(&gcm_use_avx); - static_branch_enable(&gcm_use_avx2); - } else - if (boot_cpu_has(X86_FEATURE_AVX)) { - pr_info("AVX version of gcm_enc/dec engaged.\n"); - static_branch_enable(&gcm_use_avx); - } else { - pr_info("SSE version of gcm_enc/dec engaged.\n"); - } if (boot_cpu_has(X86_FEATURE_AVX)) { /* optimize performance of ctr mode encryption transform */ static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm); @@ -1951,8 +1697,9 @@ static int __init aesni_init(void) if (err) goto unregister_cipher; - err = simd_register_aeads_compat(aesni_aeads, ARRAY_SIZE(aesni_aeads), - aesni_simd_aeads); + err = simd_register_aeads_compat(aes_gcm_algs_aesni, + ARRAY_SIZE(aes_gcm_algs_aesni), + aes_gcm_simdalgs_aesni); if (err) goto unregister_skciphers; @@ -1977,9 +1724,9 @@ unregister_avx: simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); unregister_aeads: #endif /* CONFIG_X86_64 */ - simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), - aesni_simd_aeads); - + simd_unregister_aeads(aes_gcm_algs_aesni, + ARRAY_SIZE(aes_gcm_algs_aesni), + aes_gcm_simdalgs_aesni); unregister_skciphers: simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); @@ -1990,8 +1737,9 @@ unregister_cipher: static void __exit aesni_exit(void) { - simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), - aesni_simd_aeads); + simd_unregister_aeads(aes_gcm_algs_aesni, + ARRAY_SIZE(aes_gcm_algs_aesni), + aes_gcm_simdalgs_aesni); simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); crypto_unregister_alg(&aesni_cipher_alg); -- cgit From 3aa461e37c0ee309b6dc6887bba5f7184f3a6740 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sun, 2 Jun 2024 15:33:39 -0700 Subject: crypto: atmel-sha204a - add missing MODULE_DESCRIPTION() macro make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in 
drivers/crypto/atmel-sha204a.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Signed-off-by: Herbert Xu --- drivers/crypto/atmel-sha204a.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/crypto/atmel-sha204a.c b/drivers/crypto/atmel-sha204a.c index 2034f6031518..a02d496f4c41 100644 --- a/drivers/crypto/atmel-sha204a.c +++ b/drivers/crypto/atmel-sha204a.c @@ -232,4 +232,5 @@ module_init(atmel_sha204a_init); module_exit(atmel_sha204a_exit); MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_DESCRIPTION("Microchip / Atmel SHA204A (I2C) driver"); MODULE_LICENSE("GPL v2"); -- cgit From f2cbb74633ab9b07ab2df7252d23971115a88be8 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sun, 2 Jun 2024 16:03:54 -0700 Subject: crypto: keembay - add missing MODULE_DESCRIPTION() macro make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/crypto/intel/keembay/keembay-ocs-hcu.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Signed-off-by: Jeff Johnson Signed-off-by: Herbert Xu --- drivers/crypto/intel/keembay/ocs-hcu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/crypto/intel/keembay/ocs-hcu.c b/drivers/crypto/intel/keembay/ocs-hcu.c index deb9bd460ee6..55a41e6ab103 100644 --- a/drivers/crypto/intel/keembay/ocs-hcu.c +++ b/drivers/crypto/intel/keembay/ocs-hcu.c @@ -837,4 +837,5 @@ complete: return IRQ_HANDLED; } +MODULE_DESCRIPTION("Intel Keem Bay OCS HCU Crypto Driver"); MODULE_LICENSE("GPL"); -- cgit From c8edb3ccfd3951a84a8bf6638b7befcfa775fd06 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sun, 2 Jun 2024 16:11:17 -0700 Subject: crypto: sa2ul - add missing MODULE_DESCRIPTION() macro make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/crypto/sa2ul.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Signed-off-by: Herbert Xu --- drivers/crypto/sa2ul.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/crypto/sa2ul.c b/drivers/crypto/sa2ul.c index 78a4930c6480..461eca40e878 100644 --- a/drivers/crypto/sa2ul.c +++ b/drivers/crypto/sa2ul.c @@ -2496,4 +2496,5 @@ static struct platform_driver sa_ul_driver = { }, }; module_platform_driver(sa_ul_driver); +MODULE_DESCRIPTION("K3 SA2UL crypto accelerator driver"); MODULE_LICENSE("GPL v2"); -- cgit From ed6261d553f505e6b0ce8593bf3f3792928b0ab8 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sun, 2 Jun 2024 16:17:08 -0700 Subject: crypto: xilinx - add missing MODULE_DESCRIPTION() macro make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/crypto/xilinx/zynqmp-aes-gcm.o Add the missing invocation of the MODULE_DESCRIPTION() macro. 
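(Editorial aside, not part of any patch in this series: for readers who have not used these tags, here is a minimal module skeleton showing where MODULE_DESCRIPTION() sits next to the other MODULE_*() macros; the string it carries is what modinfo reports and what the W=1 modpost check looks for. The module name and strings below are invented for illustration — only the headers and macros are real kernel APIs.)

#include <linux/init.h>
#include <linux/module.h>

static int __init demo_init(void)
{
	pr_info("demo: loaded\n");
	return 0;
}

static void __exit demo_exit(void)
{
	pr_info("demo: unloaded\n");
}

module_init(demo_init);
module_exit(demo_exit);

MODULE_AUTHOR("Example Author");
/* This one-liner is what silences the W=1 modpost warning: */
MODULE_DESCRIPTION("Example description shown by modinfo");
MODULE_LICENSE("GPL");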
Signed-off-by: Jeff Johnson Reviewed-by: Michal Simek Signed-off-by: Herbert Xu --- drivers/crypto/xilinx/zynqmp-aes-gcm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/crypto/xilinx/zynqmp-aes-gcm.c b/drivers/crypto/xilinx/zynqmp-aes-gcm.c index e61405718840..7f0ec6887a39 100644 --- a/drivers/crypto/xilinx/zynqmp-aes-gcm.c +++ b/drivers/crypto/xilinx/zynqmp-aes-gcm.c @@ -446,4 +446,5 @@ static struct platform_driver zynqmp_aes_driver = { }; module_platform_driver(zynqmp_aes_driver); +MODULE_DESCRIPTION("Xilinx ZynqMP AES Driver"); MODULE_LICENSE("GPL"); -- cgit From 6d4e1993a30539f556da2ebd36f1936c583eb812 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Mon, 3 Jun 2024 07:55:39 -0700 Subject: hwrng: omap - add missing MODULE_DESCRIPTION() macro make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/char/hw_random/omap-rng.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/char/hw_random/omap3-rom-rng.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Signed-off-by: Herbert Xu --- drivers/char/hw_random/omap-rng.c | 1 + drivers/char/hw_random/omap3-rom-rng.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/char/hw_random/omap-rng.c b/drivers/char/hw_random/omap-rng.c index d4c02e900466..4914a8720e58 100644 --- a/drivers/char/hw_random/omap-rng.c +++ b/drivers/char/hw_random/omap-rng.c @@ -564,4 +564,5 @@ static struct platform_driver omap_rng_driver = { module_platform_driver(omap_rng_driver); MODULE_ALIAS("platform:omap_rng"); MODULE_AUTHOR("Deepak Saxena (and others)"); +MODULE_DESCRIPTION("RNG driver for TI OMAP CPU family"); MODULE_LICENSE("GPL"); diff --git a/drivers/char/hw_random/omap3-rom-rng.c b/drivers/char/hw_random/omap3-rom-rng.c index 18dc46b1b58e..8064c792caf0 100644 --- a/drivers/char/hw_random/omap3-rom-rng.c +++ b/drivers/char/hw_random/omap3-rom-rng.c @@ -178,4 +178,5 @@ module_platform_driver(omap3_rom_rng_driver); MODULE_ALIAS("platform:omap3-rom-rng"); MODULE_AUTHOR("Juha Yrjola"); MODULE_AUTHOR("Pali Rohár "); +MODULE_DESCRIPTION("RNG driver for TI OMAP3 CPU family"); MODULE_LICENSE("GPL"); -- cgit From 468e3295774d0edce15f4ae475913b5076dd4f40 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 4 Jun 2024 12:47:39 -0500 Subject: crypto: ccp - Fix null pointer dereference in __sev_snp_shutdown_locked Fix a null pointer dereference induced by DEBUG_TEST_DRIVER_REMOVE. Return from __sev_snp_shutdown_locked() if the psp_device or the sev_device structs are not initialized. 
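(Editorial aside: stripped of the surrounding diff context, the guard being added has the shape sketched below. The struct and function names are taken from the patch itself; the body is a simplified sketch, and the exact hunk appears in the diff further down.)

static int __sev_snp_shutdown_locked(int *error, bool panic)
{
	struct psp_device *psp = psp_master;
	struct sev_device *sev;

	/* Probe may never have populated these (e.g. with DEBUG_TEST_DRIVER_REMOVE). */
	if (!psp || !psp->sev_data)
		return 0;

	sev = psp->sev_data;

	if (!sev->snp_initialized)
		return 0;

	/* ... issue SNP_SHUTDOWN_EX as before ... */
	return 0;
}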
Without the fix, the driver will produce the following splat: ccp 0000:55:00.5: enabling device (0000 -> 0002) ccp 0000:55:00.5: sev enabled ccp 0000:55:00.5: psp enabled BUG: kernel NULL pointer dereference, address: 00000000000000f0 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC NOPTI CPU: 262 PID: 1 Comm: swapper/0 Not tainted 6.9.0-rc1+ #29 RIP: 0010:__sev_snp_shutdown_locked+0x2e/0x150 Code: 00 55 48 89 e5 41 57 41 56 41 54 53 48 83 ec 10 41 89 f7 49 89 fe 65 48 8b 04 25 28 00 00 00 48 89 45 d8 48 8b 05 6a 5a 7f 06 <4c> 8b a0 f0 00 00 00 41 0f b6 9c 24 a2 00 00 00 48 83 fb 02 0f 83 RSP: 0018:ffffb2ea4014b7b8 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff9e4acd2e0a28 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffb2ea4014b808 RBP: ffffb2ea4014b7e8 R08: 0000000000000106 R09: 000000000003d9c0 R10: 0000000000000001 R11: ffffffffa39ff070 R12: ffff9e49d40590c8 R13: 0000000000000000 R14: ffffb2ea4014b808 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff9e58b1e00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000f0 CR3: 0000000418a3e001 CR4: 0000000000770ef0 PKRU: 55555554 Call Trace: ? __die_body+0x6f/0xb0 ? __die+0xcc/0xf0 ? page_fault_oops+0x330/0x3a0 ? save_trace+0x2a5/0x360 ? do_user_addr_fault+0x583/0x630 ? exc_page_fault+0x81/0x120 ? asm_exc_page_fault+0x2b/0x30 ? __sev_snp_shutdown_locked+0x2e/0x150 __sev_firmware_shutdown+0x349/0x5b0 ? pm_runtime_barrier+0x66/0xe0 sev_dev_destroy+0x34/0xb0 psp_dev_destroy+0x27/0x60 sp_destroy+0x39/0x90 sp_pci_remove+0x22/0x60 pci_device_remove+0x4e/0x110 really_probe+0x271/0x4e0 __driver_probe_device+0x8f/0x160 driver_probe_device+0x24/0x120 __driver_attach+0xc7/0x280 ? driver_attach+0x30/0x30 bus_for_each_dev+0x10d/0x130 driver_attach+0x22/0x30 bus_add_driver+0x171/0x2b0 ? unaccepted_memory_init_kdump+0x20/0x20 driver_register+0x67/0x100 __pci_register_driver+0x83/0x90 sp_pci_init+0x22/0x30 sp_mod_init+0x13/0x30 do_one_initcall+0xb8/0x290 ? sched_clock_noinstr+0xd/0x10 ? local_clock_noinstr+0x3e/0x100 ? stack_depot_save_flags+0x21e/0x6a0 ? local_clock+0x1c/0x60 ? stack_depot_save_flags+0x21e/0x6a0 ? sched_clock_noinstr+0xd/0x10 ? local_clock_noinstr+0x3e/0x100 ? __lock_acquire+0xd90/0xe30 ? sched_clock_noinstr+0xd/0x10 ? local_clock_noinstr+0x3e/0x100 ? __create_object+0x66/0x100 ? local_clock+0x1c/0x60 ? __create_object+0x66/0x100 ? parameq+0x1b/0x90 ? parse_one+0x6d/0x1d0 ? parse_args+0xd7/0x1f0 ? do_initcall_level+0x180/0x180 do_initcall_level+0xb0/0x180 do_initcalls+0x60/0xa0 ? kernel_init+0x1f/0x1d0 do_basic_setup+0x41/0x50 kernel_init_freeable+0x1ac/0x230 ? rest_init+0x1f0/0x1f0 kernel_init+0x1f/0x1d0 ? rest_init+0x1f0/0x1f0 ret_from_fork+0x3d/0x50 ? 
rest_init+0x1f0/0x1f0 ret_from_fork_asm+0x11/0x20 Modules linked in: CR2: 00000000000000f0 ---[ end trace 0000000000000000 ]--- RIP: 0010:__sev_snp_shutdown_locked+0x2e/0x150 Code: 00 55 48 89 e5 41 57 41 56 41 54 53 48 83 ec 10 41 89 f7 49 89 fe 65 48 8b 04 25 28 00 00 00 48 89 45 d8 48 8b 05 6a 5a 7f 06 <4c> 8b a0 f0 00 00 00 41 0f b6 9c 24 a2 00 00 00 48 83 fb 02 0f 83 RSP: 0018:ffffb2ea4014b7b8 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff9e4acd2e0a28 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffb2ea4014b808 RBP: ffffb2ea4014b7e8 R08: 0000000000000106 R09: 000000000003d9c0 R10: 0000000000000001 R11: ffffffffa39ff070 R12: ffff9e49d40590c8 R13: 0000000000000000 R14: ffffb2ea4014b808 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff9e58b1e00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000f0 CR3: 0000000418a3e001 CR4: 0000000000770ef0 PKRU: 55555554 Kernel panic - not syncing: Fatal exception Kernel Offset: 0x1fc00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) Fixes: 1ca5614b84ee ("crypto: ccp: Add support to initialize the AMD-SP for SEV-SNP") Cc: stable@vger.kernel.org Signed-off-by: Kim Phillips Reviewed-by: Liam Merwick Reviewed-by: Mario Limonciello Reviewed-by: John Allen Reviewed-by: Tom Lendacky Signed-off-by: Herbert Xu --- drivers/crypto/ccp/sev-dev.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 2102377f727b..1912bee22dd4 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -1642,10 +1642,16 @@ fw_err: static int __sev_snp_shutdown_locked(int *error, bool panic) { - struct sev_device *sev = psp_master->sev_data; + struct psp_device *psp = psp_master; + struct sev_device *sev; struct sev_data_snp_shutdown_ex data; int ret; + if (!psp || !psp->sev_data) + return 0; + + sev = psp->sev_data; + if (!sev->snp_initialized) return 0; -- cgit From 293695f17ee4a8585f1d5a4e90a2c2c92995225a Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Wed, 5 Jun 2024 11:36:33 +0200 Subject: dt-bindings: rng: meson: add optional power-domains On newer SoCs, the random number generator can require a power-domain to operate, add it as optional. Signed-off-by: Neil Armstrong Acked-by: Rob Herring (Arm) Signed-off-by: Herbert Xu --- Documentation/devicetree/bindings/rng/amlogic,meson-rng.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/rng/amlogic,meson-rng.yaml b/Documentation/devicetree/bindings/rng/amlogic,meson-rng.yaml index afa52af442a7..f03b87e1b01c 100644 --- a/Documentation/devicetree/bindings/rng/amlogic,meson-rng.yaml +++ b/Documentation/devicetree/bindings/rng/amlogic,meson-rng.yaml @@ -26,6 +26,9 @@ properties: items: - const: core + power-domains: + maxItems: 1 + required: - compatible - reg -- cgit From 4604b3888f614a353c7afa17bdc8e7393bb1f9a1 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 5 Jun 2024 21:51:33 +0300 Subject: hwrng: core - Remove list.h from the hw_random.h The 'struct list' type is defined in types.h, no need to include list.h for that. 
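(Editorial aside, not part of the patch: dropping <linux/list.h> is safe here because <linux/types.h> already provides the definition of struct list_head, while <linux/list.h> is only needed by code that calls the list helpers. A made-up header/user pair illustrating that split — every name below is hypothetical:)

/* my_header.h — embedding a list node only needs the type definition. */
#include <linux/types.h>

struct my_dev {
	struct list_head node;	/* struct list_head is defined in <linux/types.h> */
	const char *name;
};

/* my_user.c — walking or modifying the list is what needs <linux/list.h>. */
#include <linux/list.h>

static LIST_HEAD(my_dev_list);

static void my_dev_register(struct my_dev *d)
{
	list_add_tail(&d->node, &my_dev_list);
}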
Signed-off-by: Andy Shevchenko Signed-off-by: Herbert Xu --- include/linux/hw_random.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index 136e9842120e..b424555753b1 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -13,9 +13,8 @@ #define LINUX_HWRANDOM_H_ #include -#include -#include #include +#include /** * struct hwrng - Hardware Random Number Generator driver -- cgit From 0eb3bed57a06a20c612012127f4405d48fa4a50f Mon Sep 17 00:00:00 2001 From: Stefan Berger Date: Fri, 7 Jun 2024 18:34:17 -0400 Subject: crypto: ecc - Add comment to ecc_digits_from_bytes about input byte array Add comment to ecc_digits_from_bytes kdoc that the first byte is expected to hold the most significant bits of the large integer that is converted into an array of digits. Signed-off-by: Stefan Berger Signed-off-by: Herbert Xu --- include/crypto/internal/ecc.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/crypto/internal/ecc.h b/include/crypto/internal/ecc.h index f7e75e1e71f3..0717a53ae732 100644 --- a/include/crypto/internal/ecc.h +++ b/include/crypto/internal/ecc.h @@ -63,6 +63,9 @@ static inline void ecc_swap_digits(const void *in, u64 *out, unsigned int ndigit * @nbytes Size of input byte array * @out Output digits array * @ndigits: Number of digits to create from byte array + * + * The first byte in the input byte array is expected to hold the most + * significant bits of the large integer. */ void ecc_digits_from_bytes(const u8 *in, unsigned int nbytes, u64 *out, unsigned int ndigits); -- cgit From 1dcf865d3bf5bff45e93cb2410911b3428dacb78 Mon Sep 17 00:00:00 2001 From: Stefan Berger Date: Thu, 13 Jun 2024 17:38:20 -0400 Subject: crypto: ecc - Fix off-by-one missing to clear most significant digit Fix an off-by-one error where the most significant digit was not initialized leading to signature verification failures by the testmgr. Example: If a curve requires ndigits (=9) and diff (=2) indicates that 2 digits need to be set to zero then start with digit 'ndigits - diff' (=7) and clear 'diff' digits starting from there, so 7 and 8. Reported-by: Venkat Rao Bagalkote Closes: https://lore.kernel.org/linux-crypto/619bc2de-b18a-4939-a652-9ca886bf6349@linux.ibm.com/T/#m045d8812409ce233c17fcdb8b88b6629c671f9f4 Fixes: 2fd2a82ccbfc ("crypto: ecdsa - Use ecc_digits_from_bytes to create hash digits array") Signed-off-by: Stefan Berger Tested-by: Venkat Rao Bagalkote Signed-off-by: Herbert Xu --- crypto/ecc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/ecc.c b/crypto/ecc.c index af698f8852fb..420decdad7d9 100644 --- a/crypto/ecc.c +++ b/crypto/ecc.c @@ -78,7 +78,7 @@ void ecc_digits_from_bytes(const u8 *in, unsigned int nbytes, /* diff > 0: not enough input bytes: set most significant digits to 0 */ if (diff > 0) { ndigits -= diff; - memset(&out[ndigits - 1], 0, diff * sizeof(u64)); + memset(&out[ndigits], 0, diff * sizeof(u64)); } if (o) { -- cgit From a654b354b8cb3143b4e363ddc28704991cd762da Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 10 Jun 2024 10:18:51 +0200 Subject: crypto: qat - make adf_ctl_class constant Now that the driver core allows for struct class to be in read-only memory, we should make all 'class' structures declared at build time placing them into read-only memory, instead of having to be dynamically allocated at runtime. Cc: Giovanni Cabiddu Cc: Herbert Xu Cc: "David S. 
Miller" Cc: Adam Guerin Cc: Benjamin Tissoires Cc: Tom Zanussi Cc: Shashank Gupta Cc: qat-linux@intel.com Cc: linux-crypto@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Reviewed-by: Giovanni Cabiddu Signed-off-by: Herbert Xu --- drivers/crypto/intel/qat/qat_common/adf_ctl_drv.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/crypto/intel/qat/qat_common/adf_ctl_drv.c b/drivers/crypto/intel/qat/qat_common/adf_ctl_drv.c index 29c4422f243c..26a1662fafbb 100644 --- a/drivers/crypto/intel/qat/qat_common/adf_ctl_drv.c +++ b/drivers/crypto/intel/qat/qat_common/adf_ctl_drv.c @@ -31,19 +31,22 @@ static const struct file_operations adf_ctl_ops = { .compat_ioctl = compat_ptr_ioctl, }; +static const struct class adf_ctl_class = { + .name = DEVICE_NAME, +}; + struct adf_ctl_drv_info { unsigned int major; struct cdev drv_cdev; - struct class *drv_class; }; static struct adf_ctl_drv_info adf_ctl_drv; static void adf_chr_drv_destroy(void) { - device_destroy(adf_ctl_drv.drv_class, MKDEV(adf_ctl_drv.major, 0)); + device_destroy(&adf_ctl_class, MKDEV(adf_ctl_drv.major, 0)); cdev_del(&adf_ctl_drv.drv_cdev); - class_destroy(adf_ctl_drv.drv_class); + class_unregister(&adf_ctl_class); unregister_chrdev_region(MKDEV(adf_ctl_drv.major, 0), 1); } @@ -51,17 +54,17 @@ static int adf_chr_drv_create(void) { dev_t dev_id; struct device *drv_device; + int ret; if (alloc_chrdev_region(&dev_id, 0, 1, DEVICE_NAME)) { pr_err("QAT: unable to allocate chrdev region\n"); return -EFAULT; } - adf_ctl_drv.drv_class = class_create(DEVICE_NAME); - if (IS_ERR(adf_ctl_drv.drv_class)) { - pr_err("QAT: class_create failed for adf_ctl\n"); + ret = class_register(&adf_ctl_class); + if (ret) goto err_chrdev_unreg; - } + adf_ctl_drv.major = MAJOR(dev_id); cdev_init(&adf_ctl_drv.drv_cdev, &adf_ctl_ops); if (cdev_add(&adf_ctl_drv.drv_cdev, dev_id, 1)) { @@ -69,7 +72,7 @@ static int adf_chr_drv_create(void) goto err_class_destr; } - drv_device = device_create(adf_ctl_drv.drv_class, NULL, + drv_device = device_create(&adf_ctl_class, NULL, MKDEV(adf_ctl_drv.major, 0), NULL, DEVICE_NAME); if (IS_ERR(drv_device)) { @@ -80,7 +83,7 @@ static int adf_chr_drv_create(void) err_cdev_del: cdev_del(&adf_ctl_drv.drv_cdev); err_class_destr: - class_destroy(adf_ctl_drv.drv_class); + class_unregister(&adf_ctl_class); err_chrdev_unreg: unregister_chrdev_region(dev_id, 1); return -EFAULT; -- cgit From b568826eff5d4ef859d2405b127bbee282b9dfe2 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Wed, 12 Jun 2024 13:11:57 -0700 Subject: crypto: arm64 - add missing MODULE_DESCRIPTION() macros With ARCH=arm64, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in arch/arm64/crypto/crct10dif-ce.o WARNING: modpost: missing MODULE_DESCRIPTION() in arch/arm64/crypto/poly1305-neon.o WARNING: modpost: missing MODULE_DESCRIPTION() in arch/arm64/crypto/aes-neon-bs.o Add the missing invocations of the MODULE_DESCRIPTION() macro. 
Signed-off-by: Jeff Johnson Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-neonbs-glue.c | 1 + arch/arm64/crypto/crct10dif-ce-glue.c | 1 + arch/arm64/crypto/poly1305-glue.c | 1 + 3 files changed, 3 insertions(+) diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index 467ac2f768ac..46425e7b9755 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -16,6 +16,7 @@ #include MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_DESCRIPTION("Bit sliced AES using NEON instructions"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("ecb(aes)"); diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c index 59016518f44d..606d25c559ed 100644 --- a/arch/arm64/crypto/crct10dif-ce-glue.c +++ b/arch/arm64/crypto/crct10dif-ce-glue.c @@ -138,6 +138,7 @@ module_cpu_feature_match(ASIMD, crc_t10dif_mod_init); module_exit(crc_t10dif_mod_exit); MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_DESCRIPTION("CRC-T10DIF using arm64 NEON and Crypto Extensions"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("crct10dif"); MODULE_ALIAS_CRYPTO("crct10dif-arm64-ce"); diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c index 1fae18ba11ed..9c4bfd62e789 100644 --- a/arch/arm64/crypto/poly1305-glue.c +++ b/arch/arm64/crypto/poly1305-glue.c @@ -226,6 +226,7 @@ static void __exit neon_poly1305_mod_exit(void) module_init(neon_poly1305_mod_init); module_exit(neon_poly1305_mod_exit); +MODULE_DESCRIPTION("Poly1305 transform using NEON instructions"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("poly1305"); MODULE_ALIAS_CRYPTO("poly1305-neon"); -- cgit From 691eaf1d66d806fcc2e22068775cf325a5b8c076 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 13 Jun 2024 13:47:49 -0700 Subject: hwrng: drivers - add missing Arm & Cavium MODULE_DESCRIPTION() macros With ARCH=arm64, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/char/hw_random/cavium-rng.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/char/hw_random/cavium-rng-vf.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/char/hw_random/arm_smccc_trng.o Add the missing invocations of the MODULE_DESCRIPTION() macro. 
Signed-off-by: Jeff Johnson Signed-off-by: Herbert Xu --- drivers/char/hw_random/arm_smccc_trng.c | 1 + drivers/char/hw_random/cavium-rng-vf.c | 1 + drivers/char/hw_random/cavium-rng.c | 1 + 3 files changed, 3 insertions(+) diff --git a/drivers/char/hw_random/arm_smccc_trng.c b/drivers/char/hw_random/arm_smccc_trng.c index 7e954341b09f..dcb8e7f37f25 100644 --- a/drivers/char/hw_random/arm_smccc_trng.c +++ b/drivers/char/hw_random/arm_smccc_trng.c @@ -118,4 +118,5 @@ module_platform_driver(smccc_trng_driver); MODULE_ALIAS("platform:smccc_trng"); MODULE_AUTHOR("Andre Przywara"); +MODULE_DESCRIPTION("Arm SMCCC TRNG firmware interface support"); MODULE_LICENSE("GPL"); diff --git a/drivers/char/hw_random/cavium-rng-vf.c b/drivers/char/hw_random/cavium-rng-vf.c index c99c54cd99c6..c1b8918b2292 100644 --- a/drivers/char/hw_random/cavium-rng-vf.c +++ b/drivers/char/hw_random/cavium-rng-vf.c @@ -266,4 +266,5 @@ static struct pci_driver cavium_rng_vf_driver = { module_pci_driver(cavium_rng_vf_driver); MODULE_AUTHOR("Omer Khaliq "); +MODULE_DESCRIPTION("Cavium ThunderX Random Number Generator VF support"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/char/hw_random/cavium-rng.c b/drivers/char/hw_random/cavium-rng.c index b96579222408..d9d7b6038c06 100644 --- a/drivers/char/hw_random/cavium-rng.c +++ b/drivers/char/hw_random/cavium-rng.c @@ -88,4 +88,5 @@ static struct pci_driver cavium_rng_pf_driver = { module_pci_driver(cavium_rng_pf_driver); MODULE_AUTHOR("Omer Khaliq "); +MODULE_DESCRIPTION("Cavium ThunderX Random Number Generator support"); MODULE_LICENSE("GPL v2"); -- cgit From 5ca95a907939f22467767829007b2707f50bba48 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 13 Jun 2024 17:57:05 -0700 Subject: crypto: arm/poly1305 - add missing MODULE_DESCRIPTION() macro With ARCH=arm, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in arch/arm/crypto/poly1305-arm.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Signed-off-by: Herbert Xu --- arch/arm/crypto/poly1305-glue.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/crypto/poly1305-glue.c b/arch/arm/crypto/poly1305-glue.c index c31bd8f7c092..8482e302c45a 100644 --- a/arch/arm/crypto/poly1305-glue.c +++ b/arch/arm/crypto/poly1305-glue.c @@ -267,6 +267,7 @@ static void __exit arm_poly1305_mod_exit(void) module_init(arm_poly1305_mod_init); module_exit(arm_poly1305_mod_exit); +MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("poly1305"); MODULE_ALIAS_CRYPTO("poly1305-arm"); -- cgit From f0da7a231c7d0bafb6bb56da597cb2bfaf2c7028 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 14 Jun 2024 11:09:11 +0800 Subject: crypto: lib/mpi - Use swap() in mpi_ec_mul_point() Use existing swap() function rather than duplicating its implementation. ./lib/crypto/mpi/ec.c:1291:20-21: WARNING opportunity for swap(). ./lib/crypto/mpi/ec.c:1292:20-21: WARNING opportunity for swap(). 
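(Editorial aside: a tiny user-space stand-in for the kernel's swap() helper — not kernel code — showing the pattern the patch applies to the q1/prd and q2/sum point pointers in the diff below. The macro here merely imitates the kernel one.)

#include <stdio.h>

#define swap(a, b) \
	do { __typeof__(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

int main(void)
{
	int x = 1, y = 2;
	int *q1 = &x, *prd = &y;

	/* Open-coded form the patch removes:
	 *	int *t = q1; q1 = prd; prd = t;
	 */
	swap(q1, prd);

	printf("*q1=%d *prd=%d\n", *q1, *prd);	/* prints "*q1=2 *prd=1" */
	return 0;
}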
Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=9328 Signed-off-by: Jiapeng Chong Signed-off-by: Herbert Xu --- lib/crypto/mpi/ec.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/crypto/mpi/ec.c b/lib/crypto/mpi/ec.c index e16dca1e23d5..4781f00982ef 100644 --- a/lib/crypto/mpi/ec.c +++ b/lib/crypto/mpi/ec.c @@ -1285,14 +1285,12 @@ void mpi_ec_mul_point(MPI_POINT result, sum = &p2_; for (j = nbits-1; j >= 0; j--) { - MPI_POINT t; - sw = mpi_test_bit(scalar, j); point_swap_cond(q1, q2, sw, ctx); montgomery_ladder(prd, sum, q1, q2, point->x, ctx); point_swap_cond(prd, sum, sw, ctx); - t = q1; q1 = prd; prd = t; - t = q2; q2 = sum; sum = t; + swap(q1, prd); + swap(q2, sum); } mpi_clear(result->y); -- cgit From b44327ebc1c926e4b55a7721e3d91313fcf188b1 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 14 Jun 2024 11:09:12 +0800 Subject: crypto: lib/mpi - Use swap() in mpi_powm() Use existing swap() function rather than duplicating its implementation. ./lib/crypto/mpi/mpi-pow.c:211:11-12: WARNING opportunity for swap(). ./lib/crypto/mpi/mpi-pow.c:239:12-13: WARNING opportunity for swap(). Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=9327 Signed-off-by: Jiapeng Chong Signed-off-by: Herbert Xu --- lib/crypto/mpi/mpi-pow.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/lib/crypto/mpi/mpi-pow.c b/lib/crypto/mpi/mpi-pow.c index 2fd7a46d55ec..67fbd4c2503d 100644 --- a/lib/crypto/mpi/mpi-pow.c +++ b/lib/crypto/mpi/mpi-pow.c @@ -176,7 +176,6 @@ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod) for (;;) { while (c) { - mpi_ptr_t tp; mpi_size_t xsize; /*if (mpihelp_mul_n(xp, rp, rp, rsize) < 0) goto enomem */ @@ -207,9 +206,7 @@ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod) xsize = msize; } - tp = rp; - rp = xp; - xp = tp; + swap(rp, xp); rsize = xsize; if ((mpi_limb_signed_t) e < 0) { @@ -235,9 +232,7 @@ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod) xsize = msize; } - tp = rp; - rp = xp; - xp = tp; + swap(rp, xp); rsize = xsize; } e <<= 1; -- cgit From 70d57ffbb11cfe1a686e6051f36413866502492d Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sat, 15 Jun 2024 22:32:37 -0700 Subject: crypto: arm - add missing MODULE_DESCRIPTION() macros With ARCH=arm, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in arch/arm/crypto/aes-arm-bs.o WARNING: modpost: missing MODULE_DESCRIPTION() in arch/arm/crypto/crc32-arm-ce.o Add the missing invocation of the MODULE_DESCRIPTION() macro to all files which have a MODULE_LICENSE(). This includes crct10dif-ce-glue.c and curve25519-glue.c which, although they did not produce a warning with the arm allmodconfig configuration, may cause this warning with other configurations. 
Signed-off-by: Jeff Johnson Signed-off-by: Herbert Xu --- arch/arm/crypto/aes-neonbs-glue.c | 1 + arch/arm/crypto/crc32-ce-glue.c | 1 + arch/arm/crypto/crct10dif-ce-glue.c | 1 + arch/arm/crypto/curve25519-glue.c | 1 + 4 files changed, 4 insertions(+) diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c index f00f042ef357..201eb35dde37 100644 --- a/arch/arm/crypto/aes-neonbs-glue.c +++ b/arch/arm/crypto/aes-neonbs-glue.c @@ -17,6 +17,7 @@ #include MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_DESCRIPTION("Bit sliced AES using NEON instructions"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("ecb(aes)"); diff --git a/arch/arm/crypto/crc32-ce-glue.c b/arch/arm/crypto/crc32-ce-glue.c index 2208445808d7..4ff18044af07 100644 --- a/arch/arm/crypto/crc32-ce-glue.c +++ b/arch/arm/crypto/crc32-ce-glue.c @@ -241,6 +241,7 @@ module_init(crc32_pmull_mod_init); module_exit(crc32_pmull_mod_exit); MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_DESCRIPTION("Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("crc32"); MODULE_ALIAS_CRYPTO("crc32c"); diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c index e9191a8c87b9..79f3b204d8c0 100644 --- a/arch/arm/crypto/crct10dif-ce-glue.c +++ b/arch/arm/crypto/crct10dif-ce-glue.c @@ -84,5 +84,6 @@ module_init(crc_t10dif_mod_init); module_exit(crc_t10dif_mod_exit); MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_DESCRIPTION("Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("crct10dif"); diff --git a/arch/arm/crypto/curve25519-glue.c b/arch/arm/crypto/curve25519-glue.c index 9bdafd57888c..e7b87e09dd99 100644 --- a/arch/arm/crypto/curve25519-glue.c +++ b/arch/arm/crypto/curve25519-glue.c @@ -133,4 +133,5 @@ module_exit(arm_curve25519_exit); MODULE_ALIAS_CRYPTO("curve25519"); MODULE_ALIAS_CRYPTO("curve25519-neon"); +MODULE_DESCRIPTION("Public key crypto: Curve25519 (NEON-accelerated)"); MODULE_LICENSE("GPL v2"); -- cgit From 3cbe18b0bc9f0653709adbdadad04491a190c71a Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Sat, 15 Jun 2024 23:14:57 -0700 Subject: crypto: lib - add missing MODULE_DESCRIPTION() macros With ARCH=arm, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in lib/crypto/libsha256.o Add the missing invocation of the MODULE_DESCRIPTION() macro to all files which have a MODULE_LICENSE(). This includes sha1.c and utils.c which, although they did not produce a warning with the arm allmodconfig configuration, may cause this warning with other configurations. 
Signed-off-by: Jeff Johnson Signed-off-by: Herbert Xu --- lib/crypto/sha1.c | 1 + lib/crypto/sha256.c | 1 + lib/crypto/utils.c | 1 + 3 files changed, 3 insertions(+) diff --git a/lib/crypto/sha1.c b/lib/crypto/sha1.c index 1aebe7be9401..6d2922747cab 100644 --- a/lib/crypto/sha1.c +++ b/lib/crypto/sha1.c @@ -137,4 +137,5 @@ void sha1_init(__u32 *buf) } EXPORT_SYMBOL(sha1_init); +MODULE_DESCRIPTION("SHA-1 Algorithm"); MODULE_LICENSE("GPL"); diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 3ac1ef8677db..3f42d203c7bc 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -165,4 +165,5 @@ void sha256(const u8 *data, unsigned int len, u8 *out) } EXPORT_SYMBOL(sha256); +MODULE_DESCRIPTION("SHA-256 Algorithm"); MODULE_LICENSE("GPL"); diff --git a/lib/crypto/utils.c b/lib/crypto/utils.c index c852c7151b0a..373364141408 100644 --- a/lib/crypto/utils.c +++ b/lib/crypto/utils.c @@ -85,4 +85,5 @@ void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int len) } EXPORT_SYMBOL_GPL(__crypto_xor); +MODULE_DESCRIPTION("Crypto library utility functions"); MODULE_LICENSE("GPL"); -- cgit From ff33c2e6af99afcac3024a5c3ec8730d1e6b8ac7 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 10 Jun 2024 17:26:39 +0200 Subject: crypto: arm/crc32 - add kCFI annotations to asm routines The crc32/crc32c implementations using the scalar CRC32 instructions are accessed via indirect calls, and so they must be annotated with type ids in order to execute correctly when kCFI is enabled. Cc: Kees Cook Cc: Linus Walleij Signed-off-by: Ard Biesheuvel Reviewed-by: Kees Cook Reviewed-by: Linus Walleij Signed-off-by: Herbert Xu --- arch/arm/crypto/crc32-ce-core.S | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/arm/crypto/crc32-ce-core.S b/arch/arm/crypto/crc32-ce-core.S index 3f13a76b9066..88f9edf94e95 100644 --- a/arch/arm/crypto/crc32-ce-core.S +++ b/arch/arm/crypto/crc32-ce-core.S @@ -48,6 +48,7 @@ */ #include +#include #include .text @@ -123,11 +124,12 @@ * uint crc32_pmull_le(unsigned char const *buffer, * size_t len, uint crc32) */ -ENTRY(crc32_pmull_le) +SYM_FUNC_START(crc32_pmull_le) adr r3, .Lcrc32_constants b 0f +SYM_FUNC_END(crc32_pmull_le) -ENTRY(crc32c_pmull_le) +SYM_FUNC_START(crc32c_pmull_le) adr r3, .Lcrc32c_constants 0: bic LEN, LEN, #15 @@ -236,8 +238,7 @@ fold_64: vmov r0, s5 bx lr -ENDPROC(crc32_pmull_le) -ENDPROC(crc32c_pmull_le) +SYM_FUNC_END(crc32c_pmull_le) .macro __crc32, c subs ip, r2, #8 @@ -296,11 +297,11 @@ ARM_BE8(rev16 r3, r3 ) .endm .align 5 -ENTRY(crc32_armv8_le) +SYM_TYPED_FUNC_START(crc32_armv8_le) __crc32 -ENDPROC(crc32_armv8_le) +SYM_FUNC_END(crc32_armv8_le) .align 5 -ENTRY(crc32c_armv8_le) +SYM_TYPED_FUNC_START(crc32c_armv8_le) __crc32 c -ENDPROC(crc32c_armv8_le) +SYM_FUNC_END(crc32c_armv8_le) -- cgit From 7b3058eb3f3b04e534d6cda6820ef8c7cf519a79 Mon Sep 17 00:00:00 2001 From: Sergey Portnoy Date: Mon, 17 Jun 2024 17:08:29 +0100 Subject: crypto: tcrypt - add skcipher speed for given alg Allow to run skcipher speed for given algorithm. Case 600 is modified to cover ENCRYPT and DECRYPT directions. Example: modprobe tcrypt mode=600 alg="qat_aes_xts" klen=32 If succeed, the performance numbers will be printed in dmesg: testing speed of multibuffer qat_aes_xts (qat_aes_xts) encryption test 0 (256 bit key, 16 byte blocks): 1 operation in 14596 cycles (16 bytes) ... 
test 6 (256 bit key, 4096 byte blocks): 1 operation in 8053 cycles (4096 bytes) Signed-off-by: Sergey Portnoy Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 8aea416f6480..e9e7dceb606e 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -2613,6 +2613,15 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) break; case 600: + if (alg) { + u8 speed_template[2] = {klen, 0}; + test_mb_skcipher_speed(alg, ENCRYPT, sec, NULL, 0, + speed_template, num_mb); + test_mb_skcipher_speed(alg, DECRYPT, sec, NULL, 0, + speed_template, num_mb); + break; + } + test_mb_skcipher_speed("ecb(aes)", ENCRYPT, sec, NULL, 0, speed_template_16_24_32, num_mb); test_mb_skcipher_speed("ecb(aes)", DECRYPT, sec, NULL, 0, -- cgit From e0eece0cebe4b739c47abad958d17669ff8c005d Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Mon, 17 Jun 2024 15:33:17 -0500 Subject: crypto: deflate - Add aliases to deflate iaa_crypto depends on the deflate compression algorithm that's provided by deflate. If the algorithm is not available because CRYPTO_DEFLATE=m and deflate is not inserted, iaa_crypto will request "crypto-deflate-generic". Deflate will not be inserted because "crypto-deflate-generic" is not a valid alias. Add deflate-generic and crypto-deflate-generic aliases to deflate. Signed-off-by: Kyle Meyer Signed-off-by: Herbert Xu --- crypto/deflate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/crypto/deflate.c b/crypto/deflate.c index 6e31e0db0e86..98e8bcb81a6a 100644 --- a/crypto/deflate.c +++ b/crypto/deflate.c @@ -311,3 +311,4 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Deflate Compression Algorithm for IPCOMP"); MODULE_AUTHOR("James Morris "); MODULE_ALIAS_CRYPTO("deflate"); +MODULE_ALIAS_CRYPTO("deflate-generic"); -- cgit From ccacbbc3176277bbfc324f85fa827d1a2656bedf Mon Sep 17 00:00:00 2001 From: Jiwei Sun Date: Thu, 20 Jun 2024 16:51:10 +0800 Subject: crypto: qat - initialize user_input.lock for rate_limiting If the following configurations are set, CONFIG_DEBUG_RWSEMS=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_RWSEM_SPIN_ON_OWNER=y And run the following command, [root@localhost sys]# cat /sys/devices/pci0000:6b/0000:6b:00.0/qat_rl/pir The following warning log appears, ------------[ cut here ]------------ DEBUG_RWSEMS_WARN_ON(sem->magic != sem): count = 0x0, magic = 0x0, owner = 0x1, curr 0xff11000119288040, list not empty WARNING: CPU: 131 PID: 1254984 at kernel/locking/rwsem.c:1280 down_read+0x439/0x7f0 CPU: 131 PID: 1254984 Comm: cat Kdump: loaded Tainted: G W 6.10.0-rc4+ #86 b2ae60c8ceabed15f4fd2dba03c1c5a5f7f4040c Hardware name: Lenovo ThinkServer SR660 V3/SR660 V3, BIOS T8E166X-2.54 05/30/2024 RIP: 0010:down_read+0x439/0x7f0 Code: 44 24 10 80 3c 02 00 0f 85 05 03 00 00 48 8b 13 41 54 48 c7 c6 a0 3e 0e b4 48 c7 c7 e0 3e 0e b4 4c 8b 4c 24 08 e8 77 d5 40 fd <0f> 0b 59 e9 bc fc ff ff 0f 1f 44 00 00 e9 e2 fd ff ff 4c 8d 7b 08 RSP: 0018:ffa0000035f67a78 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ff1100012b03a658 RCX: 0000000000000000 RDX: 0000000080000002 RSI: 0000000000000008 RDI: 0000000000000001 RBP: 1ff4000006becf53 R08: fff3fc0006becf17 R09: fff3fc0006becf17 R10: fff3fc0006becf16 R11: ffa0000035f678b7 R12: ffffffffb40e3e60 R13: ffffffffb627d1f4 R14: ff1100012b03a6d0 R15: ff1100012b03a6c8 FS: 00007fa9ff9a6740(0000) GS:ff1100081e600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa9ff984000 CR3: 00000002118ae006 CR4: 0000000000771ef0 DR0: 0000000000000000 
DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: pir_show+0x5d/0xe0 [intel_qat 9e297e249ab040329cf58b657b06f418fd5c5855] dev_attr_show+0x3f/0xc0 sysfs_kf_seq_show+0x1ce/0x400 seq_read_iter+0x3fa/0x10b0 vfs_read+0x6f5/0xb20 ksys_read+0xe9/0x1d0 do_syscall_64+0x8a/0x170 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7fa9ff6fd9b2 Code: c0 e9 b2 fe ff ff 50 48 8d 3d ea 1d 0c 00 e8 c5 fd 01 00 0f 1f 44 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 0f 05 <48> 3d 00 f0 ff ff 77 56 c3 0f 1f 44 00 00 48 83 ec 28 48 89 54 24 RSP: 002b:00007ffc0616b968 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007fa9ff6fd9b2 RDX: 0000000000020000 RSI: 00007fa9ff985000 RDI: 0000000000000003 RBP: 00007fa9ff985000 R08: 00007fa9ff984010 R09: 0000000000000000 R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000022000 R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [] copy_process+0x21e6/0x6e70 softirqs last enabled at (0): [] copy_process+0x2236/0x6e70 softirqs last disabled at (0): [<0000000000000000>] 0x0 ---[ end trace 0000000000000000 ]--- The rate_limiting->user_input.lock rwsem lock is not initialized before use. Let's initialize it. Signed-off-by: Jiwei Sun Reviewed-by: Adrian Huang Reviewed-by: Giovanni Cabiddu Signed-off-by: Herbert Xu --- drivers/crypto/intel/qat/qat_common/adf_rl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/crypto/intel/qat/qat_common/adf_rl.c b/drivers/crypto/intel/qat/qat_common/adf_rl.c index 346ef8bee99d..e782c23fc1bf 100644 --- a/drivers/crypto/intel/qat/qat_common/adf_rl.c +++ b/drivers/crypto/intel/qat/qat_common/adf_rl.c @@ -1106,6 +1106,7 @@ int adf_rl_init(struct adf_accel_dev *accel_dev) mutex_init(&rl->rl_lock); rl->device_data = &accel_dev->hw_device->rl_data; rl->accel_dev = accel_dev; + init_rwsem(&rl->user_input.lock); accel_dev->rate_limiting = rl; err_ret: -- cgit From 70003f512c5ff488081460c664340b11b57b1293 Mon Sep 17 00:00:00 2001 From: Sam Protsenko Date: Thu, 20 Jun 2024 18:13:34 -0500 Subject: dt-bindings: rng: Add Exynos850 support to exynos-trng The TRNG block in Exynos850 is pretty much the same as in Exynos5250, but there are two clocks that has to be controlled to make it work: 1. Functional (operating) clock: called ACLK in Exynos850, the same as "secss" clock in Exynos5250 2. Interface (bus) clock: called PCLK in Exynos850. It has to be enabled in order to access TRNG registers Document Exynos850 compatible and the related clock changes. 
Signed-off-by: Sam Protsenko Reviewed-by: Krzysztof Kozlowski Signed-off-by: Herbert Xu --- .../bindings/rng/samsung,exynos5250-trng.yaml | 40 +++++++++++++++++++--- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/rng/samsung,exynos5250-trng.yaml b/Documentation/devicetree/bindings/rng/samsung,exynos5250-trng.yaml index 765d9f9edd6e..1a71935d8a19 100644 --- a/Documentation/devicetree/bindings/rng/samsung,exynos5250-trng.yaml +++ b/Documentation/devicetree/bindings/rng/samsung,exynos5250-trng.yaml @@ -12,14 +12,17 @@ maintainers: properties: compatible: - const: samsung,exynos5250-trng + enum: + - samsung,exynos5250-trng + - samsung,exynos850-trng clocks: - maxItems: 1 + minItems: 1 + maxItems: 2 clock-names: - items: - - const: secss + minItems: 1 + maxItems: 2 reg: maxItems: 1 @@ -30,6 +33,35 @@ required: - clock-names - reg +allOf: + - if: + properties: + compatible: + contains: + const: samsung,exynos850-trng + + then: + properties: + clocks: + items: + - description: SSS (Security Sub System) operating clock + - description: SSS (Security Sub System) bus clock + + clock-names: + items: + - const: secss + - const: pclk + + else: + properties: + clocks: + items: + - description: SSS (Security Sub System) operating clock + + clock-names: + items: + - const: secss + additionalProperties: false examples: -- cgit From 76536caabedbeafb01d364690914c88ebadab5b3 Mon Sep 17 00:00:00 2001 From: Sam Protsenko Date: Thu, 20 Jun 2024 18:13:35 -0500 Subject: hwrng: exynos - Improve coding style MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix obvious style issues. Some of those were found with checkpatch, and some just contradict the kernel coding style guide. No functional change. 
Signed-off-by: Sam Protsenko Reviewed-by: Krzysztof Kozlowski Acked-by: Łukasz Stelmach Signed-off-by: Herbert Xu --- drivers/char/hw_random/exynos-trng.c | 63 +++++++++++++++++------------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/drivers/char/hw_random/exynos-trng.c b/drivers/char/hw_random/exynos-trng.c index 0ed5d22fe667..266bdad84f3c 100644 --- a/drivers/char/hw_random/exynos-trng.c +++ b/drivers/char/hw_random/exynos-trng.c @@ -23,45 +23,41 @@ #include #include -#define EXYNOS_TRNG_CLKDIV (0x0) - -#define EXYNOS_TRNG_CTRL (0x20) -#define EXYNOS_TRNG_CTRL_RNGEN BIT(31) - -#define EXYNOS_TRNG_POST_CTRL (0x30) -#define EXYNOS_TRNG_ONLINE_CTRL (0x40) -#define EXYNOS_TRNG_ONLINE_STAT (0x44) -#define EXYNOS_TRNG_ONLINE_MAXCHI2 (0x48) -#define EXYNOS_TRNG_FIFO_CTRL (0x50) -#define EXYNOS_TRNG_FIFO_0 (0x80) -#define EXYNOS_TRNG_FIFO_1 (0x84) -#define EXYNOS_TRNG_FIFO_2 (0x88) -#define EXYNOS_TRNG_FIFO_3 (0x8c) -#define EXYNOS_TRNG_FIFO_4 (0x90) -#define EXYNOS_TRNG_FIFO_5 (0x94) -#define EXYNOS_TRNG_FIFO_6 (0x98) -#define EXYNOS_TRNG_FIFO_7 (0x9c) -#define EXYNOS_TRNG_FIFO_LEN (8) -#define EXYNOS_TRNG_CLOCK_RATE (500000) - +#define EXYNOS_TRNG_CLKDIV 0x0 + +#define EXYNOS_TRNG_CTRL 0x20 +#define EXYNOS_TRNG_CTRL_RNGEN BIT(31) + +#define EXYNOS_TRNG_POST_CTRL 0x30 +#define EXYNOS_TRNG_ONLINE_CTRL 0x40 +#define EXYNOS_TRNG_ONLINE_STAT 0x44 +#define EXYNOS_TRNG_ONLINE_MAXCHI2 0x48 +#define EXYNOS_TRNG_FIFO_CTRL 0x50 +#define EXYNOS_TRNG_FIFO_0 0x80 +#define EXYNOS_TRNG_FIFO_1 0x84 +#define EXYNOS_TRNG_FIFO_2 0x88 +#define EXYNOS_TRNG_FIFO_3 0x8c +#define EXYNOS_TRNG_FIFO_4 0x90 +#define EXYNOS_TRNG_FIFO_5 0x94 +#define EXYNOS_TRNG_FIFO_6 0x98 +#define EXYNOS_TRNG_FIFO_7 0x9c +#define EXYNOS_TRNG_FIFO_LEN 8 +#define EXYNOS_TRNG_CLOCK_RATE 500000 struct exynos_trng_dev { - struct device *dev; - void __iomem *mem; - struct clk *clk; - struct hwrng rng; + struct device *dev; + void __iomem *mem; + struct clk *clk; + struct hwrng rng; }; static int exynos_trng_do_read(struct hwrng *rng, void *data, size_t max, bool wait) { - struct exynos_trng_dev *trng; + struct exynos_trng_dev *trng = (struct exynos_trng_dev *)rng->priv; int val; max = min_t(size_t, max, (EXYNOS_TRNG_FIFO_LEN * 4)); - - trng = (struct exynos_trng_dev *)rng->priv; - writel_relaxed(max * 8, trng->mem + EXYNOS_TRNG_FIFO_CTRL); val = readl_poll_timeout(trng->mem + EXYNOS_TRNG_FIFO_CTRL, val, val == 0, 200, 1000000); @@ -87,7 +83,7 @@ static int exynos_trng_init(struct hwrng *rng) */ val = sss_rate / (EXYNOS_TRNG_CLOCK_RATE * 2); if (val > 0x7fff) { - dev_err(trng->dev, "clock divider too large: %d", val); + dev_err(trng->dev, "clock divider too large: %d\n", val); return -ERANGE; } val = val << 1; @@ -122,7 +118,7 @@ static int exynos_trng_probe(struct platform_device *pdev) trng->rng.init = exynos_trng_init; trng->rng.read = exynos_trng_do_read; - trng->rng.priv = (unsigned long) trng; + trng->rng.priv = (unsigned long)trng; platform_set_drvdata(pdev, trng); trng->dev = &pdev->dev; @@ -175,7 +171,7 @@ err_pm_get: static void exynos_trng_remove(struct platform_device *pdev) { - struct exynos_trng_dev *trng = platform_get_drvdata(pdev); + struct exynos_trng_dev *trng = platform_get_drvdata(pdev); clk_disable_unprepare(trng->clk); @@ -204,7 +200,7 @@ static int exynos_trng_resume(struct device *dev) } static DEFINE_SIMPLE_DEV_PM_OPS(exynos_trng_pm_ops, exynos_trng_suspend, - exynos_trng_resume); + exynos_trng_resume); static const struct of_device_id exynos_trng_dt_match[] = { { @@ -225,6 +221,7 @@ static 
struct platform_driver exynos_trng_driver = { }; module_platform_driver(exynos_trng_driver); + MODULE_AUTHOR("Łukasz Stelmach"); MODULE_DESCRIPTION("H/W TRNG driver for Exynos chips"); MODULE_LICENSE("GPL v2"); -- cgit From 81da8056e92bd255178413d36382653ed5a1a230 Mon Sep 17 00:00:00 2001 From: Sam Protsenko Date: Thu, 20 Jun 2024 18:13:36 -0500 Subject: hwrng: exynos - Use devm_clk_get_enabled() to get the clock Use devm_clk_get_enabled() helper instead of calling devm_clk_get() and then clk_prepare_enable(). It simplifies the error handling and makes the code more compact. Also use dev_err_probe() to handle possible -EPROBE_DEFER errors if the clock is not available yet. Signed-off-by: Sam Protsenko Reviewed-by: Krzysztof Kozlowski Reviewed-by: Anand Moon Signed-off-by: Herbert Xu --- drivers/char/hw_random/exynos-trng.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/drivers/char/hw_random/exynos-trng.c b/drivers/char/hw_random/exynos-trng.c index 266bdad84f3c..997bd22f4498 100644 --- a/drivers/char/hw_random/exynos-trng.c +++ b/drivers/char/hw_random/exynos-trng.c @@ -134,32 +134,23 @@ static int exynos_trng_probe(struct platform_device *pdev) goto err_pm_get; } - trng->clk = devm_clk_get(&pdev->dev, "secss"); + trng->clk = devm_clk_get_enabled(&pdev->dev, "secss"); if (IS_ERR(trng->clk)) { - ret = PTR_ERR(trng->clk); - dev_err(&pdev->dev, "Could not get clock.\n"); - goto err_clock; - } - - ret = clk_prepare_enable(trng->clk); - if (ret) { - dev_err(&pdev->dev, "Could not enable the clk.\n"); + ret = dev_err_probe(&pdev->dev, PTR_ERR(trng->clk), + "Could not get clock\n"); goto err_clock; } ret = devm_hwrng_register(&pdev->dev, &trng->rng); if (ret) { dev_err(&pdev->dev, "Could not register hwrng device.\n"); - goto err_register; + goto err_clock; } dev_info(&pdev->dev, "Exynos True Random Number Generator.\n"); return 0; -err_register: - clk_disable_unprepare(trng->clk); - err_clock: pm_runtime_put_noidle(&pdev->dev); @@ -171,10 +162,6 @@ err_pm_get: static void exynos_trng_remove(struct platform_device *pdev) { - struct exynos_trng_dev *trng = platform_get_drvdata(pdev); - - clk_disable_unprepare(trng->clk); - pm_runtime_put_sync(&pdev->dev); pm_runtime_disable(&pdev->dev); } -- cgit From e003d67067043488595f33f3a82230a4281686ca Mon Sep 17 00:00:00 2001 From: Sam Protsenko Date: Thu, 20 Jun 2024 18:13:37 -0500 Subject: hwrng: exynos - Implement bus clock control Some SoCs like Exynos850 might require the SSS bus clock (PCLK) to be enabled in order to access TRNG registers. Add and handle the optional PCLK clock accordingly to make it possible. 
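The clock handling that the previous patch and the optional PCLK bus clock just described converge on can be summarised as a minimal probe routine. This is an illustrative sketch under assumed names (demo_trng_probe() is made up), not the driver code verbatim:

#include <linux/clk.h>
#include <linux/dev_printk.h>
#include <linux/err.h>
#include <linux/platform_device.h>

static int demo_trng_probe(struct platform_device *pdev)
{
	struct clk *clk, *pclk;

	/* Required operating clock: devm-managed and already enabled on success. */
	clk = devm_clk_get_enabled(&pdev->dev, "secss");
	if (IS_ERR(clk))
		return dev_err_probe(&pdev->dev, PTR_ERR(clk),
				     "Could not get clock\n");

	/* Optional bus clock: returns NULL, not an error, when absent in DT. */
	pclk = devm_clk_get_optional_enabled(&pdev->dev, "pclk");
	if (IS_ERR(pclk))
		return dev_err_probe(&pdev->dev, PTR_ERR(pclk),
				     "Could not get pclk\n");

	return 0;
}

Both helpers tie the clock lifetime to the device, which is what lets the patches below drop the explicit clk_disable_unprepare() calls from the error and remove paths.
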
Signed-off-by: Sam Protsenko Reviewed-by: Krzysztof Kozlowski Reviewed-by: Anand Moon Signed-off-by: Herbert Xu --- drivers/char/hw_random/exynos-trng.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/char/hw_random/exynos-trng.c b/drivers/char/hw_random/exynos-trng.c index 997bd22f4498..6ef2ee6c9804 100644 --- a/drivers/char/hw_random/exynos-trng.c +++ b/drivers/char/hw_random/exynos-trng.c @@ -47,7 +47,8 @@ struct exynos_trng_dev { struct device *dev; void __iomem *mem; - struct clk *clk; + struct clk *clk; /* operating clock */ + struct clk *pclk; /* bus clock */ struct hwrng rng; }; @@ -141,6 +142,13 @@ static int exynos_trng_probe(struct platform_device *pdev) goto err_clock; } + trng->pclk = devm_clk_get_optional_enabled(&pdev->dev, "pclk"); + if (IS_ERR(trng->pclk)) { + ret = dev_err_probe(&pdev->dev, PTR_ERR(trng->pclk), + "Could not get pclk\n"); + goto err_clock; + } + ret = devm_hwrng_register(&pdev->dev, &trng->rng); if (ret) { dev_err(&pdev->dev, "Could not register hwrng device.\n"); -- cgit From 10bb6ac8f86f4b65ef8d227504868a61e0bcb148 Mon Sep 17 00:00:00 2001 From: Sam Protsenko Date: Thu, 20 Jun 2024 18:13:38 -0500 Subject: hwrng: exynos - Add SMC based TRNG operation On some Exynos chips like Exynos850 the access to Security Sub System (SSS) registers is protected with TrustZone, and therefore only possible from EL3 monitor software. The Linux kernel is running in EL1, so the only way for the driver to obtain TRNG data is via SMC calls to EL3 monitor. Implement such SMC operation and use it when EXYNOS_SMC flag is set in the corresponding chip driver data. Signed-off-by: Sam Protsenko Reviewed-by: Krzysztof Kozlowski Signed-off-by: Herbert Xu --- drivers/char/hw_random/exynos-trng.c | 140 ++++++++++++++++++++++++++++++++--- 1 file changed, 130 insertions(+), 10 deletions(-) diff --git a/drivers/char/hw_random/exynos-trng.c b/drivers/char/hw_random/exynos-trng.c index 6ef2ee6c9804..9fa30583cc86 100644 --- a/drivers/char/hw_random/exynos-trng.c +++ b/drivers/char/hw_random/exynos-trng.c @@ -10,6 +10,7 @@ * Krzysztof Kozłowski */ +#include #include #include #include @@ -22,6 +23,7 @@ #include #include #include +#include #define EXYNOS_TRNG_CLKDIV 0x0 @@ -44,16 +46,41 @@ #define EXYNOS_TRNG_FIFO_LEN 8 #define EXYNOS_TRNG_CLOCK_RATE 500000 +/* Driver feature flags */ +#define EXYNOS_SMC BIT(0) + +#define EXYNOS_SMC_CALL_VAL(func_num) \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_SIP, \ + func_num) + +/* SMC command for DTRNG access */ +#define SMC_CMD_RANDOM EXYNOS_SMC_CALL_VAL(0x1012) + +/* SMC_CMD_RANDOM: arguments */ +#define HWRNG_INIT 0x0 +#define HWRNG_EXIT 0x1 +#define HWRNG_GET_DATA 0x2 +#define HWRNG_RESUME 0x3 + +/* SMC_CMD_RANDOM: return values */ +#define HWRNG_RET_OK 0x0 +#define HWRNG_RET_RETRY_ERROR 0x2 + +#define HWRNG_MAX_TRIES 100 + struct exynos_trng_dev { struct device *dev; void __iomem *mem; struct clk *clk; /* operating clock */ struct clk *pclk; /* bus clock */ struct hwrng rng; + unsigned long flags; }; -static int exynos_trng_do_read(struct hwrng *rng, void *data, size_t max, - bool wait) +static int exynos_trng_do_read_reg(struct hwrng *rng, void *data, size_t max, + bool wait) { struct exynos_trng_dev *trng = (struct exynos_trng_dev *)rng->priv; int val; @@ -70,7 +97,40 @@ static int exynos_trng_do_read(struct hwrng *rng, void *data, size_t max, return max; } -static int exynos_trng_init(struct hwrng *rng) +static int exynos_trng_do_read_smc(struct hwrng *rng, void *data, 
size_t max, + bool wait) +{ + struct arm_smccc_res res; + unsigned int copied = 0; + u32 *buf = data; + int tries = 0; + + while (copied < max) { + arm_smccc_smc(SMC_CMD_RANDOM, HWRNG_GET_DATA, 0, 0, 0, 0, 0, 0, + &res); + switch (res.a0) { + case HWRNG_RET_OK: + *buf++ = res.a2; + *buf++ = res.a3; + copied += 8; + tries = 0; + break; + case HWRNG_RET_RETRY_ERROR: + if (!wait) + return copied; + if (++tries >= HWRNG_MAX_TRIES) + return copied; + cond_resched(); + break; + default: + return -EIO; + } + } + + return copied; +} + +static int exynos_trng_init_reg(struct hwrng *rng) { struct exynos_trng_dev *trng = (struct exynos_trng_dev *)rng->priv; unsigned long sss_rate; @@ -103,6 +163,24 @@ static int exynos_trng_init(struct hwrng *rng) return 0; } +static int exynos_trng_init_smc(struct hwrng *rng) +{ + struct exynos_trng_dev *trng = (struct exynos_trng_dev *)rng->priv; + struct arm_smccc_res res; + int ret = 0; + + arm_smccc_smc(SMC_CMD_RANDOM, HWRNG_INIT, 0, 0, 0, 0, 0, 0, &res); + if (res.a0 != HWRNG_RET_OK) { + dev_err(trng->dev, "SMC command for TRNG init failed (%d)\n", + (int)res.a0); + ret = -EIO; + } + if ((int)res.a0 == -1) + dev_info(trng->dev, "Make sure LDFW is loaded by your BL\n"); + + return ret; +} + static int exynos_trng_probe(struct platform_device *pdev) { struct exynos_trng_dev *trng; @@ -112,21 +190,29 @@ static int exynos_trng_probe(struct platform_device *pdev) if (!trng) return ret; + platform_set_drvdata(pdev, trng); + trng->dev = &pdev->dev; + + trng->flags = (unsigned long)device_get_match_data(&pdev->dev); + trng->rng.name = devm_kstrdup(&pdev->dev, dev_name(&pdev->dev), GFP_KERNEL); if (!trng->rng.name) return ret; - trng->rng.init = exynos_trng_init; - trng->rng.read = exynos_trng_do_read; trng->rng.priv = (unsigned long)trng; - platform_set_drvdata(pdev, trng); - trng->dev = &pdev->dev; + if (trng->flags & EXYNOS_SMC) { + trng->rng.init = exynos_trng_init_smc; + trng->rng.read = exynos_trng_do_read_smc; + } else { + trng->rng.init = exynos_trng_init_reg; + trng->rng.read = exynos_trng_do_read_reg; - trng->mem = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(trng->mem)) - return PTR_ERR(trng->mem); + trng->mem = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(trng->mem)) + return PTR_ERR(trng->mem); + } pm_runtime_enable(&pdev->dev); ret = pm_runtime_resume_and_get(&pdev->dev); @@ -170,12 +256,31 @@ err_pm_get: static void exynos_trng_remove(struct platform_device *pdev) { + struct exynos_trng_dev *trng = platform_get_drvdata(pdev); + + if (trng->flags & EXYNOS_SMC) { + struct arm_smccc_res res; + + arm_smccc_smc(SMC_CMD_RANDOM, HWRNG_EXIT, 0, 0, 0, 0, 0, 0, + &res); + } + pm_runtime_put_sync(&pdev->dev); pm_runtime_disable(&pdev->dev); } static int exynos_trng_suspend(struct device *dev) { + struct exynos_trng_dev *trng = dev_get_drvdata(dev); + struct arm_smccc_res res; + + if (trng->flags & EXYNOS_SMC) { + arm_smccc_smc(SMC_CMD_RANDOM, HWRNG_EXIT, 0, 0, 0, 0, 0, 0, + &res); + if (res.a0 != HWRNG_RET_OK) + return -EIO; + } + pm_runtime_put_sync(dev); return 0; @@ -183,6 +288,7 @@ static int exynos_trng_suspend(struct device *dev) static int exynos_trng_resume(struct device *dev) { + struct exynos_trng_dev *trng = dev_get_drvdata(dev); int ret; ret = pm_runtime_resume_and_get(dev); @@ -191,6 +297,20 @@ static int exynos_trng_resume(struct device *dev) return ret; } + if (trng->flags & EXYNOS_SMC) { + struct arm_smccc_res res; + + arm_smccc_smc(SMC_CMD_RANDOM, HWRNG_RESUME, 0, 0, 0, 0, 0, 0, + &res); + if (res.a0 != HWRNG_RET_OK) + return -EIO; 
+ + arm_smccc_smc(SMC_CMD_RANDOM, HWRNG_INIT, 0, 0, 0, 0, 0, 0, + &res); + if (res.a0 != HWRNG_RET_OK) + return -EIO; + } + return 0; } -- cgit From b0c2036df8868b97176f9a97b777846620c9a74d Mon Sep 17 00:00:00 2001 From: Sam Protsenko Date: Thu, 20 Jun 2024 18:13:39 -0500 Subject: hwrng: exynos - Enable Exynos850 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Exynos850 compatible and its driver data. It's only possible to access TRNG block via SMC calls in Exynos850, so specify that fact using EXYNOS_SMC flag in the driver data. Signed-off-by: Sam Protsenko Reviewed-by: Krzysztof Kozlowski Acked-by: Łukasz Stelmach Signed-off-by: Herbert Xu --- drivers/char/hw_random/exynos-trng.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/char/hw_random/exynos-trng.c b/drivers/char/hw_random/exynos-trng.c index 9fa30583cc86..9f039fddaee3 100644 --- a/drivers/char/hw_random/exynos-trng.c +++ b/drivers/char/hw_random/exynos-trng.c @@ -320,6 +320,9 @@ static DEFINE_SIMPLE_DEV_PM_OPS(exynos_trng_pm_ops, exynos_trng_suspend, static const struct of_device_id exynos_trng_dt_match[] = { { .compatible = "samsung,exynos5250-trng", + }, { + .compatible = "samsung,exynos850-trng", + .data = (void *)EXYNOS_SMC, }, { }, }; -- cgit From 95c0f5c3b8bb7acdc5c4f04bc6a7d3f40d319e9e Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Fri, 21 Jun 2024 17:02:24 +0200 Subject: hwrng: core - Fix wrong quality calculation at hw rng registration When there are rng sources registering at the hwrng core via hwrng_register() a struct hwrng is delivered. There is a quality field in there which is used to decide which of the registered hw rng sources will be used by the hwrng core. With commit 16bdbae39428 ("hwrng: core - treat default_quality as a maximum and default to 1024") there came in a new default of 1024 in case this field is empty and all the known hw rng sources at that time had been reworked to not fill this field and thus use the default of 1024. The code choosing the 'better' hw rng source during registration of a new hw rng source has never been adapted to this and thus used 0 if the hw rng implementation does not fill the quality field. So when two rng sources register, one with 0 (meaning 1024) and the other one with 999, the 999 hw rng will be chosen. As the later invoked function hwrng_init() anyway adjusts the quality field of the hw rng source, this adjustment is now done during registration of this new hw rng source. Tested on s390 with two hardware rng sources: crypto cards and trng true random generator device driver. 
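The selection logic described above is easy to reproduce in isolation. The stand-alone sketch below (plain userspace C, made-up helper name) shows why an unset quality of 0 has to be promoted to the 1024 default before two candidate sources are compared; otherwise a source reporting 999 wrongly wins over one that left the field empty.

#include <stdio.h>

static unsigned short effective_quality(unsigned short default_quality,
					unsigned short quality)
{
	unsigned short q = quality ? quality : 1024;	/* 0 means "use the 1024 default" */
	unsigned short cap = default_quality < 1024 ? default_quality : 1024;

	return q < cap ? q : cap;
}

int main(void)
{
	/* Two sources registering: one with an unset quality field, one with 999. */
	printf("unset (0) -> %u\n", effective_quality(1024, 0));	/* 1024 */
	printf("999       -> %u\n", effective_quality(1024, 999));	/* 999  */
	return 0;
}
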
Fixes: 16bdbae39428 ("hwrng: core - treat default_quality as a maximum and default to 1024") Reported-by: Christian Rund Suggested-by: Herbert Xu Signed-off-by: Harald Freudenberger Signed-off-by: Herbert Xu --- drivers/char/hw_random/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index f5c71a617a99..d8bc69025ae1 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -174,7 +174,6 @@ static int hwrng_init(struct hwrng *rng) reinit_completion(&rng->cleanup_done); skip_init: - rng->quality = min_t(u16, min_t(u16, default_quality, 1024), rng->quality ?: 1024); current_quality = rng->quality; /* obsolete */ return 0; @@ -563,6 +562,9 @@ int hwrng_register(struct hwrng *rng) complete(&rng->cleanup_done); init_completion(&rng->dying); + /* Adjust quality field to always have a proper value */ + rng->quality = min_t(u16, min_t(u16, default_quality, 1024), rng->quality ?: 1024); + if (!current_rng || (!cur_rng_set_by_user && rng->quality > current_rng->quality)) { /* -- cgit From 996f8a9654d0c7b8742379d4e2f1052fccac6643 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Tue, 25 Jun 2024 00:21:07 +0100 Subject: dt-bindings: crypto: sun8i-ce: Add compatible for H616 The Allwinner H616 has a crypto engine very similar to the one in the H6, although all addresses in the DMA descriptors are shifted by 2 bits, to accommodate for the larger physical address space. That makes it incompatible to the H6 variant, and thus requires a new compatible string. Clock wise it relies on the internal oscillator for the TRNG, so needs all four possible clocks specified. Add the compatible string to the list of recognised names, and add the H616 to list of devices requiring all four clocks. Signed-off-by: Andre Przywara Acked-by: Krzysztof Kozlowski Acked-by: Chen-Yu Tsai Signed-off-by: Herbert Xu --- Documentation/devicetree/bindings/crypto/allwinner,sun8i-ce.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/crypto/allwinner,sun8i-ce.yaml b/Documentation/devicetree/bindings/crypto/allwinner,sun8i-ce.yaml index 4287678aa79f..da47b601c165 100644 --- a/Documentation/devicetree/bindings/crypto/allwinner,sun8i-ce.yaml +++ b/Documentation/devicetree/bindings/crypto/allwinner,sun8i-ce.yaml @@ -18,6 +18,7 @@ properties: - allwinner,sun50i-a64-crypto - allwinner,sun50i-h5-crypto - allwinner,sun50i-h6-crypto + - allwinner,sun50i-h616-crypto reg: maxItems: 1 @@ -49,6 +50,7 @@ if: compatible: enum: - allwinner,sun20i-d1-crypto + - allwinner,sun50i-h616-crypto then: properties: clocks: -- cgit From e0740bee6c21c209191f55d8dfff7c16aeb3578a Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Tue, 25 Jun 2024 00:21:08 +0100 Subject: crypto: sun8i-ce - wrap accesses to descriptor address fields The Allwinner H616 (and later) SoCs support more than 32 bits worth of physical addresses. To accommodate the larger address space, the CE task descriptor fields holding addresses are now encoded as "word addresses", so take the actual address divided by four. This is true for the fields within the descriptor, but also for the descriptor base address, in the CE_TDA register. Wrap all accesses to those fields in a function, which will do the required division if needed. For now this in unused, so there should be no change in behaviour. 
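A worked example makes the word-address encoding concrete. The stand-alone sketch below (plain C with made-up values, not the driver's dma_addr_t plumbing) shows how dividing a 4-byte-aligned 34-bit physical address by four makes it fit the 32-bit descriptor field, whereas writing the byte address as-is would silently truncate it.

#include <stdint.h>
#include <stdio.h>

static uint32_t desc_addr_val(int needs_word_addresses, uint64_t addr)
{
	return needs_word_addresses ? (uint32_t)(addr / 4) : (uint32_t)addr;
}

int main(void)
{
	uint64_t phys = 0x3C0001000ULL;		/* example 34-bit DMA address */

	printf("byte address written as-is: 0x%08x (truncated)\n",
	       desc_addr_val(0, phys));
	printf("word address (addr / 4)   : 0x%08x\n",
	       desc_addr_val(1, phys));
	return 0;
}
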
Signed-off-by: Andre Przywara Reviewed-by: Chen-Yu Tsai Signed-off-by: Herbert Xu --- drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c | 8 ++++---- drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c | 2 +- drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c | 6 +++--- drivers/crypto/allwinner/sun8i-ce/sun8i-ce-prng.c | 6 +++--- drivers/crypto/allwinner/sun8i-ce/sun8i-ce-trng.c | 2 +- drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h | 15 +++++++++++++++ 6 files changed, 27 insertions(+), 12 deletions(-) diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c index de50c00ba218..19b7fb4a93e8 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c @@ -190,7 +190,7 @@ static int sun8i_ce_cipher_prepare(struct crypto_engine *engine, void *async_req err = -EFAULT; goto theend; } - cet->t_key = cpu_to_le32(rctx->addr_key); + cet->t_key = desc_addr_val_le32(ce, rctx->addr_key); ivsize = crypto_skcipher_ivsize(tfm); if (areq->iv && crypto_skcipher_ivsize(tfm) > 0) { @@ -208,7 +208,7 @@ static int sun8i_ce_cipher_prepare(struct crypto_engine *engine, void *async_req err = -ENOMEM; goto theend_iv; } - cet->t_iv = cpu_to_le32(rctx->addr_iv); + cet->t_iv = desc_addr_val_le32(ce, rctx->addr_iv); } if (areq->src == areq->dst) { @@ -236,7 +236,7 @@ static int sun8i_ce_cipher_prepare(struct crypto_engine *engine, void *async_req len = areq->cryptlen; for_each_sg(areq->src, sg, nr_sgs, i) { - cet->t_src[i].addr = cpu_to_le32(sg_dma_address(sg)); + cet->t_src[i].addr = desc_addr_val_le32(ce, sg_dma_address(sg)); todo = min(len, sg_dma_len(sg)); cet->t_src[i].len = cpu_to_le32(todo / 4); dev_dbg(ce->dev, "%s total=%u SG(%d %u off=%d) todo=%u\n", __func__, @@ -251,7 +251,7 @@ static int sun8i_ce_cipher_prepare(struct crypto_engine *engine, void *async_req len = areq->cryptlen; for_each_sg(areq->dst, sg, nr_sgd, i) { - cet->t_dst[i].addr = cpu_to_le32(sg_dma_address(sg)); + cet->t_dst[i].addr = desc_addr_val_le32(ce, sg_dma_address(sg)); todo = min(len, sg_dma_len(sg)); cet->t_dst[i].len = cpu_to_le32(todo / 4); dev_dbg(ce->dev, "%s total=%u SG(%d %u off=%d) todo=%u\n", __func__, diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c index 0408b2d5d533..6d45c1e559f7 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c @@ -172,7 +172,7 @@ int sun8i_ce_run_task(struct sun8i_ce_dev *ce, int flow, const char *name) writel(v, ce->base + CE_ICR); reinit_completion(&ce->chanlist[flow].complete); - writel(ce->chanlist[flow].t_phy, ce->base + CE_TDQ); + writel(desc_addr_val(ce, ce->chanlist[flow].t_phy), ce->base + CE_TDQ); ce->chanlist[flow].status = 0; /* Be sure all data is written before enabling the task */ diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c index ee2a28c906ed..6072dd9f390b 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c @@ -403,7 +403,7 @@ int sun8i_ce_hash_run(struct crypto_engine *engine, void *breq) len = areq->nbytes; for_each_sg(areq->src, sg, nr_sgs, i) { - cet->t_src[i].addr = cpu_to_le32(sg_dma_address(sg)); + cet->t_src[i].addr = desc_addr_val_le32(ce, sg_dma_address(sg)); todo = min(len, sg_dma_len(sg)); cet->t_src[i].len = cpu_to_le32(todo / 4); len -= todo; @@ -414,7 +414,7 @@ int sun8i_ce_hash_run(struct 
crypto_engine *engine, void *breq) goto theend; } addr_res = dma_map_single(ce->dev, result, digestsize, DMA_FROM_DEVICE); - cet->t_dst[0].addr = cpu_to_le32(addr_res); + cet->t_dst[0].addr = desc_addr_val_le32(ce, addr_res); cet->t_dst[0].len = cpu_to_le32(digestsize / 4); if (dma_mapping_error(ce->dev, addr_res)) { dev_err(ce->dev, "DMA map dest\n"); @@ -445,7 +445,7 @@ int sun8i_ce_hash_run(struct crypto_engine *engine, void *breq) } addr_pad = dma_map_single(ce->dev, buf, j * 4, DMA_TO_DEVICE); - cet->t_src[i].addr = cpu_to_le32(addr_pad); + cet->t_src[i].addr = desc_addr_val_le32(ce, addr_pad); cet->t_src[i].len = cpu_to_le32(j); if (dma_mapping_error(ce->dev, addr_pad)) { dev_err(ce->dev, "DMA error on padding SG\n"); diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-prng.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-prng.c index 80815379f6fc..762459867b6c 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-prng.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-prng.c @@ -132,10 +132,10 @@ int sun8i_ce_prng_generate(struct crypto_rng *tfm, const u8 *src, cet->t_sym_ctl = cpu_to_le32(sym); cet->t_asym_ctl = 0; - cet->t_key = cpu_to_le32(dma_iv); - cet->t_iv = cpu_to_le32(dma_iv); + cet->t_key = desc_addr_val_le32(ce, dma_iv); + cet->t_iv = desc_addr_val_le32(ce, dma_iv); - cet->t_dst[0].addr = cpu_to_le32(dma_dst); + cet->t_dst[0].addr = desc_addr_val_le32(ce, dma_dst); cet->t_dst[0].len = cpu_to_le32(todo / 4); ce->chanlist[flow].timeout = 2000; diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-trng.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-trng.c index 9c35f2a83eda..e1e8bc15202e 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-trng.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-trng.c @@ -77,7 +77,7 @@ static int sun8i_ce_trng_read(struct hwrng *rng, void *data, size_t max, bool wa cet->t_sym_ctl = 0; cet->t_asym_ctl = 0; - cet->t_dst[0].addr = cpu_to_le32(dma_dst); + cet->t_dst[0].addr = desc_addr_val_le32(ce, dma_dst); cet->t_dst[0].len = cpu_to_le32(todo / 4); ce->chanlist[flow].timeout = todo; diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h index 93d4985def87..3b5c2af013d0 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h @@ -149,6 +149,7 @@ struct ce_variant { bool hash_t_dlen_in_bits; bool prng_t_dlen_in_bytes; bool trng_t_dlen_in_bytes; + bool needs_word_addresses; struct ce_clock ce_clks[CE_MAX_CLOCKS]; int esr; unsigned char prng; @@ -241,6 +242,20 @@ struct sun8i_ce_dev { #endif }; +static inline u32 desc_addr_val(struct sun8i_ce_dev *dev, dma_addr_t addr) +{ + if (dev->variant->needs_word_addresses) + return addr / 4; + + return addr; +} + +static inline __le32 desc_addr_val_le32(struct sun8i_ce_dev *dev, + dma_addr_t addr) +{ + return cpu_to_le32(desc_addr_val(dev, addr)); +} + /* * struct sun8i_cipher_req_ctx - context for a skcipher request * @op_dir: direction (encrypt vs decrypt) for this request -- cgit From 1611f74974d8b64711375bcd5dc33d80f28104cb Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Tue, 25 Jun 2024 00:21:09 +0100 Subject: crypto: sun8i-ce - add Allwinner H616 support The crypto engine in the Allwinner H616 is very similar to the H6, but needs the base address for the task descriptor and the addresses within it to be expressed in words, not in bytes. Add a new variant struct entry for the H616, and set the new flag to mark the use of 34 bit addresses. 
Also the internal 32K oscillator is required for TRNG operation, so specify all four clocks. Signed-off-by: Andre Przywara Reviewed-by: Chen-Yu Tsai Tested-by: Ryan Walklin Tested-by: Philippe Simons Signed-off-by: Herbert Xu --- drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c | 26 +++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c index 6d45c1e559f7..e55e58e164db 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c @@ -92,6 +92,30 @@ static const struct ce_variant ce_h6_variant = { .trng = CE_ALG_TRNG_V2, }; +static const struct ce_variant ce_h616_variant = { + .alg_cipher = { CE_ALG_AES, CE_ALG_DES, CE_ALG_3DES, + }, + .alg_hash = { CE_ALG_MD5, CE_ALG_SHA1, CE_ALG_SHA224, CE_ALG_SHA256, + CE_ALG_SHA384, CE_ALG_SHA512 + }, + .op_mode = { CE_OP_ECB, CE_OP_CBC + }, + .cipher_t_dlen_in_bytes = true, + .hash_t_dlen_in_bits = true, + .prng_t_dlen_in_bytes = true, + .trng_t_dlen_in_bytes = true, + .needs_word_addresses = true, + .ce_clks = { + { "bus", 0, 200000000 }, + { "mod", 300000000, 0 }, + { "ram", 0, 400000000 }, + { "trng", 0, 0 }, + }, + .esr = ESR_H6, + .prng = CE_ALG_PRNG_V2, + .trng = CE_ALG_TRNG_V2, +}; + static const struct ce_variant ce_a64_variant = { .alg_cipher = { CE_ALG_AES, CE_ALG_DES, CE_ALG_3DES, }, @@ -1097,6 +1121,8 @@ static const struct of_device_id sun8i_ce_crypto_of_match_table[] = { .data = &ce_h5_variant }, { .compatible = "allwinner,sun50i-h6-crypto", .data = &ce_h6_variant }, + { .compatible = "allwinner,sun50i-h616-crypto", + .data = &ce_h616_variant }, {} }; MODULE_DEVICE_TABLE(of, sun8i_ce_crypto_of_match_table); -- cgit From d26cb4f53dab654d337595cd247e6c435b571ca7 Mon Sep 17 00:00:00 2001 From: Damian Muszynski Date: Tue, 25 Jun 2024 15:36:44 +0100 Subject: Documentation: qat: fix auto_reset attribute details The auto_reset attribute was introduced in kernel 6.9. Fix version and date in documentation. Fixes: f5419a4239af ("crypto: qat - add auto reset on error") Signed-off-by: Damian Muszynski Signed-off-by: Giovanni Cabiddu Signed-off-by: Herbert Xu --- Documentation/ABI/testing/sysfs-driver-qat | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-driver-qat b/Documentation/ABI/testing/sysfs-driver-qat index 96020fb051c3..f290e77cd590 100644 --- a/Documentation/ABI/testing/sysfs-driver-qat +++ b/Documentation/ABI/testing/sysfs-driver-qat @@ -143,8 +143,8 @@ Description: This attribute is only available for qat_4xxx devices. What: /sys/bus/pci/devices//qat/auto_reset -Date: March 2024 -KernelVersion: 6.8 +Date: May 2024 +KernelVersion: 6.9 Contact: qat-linux@intel.com Description: (RW) Reports the current state of the autoreset feature for a QAT device -- cgit From 6424da7d8b938fe66e7e771eaa949bc7b6c29c00 Mon Sep 17 00:00:00 2001 From: Nivas Varadharajan Mugunthakumar Date: Tue, 25 Jun 2024 15:38:50 +0100 Subject: crypto: qat - extend scope of lock in adf_cfg_add_key_value_param() The function adf_cfg_add_key_value_param() attempts to access and modify the key value store of the driver without locking. Extend the scope of cfg->lock to avoid a potential race condition. 
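The race closed by the QAT patch just described is the classic check-then-act pattern. A generic pthreads sketch (not the QAT code; the single-slot store and names are made up) shows the fixed form, with the lookup and the insert inside one write-lock critical section. With the lock taken only around the insert, two callers can both miss the key and both add it.

#include <pthread.h>
#include <string.h>

static pthread_rwlock_t cfg_lock = PTHREAD_RWLOCK_INITIALIZER;
static char stored_key[64];	/* stand-in for the real key/value list */

static void add_key_once(const char *key)
{
	pthread_rwlock_wrlock(&cfg_lock);	/* take the lock before the lookup ... */
	if (stored_key[0] == '\0')		/* ... so the check and the store are atomic */
		strncpy(stored_key, key, sizeof(stored_key) - 1);
	pthread_rwlock_unlock(&cfg_lock);
}

Moving down_write() above the lookup in adf_cfg_add_key_value_param() achieves the same thing for the driver's key/value store.
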
Fixes: 92bf269fbfe9 ("crypto: qat - change behaviour of adf_cfg_add_key_value_param()") Signed-off-by: Nivas Varadharajan Mugunthakumar Signed-off-by: Giovanni Cabiddu Signed-off-by: Herbert Xu --- drivers/crypto/intel/qat/qat_common/adf_cfg.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/intel/qat/qat_common/adf_cfg.c b/drivers/crypto/intel/qat/qat_common/adf_cfg.c index 8836f015c39c..2cf102ad4ca8 100644 --- a/drivers/crypto/intel/qat/qat_common/adf_cfg.c +++ b/drivers/crypto/intel/qat/qat_common/adf_cfg.c @@ -290,17 +290,19 @@ int adf_cfg_add_key_value_param(struct adf_accel_dev *accel_dev, * 3. if the key exists with the same value, then return without doing * anything (the newly created key_val is freed). */ + down_write(&cfg->lock); if (!adf_cfg_key_val_get(accel_dev, section_name, key, temp_val)) { if (strncmp(temp_val, key_val->val, sizeof(temp_val))) { adf_cfg_keyval_remove(key, section); } else { kfree(key_val); - return 0; + goto out; } } - down_write(&cfg->lock); adf_cfg_keyval_add(key_val, section); + +out: up_write(&cfg->lock); return 0; } -- cgit From f0622894c59458fceb33c4197462bc2006f3fc6b Mon Sep 17 00:00:00 2001 From: Hareshx Sankar Raj Date: Tue, 25 Jun 2024 15:41:19 +0100 Subject: crypto: qat - fix unintentional re-enabling of error interrupts The logic that detects pending VF2PF interrupts unintentionally clears the section of the error mask register(s) not related to VF2PF. This might cause interrupts unrelated to VF2PF, reported through errsou3 and errsou5, to be reported again after the execution of the function disable_pending_vf2pf_interrupts() in dh895xcc and GEN2 devices. Fix by updating only section of errmsk3 and errmsk5 related to VF2PF. Signed-off-by: Hareshx Sankar Raj Reviewed-by: Damian Muszynski Signed-off-by: Giovanni Cabiddu Signed-off-by: Herbert Xu --- drivers/crypto/intel/qat/qat_common/adf_gen2_pfvf.c | 4 +++- drivers/crypto/intel/qat/qat_dh895xcc/adf_dh895xcc_hw_data.c | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/intel/qat/qat_common/adf_gen2_pfvf.c b/drivers/crypto/intel/qat/qat_common/adf_gen2_pfvf.c index 70ef11963938..43af81fcab86 100644 --- a/drivers/crypto/intel/qat/qat_common/adf_gen2_pfvf.c +++ b/drivers/crypto/intel/qat/qat_common/adf_gen2_pfvf.c @@ -100,7 +100,9 @@ static u32 adf_gen2_disable_pending_vf2pf_interrupts(void __iomem *pmisc_addr) errmsk3 |= ADF_GEN2_ERR_MSK_VF2PF(ADF_GEN2_VF_MSK); ADF_CSR_WR(pmisc_addr, ADF_GEN2_ERRMSK3, errmsk3); - errmsk3 &= ADF_GEN2_ERR_MSK_VF2PF(sources | disabled); + /* Update only section of errmsk3 related to VF2PF */ + errmsk3 &= ~ADF_GEN2_ERR_MSK_VF2PF(ADF_GEN2_VF_MSK); + errmsk3 |= ADF_GEN2_ERR_MSK_VF2PF(sources | disabled); ADF_CSR_WR(pmisc_addr, ADF_GEN2_ERRMSK3, errmsk3); /* Return the sources of the (new) interrupt(s) */ diff --git a/drivers/crypto/intel/qat/qat_dh895xcc/adf_dh895xcc_hw_data.c b/drivers/crypto/intel/qat/qat_dh895xcc/adf_dh895xcc_hw_data.c index 6e24d57e6b98..c0661ff5e929 100644 --- a/drivers/crypto/intel/qat/qat_dh895xcc/adf_dh895xcc_hw_data.c +++ b/drivers/crypto/intel/qat/qat_dh895xcc/adf_dh895xcc_hw_data.c @@ -193,8 +193,12 @@ static u32 disable_pending_vf2pf_interrupts(void __iomem *pmisc_addr) ADF_CSR_WR(pmisc_addr, ADF_GEN2_ERRMSK3, errmsk3); ADF_CSR_WR(pmisc_addr, ADF_GEN2_ERRMSK5, errmsk5); - errmsk3 &= ADF_DH895XCC_ERR_MSK_VF2PF_L(sources | disabled); - errmsk5 &= ADF_DH895XCC_ERR_MSK_VF2PF_U(sources | disabled); + /* Update only section of errmsk3 and errmsk5 related to VF2PF */ + 
errmsk3 &= ~ADF_DH895XCC_ERR_MSK_VF2PF_L(ADF_DH895XCC_VF_MSK); + errmsk5 &= ~ADF_DH895XCC_ERR_MSK_VF2PF_U(ADF_DH895XCC_VF_MSK); + + errmsk3 |= ADF_DH895XCC_ERR_MSK_VF2PF_L(sources | disabled); + errmsk5 |= ADF_DH895XCC_ERR_MSK_VF2PF_U(sources | disabled); ADF_CSR_WR(pmisc_addr, ADF_GEN2_ERRMSK3, errmsk3); ADF_CSR_WR(pmisc_addr, ADF_GEN2_ERRMSK5, errmsk5); -- cgit From 6aad7019f697ab0bed98eba737d19bd7f67713de Mon Sep 17 00:00:00 2001 From: Jia Jie Ho Date: Wed, 26 Jun 2024 09:40:42 +0800 Subject: crypto: starfive - Align rsa input data to 32-bit Hardware expects RSA input plain/ciphertext to be 32-bit aligned. Set fixed length for preallocated buffer to the maximum supported keysize of the hardware and shift input text accordingly. Signed-off-by: Jia Jie Ho Signed-off-by: Herbert Xu --- drivers/crypto/starfive/jh7110-cryp.h | 3 ++- drivers/crypto/starfive/jh7110-rsa.c | 12 ++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/crypto/starfive/jh7110-cryp.h b/drivers/crypto/starfive/jh7110-cryp.h index 494a74f52706..85c65c6c0327 100644 --- a/drivers/crypto/starfive/jh7110-cryp.h +++ b/drivers/crypto/starfive/jh7110-cryp.h @@ -30,6 +30,7 @@ #define MAX_KEY_SIZE SHA512_BLOCK_SIZE #define STARFIVE_AES_IV_LEN AES_BLOCK_SIZE #define STARFIVE_AES_CTR_LEN AES_BLOCK_SIZE +#define STARFIVE_RSA_MAX_KEYSZ 256 union starfive_aes_csr { u32 v; @@ -222,7 +223,7 @@ struct starfive_cryp_request_ctx { unsigned int digsize; unsigned long in_sg_len; unsigned char *adata; - u8 rsa_data[] __aligned(sizeof(u32)); + u8 rsa_data[STARFIVE_RSA_MAX_KEYSZ] __aligned(sizeof(u32)); }; struct starfive_cryp_dev *starfive_cryp_find_dev(struct starfive_cryp_ctx *ctx); diff --git a/drivers/crypto/starfive/jh7110-rsa.c b/drivers/crypto/starfive/jh7110-rsa.c index 33093ba4b13a..59f5979e9360 100644 --- a/drivers/crypto/starfive/jh7110-rsa.c +++ b/drivers/crypto/starfive/jh7110-rsa.c @@ -31,7 +31,6 @@ /* A * A * R mod N ==> A */ #define CRYPTO_CMD_AARN 0x7 -#define STARFIVE_RSA_MAX_KEYSZ 256 #define STARFIVE_RSA_RESET 0x2 static inline int starfive_pka_wait_done(struct starfive_cryp_ctx *ctx) @@ -74,7 +73,7 @@ static int starfive_rsa_montgomery_form(struct starfive_cryp_ctx *ctx, { struct starfive_cryp_dev *cryp = ctx->cryp; struct starfive_cryp_request_ctx *rctx = ctx->rctx; - int count = rctx->total / sizeof(u32) - 1; + int count = (ALIGN(rctx->total, 4) / 4) - 1; int loop; u32 temp; u8 opsize; @@ -251,12 +250,17 @@ static int starfive_rsa_enc_core(struct starfive_cryp_ctx *ctx, int enc) struct starfive_cryp_dev *cryp = ctx->cryp; struct starfive_cryp_request_ctx *rctx = ctx->rctx; struct starfive_rsa_key *key = &ctx->rsa_key; - int ret = 0; + int ret = 0, shift = 0; writel(STARFIVE_RSA_RESET, cryp->base + STARFIVE_PKA_CACR_OFFSET); + if (!IS_ALIGNED(rctx->total, sizeof(u32))) { + shift = sizeof(u32) - (rctx->total & 0x3); + memset(rctx->rsa_data, 0, shift); + } + rctx->total = sg_copy_to_buffer(rctx->in_sg, rctx->nents, - rctx->rsa_data, rctx->total); + rctx->rsa_data + shift, rctx->total); if (enc) { key->bitlen = key->e_bitlen; -- cgit From 8323c036789b8b4a61925fce439a89dba17b7f2f Mon Sep 17 00:00:00 2001 From: Jia Jie Ho Date: Wed, 26 Jun 2024 09:40:43 +0800 Subject: crypto: starfive - Fix nent assignment in rsa dec Missing src scatterlist nent assignment in rsa decrypt function. Removing all unneeded assignment and use nents value from req->src instead. 
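Returning to the 32-bit alignment patch above: the amount of leading zero padding it introduces is sizeof(u32) minus the misaligned remainder of the text length. The small stand-alone program below (plain C, illustrative lengths only) prints the padding and copy offset for lengths around a word boundary.

#include <stdio.h>

int main(void)
{
	for (unsigned int total = 61; total <= 64; total++) {
		/* mirrors: shift = sizeof(u32) - (total & 0x3) when unaligned */
		unsigned int shift = (total & 0x3) ? 4 - (total & 0x3) : 0;

		printf("total=%u -> zero %u leading byte(s), copy at offset %u\n",
		       total, shift, shift);
	}
	return 0;
}
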
Signed-off-by: Jia Jie Ho Signed-off-by: Herbert Xu --- drivers/crypto/starfive/jh7110-cryp.h | 1 - drivers/crypto/starfive/jh7110-rsa.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/crypto/starfive/jh7110-cryp.h b/drivers/crypto/starfive/jh7110-cryp.h index 85c65c6c0327..5ed4ba5da7f9 100644 --- a/drivers/crypto/starfive/jh7110-cryp.h +++ b/drivers/crypto/starfive/jh7110-cryp.h @@ -218,7 +218,6 @@ struct starfive_cryp_request_ctx { struct scatterlist *out_sg; struct ahash_request ahash_fbk_req; size_t total; - size_t nents; unsigned int blksize; unsigned int digsize; unsigned long in_sg_len; diff --git a/drivers/crypto/starfive/jh7110-rsa.c b/drivers/crypto/starfive/jh7110-rsa.c index 59f5979e9360..a778c4846025 100644 --- a/drivers/crypto/starfive/jh7110-rsa.c +++ b/drivers/crypto/starfive/jh7110-rsa.c @@ -259,7 +259,7 @@ static int starfive_rsa_enc_core(struct starfive_cryp_ctx *ctx, int enc) memset(rctx->rsa_data, 0, shift); } - rctx->total = sg_copy_to_buffer(rctx->in_sg, rctx->nents, + rctx->total = sg_copy_to_buffer(rctx->in_sg, sg_nents(rctx->in_sg), rctx->rsa_data + shift, rctx->total); if (enc) { @@ -309,7 +309,6 @@ static int starfive_rsa_enc(struct akcipher_request *req) rctx->in_sg = req->src; rctx->out_sg = req->dst; rctx->total = req->src_len; - rctx->nents = sg_nents(rctx->in_sg); ctx->rctx = rctx; return starfive_rsa_enc_core(ctx, 1); -- cgit From addea5858b66a185d33bf1b90980316f5f375d75 Mon Sep 17 00:00:00 2001 From: Francesco Dolcini Date: Tue, 25 Jun 2024 21:57:46 +0200 Subject: hwrng: Kconfig - Do not enable by default CN10K driver Do not enable by default the CN10K HW random generator driver. CN10K Random Number Generator is available only on some specific Marvell SoCs, however the driver is in practice enabled by default on all arm64 configs. Signed-off-by: Francesco Dolcini Signed-off-by: Herbert Xu --- drivers/char/hw_random/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig index 442c40efb200..01e2e1ef82cf 100644 --- a/drivers/char/hw_random/Kconfig +++ b/drivers/char/hw_random/Kconfig @@ -555,7 +555,6 @@ config HW_RANDOM_ARM_SMCCC_TRNG config HW_RANDOM_CN10K tristate "Marvell CN10K Random Number Generator support" depends on HW_RANDOM && PCI && (ARM64 || (64BIT && COMPILE_TEST)) - default HW_RANDOM help This driver provides support for the True Random Number generator available in Marvell CN10K SoCs. -- cgit From dd52b5eeb0f70893f762da7254e923fd23fd1379 Mon Sep 17 00:00:00 2001 From: David Gstir Date: Wed, 3 Jul 2024 14:49:58 +0200 Subject: crypto: mxs-dcp - Ensure payload is zero when using key slot We could leak stack memory through the payload field when running AES with a key from one of the hardware's key slots. Fix this by ensuring the payload field is set to 0 in such cases. This does not affect the common use case when the key is supplied from main memory via the descriptor payload. 
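The mxs-dcp fix above is a small instance of a common bug class: a local that is written on only one path yet consumed unconditionally. A stand-alone illustration (made-up values, not the driver code):

#include <stdint.h>
#include <stdio.h>

static uint32_t build_descriptor(int key_in_payload)
{
	uint32_t key_phys = 0;	/* without "= 0" this may hold stack garbage */

	if (key_in_payload)
		key_phys = 0x1000;	/* pretend DMA address of the key payload */

	return key_phys;	/* value that ends up in the hardware descriptor */
}

int main(void)
{
	printf("payload key: 0x%x\n", build_descriptor(1));
	printf("key slot   : 0x%x\n", build_descriptor(0));	/* stays 0, nothing leaks */
	return 0;
}
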
Signed-off-by: David Gstir Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202405270146.Y9tPoil8-lkp@intel.com/ Fixes: 3d16af0b4cfa ("crypto: mxs-dcp: Add support for hardware-bound keys") Signed-off-by: Herbert Xu --- drivers/crypto/mxs-dcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/mxs-dcp.c b/drivers/crypto/mxs-dcp.c index 057d73c370b7..c82775dbb557 100644 --- a/drivers/crypto/mxs-dcp.c +++ b/drivers/crypto/mxs-dcp.c @@ -225,7 +225,8 @@ static int mxs_dcp_start_dma(struct dcp_async_ctx *actx) static int mxs_dcp_run_aes(struct dcp_async_ctx *actx, struct skcipher_request *req, int init) { - dma_addr_t key_phys, src_phys, dst_phys; + dma_addr_t key_phys = 0; + dma_addr_t src_phys, dst_phys; struct dcp *sdcp = global_sdcp; struct dcp_dma_desc *desc = &sdcp->coh->desc[actx->chan]; struct dcp_aes_req_ctx *rctx = skcipher_request_ctx(req); -- cgit From 101e99c23af9460890edc9055141e6aba6d3e08a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 3 Jul 2024 12:04:31 -0700 Subject: crypto: testmgr - generate power-of-2 lengths more often Implementations of hash functions often have special cases when lengths are a multiple of the hash function's internal block size (e.g. 64 for SHA-256, 128 for SHA-512). Currently, when the fuzz testing code generates lengths, it doesn't prefer any length mod 64 over any other. This limits the coverage of these special cases. Therefore, this patch updates the fuzz testing code to generate power-of-2 lengths and divide messages exactly in half a bit more often. Reviewed-by: Sami Tolvanen Acked-by: Ard Biesheuvel Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/testmgr.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index a780b615f8c6..f02cb075bd68 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -916,14 +916,20 @@ static unsigned int generate_random_length(struct rnd_state *rng, switch (prandom_u32_below(rng, 4)) { case 0: - return len % 64; + len %= 64; + break; case 1: - return len % 256; + len %= 256; + break; case 2: - return len % 1024; + len %= 1024; + break; default: - return len; + break; } + if (len && prandom_u32_below(rng, 4) == 0) + len = rounddown_pow_of_two(len); + return len; } /* Flip a random bit in the given nonempty data buffer */ @@ -1019,6 +1025,8 @@ static char *generate_random_sgl_divisions(struct rnd_state *rng, if (div == &divs[max_divs - 1] || prandom_bool(rng)) this_len = remaining; + else if (prandom_u32_below(rng, 4) == 0) + this_len = (remaining + 1) / 2; else this_len = prandom_u32_inclusive(rng, 1, remaining); div->proportion_of_total = this_len; -- cgit From fe69b772e35e181ff0576ab4a610515ffe7a3325 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 4 Jul 2024 10:25:05 -0500 Subject: crypto: lib/mpi - delete unnecessary condition We checked that "nlimbs" is non-zero in the outside if statement so delete the duplicate check here. 
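For reference, the limb move that mpi_rshift() keeps after the cleanup above can be exercised on its own. The sketch below uses plain 32-bit words in least-significant-limb-first order, the same layout as the kernel MPI limbs, and drops one low limb.

#include <stdio.h>

int main(void)
{
	unsigned int d[4] = { 0x11111111, 0x22222222, 0x33333333, 0x44444444 };
	unsigned int used = 4, nlimbs = 1, i;

	for (i = 0; i < used - nlimbs; i++)	/* move the upper limbs down */
		d[i] = d[i + nlimbs];
	d[i] = 0;				/* clear the vacated top limb */
	used -= nlimbs;

	for (i = 0; i < used; i++)
		printf("limb %u: 0x%08x\n", i, d[i]);	/* 0x22222222, 0x33333333, 0x44444444 */
	return 0;
}
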
Signed-off-by: Dan Carpenter Reviewed-by: Tianjia Zhang Signed-off-by: Herbert Xu --- lib/crypto/mpi/mpi-bit.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/lib/crypto/mpi/mpi-bit.c b/lib/crypto/mpi/mpi-bit.c index 070ba784c9f1..e08fc202ea5c 100644 --- a/lib/crypto/mpi/mpi-bit.c +++ b/lib/crypto/mpi/mpi-bit.c @@ -212,12 +212,10 @@ void mpi_rshift(MPI x, MPI a, unsigned int n) return; } - if (nlimbs) { - for (i = 0; i < x->nlimbs - nlimbs; i++) - x->d[i] = x->d[i+nlimbs]; - x->d[i] = 0; - x->nlimbs -= nlimbs; - } + for (i = 0; i < x->nlimbs - nlimbs; i++) + x->d[i] = x->d[i+nlimbs]; + x->d[i] = 0; + x->nlimbs -= nlimbs; if (x->nlimbs && nbits) mpihelp_rshift(x->d, x->d, x->nlimbs, nbits); -- cgit From df1e9791998a92fe9f1e7d3f031b34daaad39e2f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 6 Jul 2024 23:41:24 +0900 Subject: hwrng: core - remove (un)register_miscdev() These functions are redundant after commit 0daa7a0afd0f ("hwrng: Avoid manual device_create_file() calls"). Let's call misc_(de)register() directly. Signed-off-by: Masahiro Yamada Signed-off-by: Herbert Xu --- drivers/char/hw_random/core.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index d8bc69025ae1..c42fb9fdccce 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -486,16 +486,6 @@ static struct attribute *rng_dev_attrs[] = { ATTRIBUTE_GROUPS(rng_dev); -static void __exit unregister_miscdev(void) -{ - misc_deregister(&rng_miscdev); -} - -static int __init register_miscdev(void) -{ - return misc_register(&rng_miscdev); -} - static int hwrng_fillfn(void *unused) { size_t entropy, entropy_credit = 0; /* in 1/1024 of a bit */ @@ -709,7 +699,7 @@ static int __init hwrng_modinit(void) return -ENOMEM; } - ret = register_miscdev(); + ret = misc_register(&rng_miscdev); if (ret) { kfree(rng_fillbuf); kfree(rng_buffer); @@ -726,7 +716,7 @@ static void __exit hwrng_modexit(void) kfree(rng_fillbuf); mutex_unlock(&rng_mutex); - unregister_miscdev(); + misc_deregister(&rng_miscdev); } fs_initcall(hwrng_modinit); /* depends on misc_register() */ -- cgit
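Finally, the direct misc_register()/misc_deregister() calls that the last patch settles on follow the usual pattern for a minimal misc-device module. The sketch below is hypothetical (name, fops and description are made up), not the hwrng core itself.

#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/module.h>

static const struct file_operations demo_fops = {
	.owner = THIS_MODULE,
};

static struct miscdevice demo_miscdev = {
	.minor	= MISC_DYNAMIC_MINOR,
	.name	= "demo",
	.fops	= &demo_fops,
};

static int __init demo_init(void)
{
	return misc_register(&demo_miscdev);	/* no one-line wrapper needed */
}

static void __exit demo_exit(void)
{
	misc_deregister(&demo_miscdev);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_DESCRIPTION("misc_register() usage sketch");
MODULE_LICENSE("GPL");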