Diffstat (limited to 'arch/x86/crypto')
-rw-r--r--   arch/x86/crypto/Makefile                     |  25
-rw-r--r--   arch/x86/crypto/aegis128-aesni-asm.S         | 749
-rw-r--r--   arch/x86/crypto/aegis128-aesni-glue.c        | 407
-rw-r--r--   arch/x86/crypto/aegis128l-aesni-asm.S        | 825
-rw-r--r--   arch/x86/crypto/aegis128l-aesni-glue.c       | 407
-rw-r--r--   arch/x86/crypto/aegis256-aesni-asm.S         | 702
-rw-r--r--   arch/x86/crypto/aegis256-aesni-glue.c        | 407
-rw-r--r--   arch/x86/crypto/ghash-clmulni-intel_glue.c   |   2
-rw-r--r--   arch/x86/crypto/morus1280-avx2-asm.S         | 621
-rw-r--r--   arch/x86/crypto/morus1280-avx2-glue.c        |  68
-rw-r--r--   arch/x86/crypto/morus1280-sse2-asm.S         | 895
-rw-r--r--   arch/x86/crypto/morus1280-sse2-glue.c        |  68
-rw-r--r--   arch/x86/crypto/morus1280_glue.c             | 302
-rw-r--r--   arch/x86/crypto/morus640-sse2-asm.S          | 614
-rw-r--r--   arch/x86/crypto/morus640-sse2-glue.c         |  68
-rw-r--r--   arch/x86/crypto/morus640_glue.c              | 298
-rw-r--r--   arch/x86/crypto/salsa20-i586-asm_32.S        | 938
-rw-r--r--   arch/x86/crypto/salsa20-x86_64-asm_64.S      | 805
-rw-r--r--   arch/x86/crypto/salsa20_glue.c               |  91
19 files changed, 6453 insertions, 1839 deletions
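
The patch below registers the new AEGIS and MORUS implementations as ordinary AEAD algorithms ("aegis128", "aegis128l", "aegis256", and the MORUS variants), so in-kernel users reach them through the generic AEAD API rather than calling the SIMD glue code directly. A minimal usage sketch for the "aegis128" instance follows; it is not part of this patch, the key, nonce, and buffer handling are illustrative only, and it assumes the synchronous-wait helpers (DECLARE_CRYPTO_WAIT, crypto_wait_req, crypto_req_done) available in this kernel series.

#include <crypto/aead.h>
#include <linux/scatterlist.h>
#include <linux/err.h>

/*
 * Encrypt len bytes of buf in place with AEGIS-128; buf must also have
 * room for the 16-byte tag appended after the ciphertext.
 */
static int aegis128_encrypt_example(const u8 *key, u8 *nonce,
				    u8 *buf, unsigned int len)
{
	struct crypto_aead *tfm;
	struct aead_request *req;
	struct scatterlist sg;
	DECLARE_CRYPTO_WAIT(wait);
	int err;

	/* resolves to aegis128-aesni when the CPU and FPU state allow it */
	tfm = crypto_alloc_aead("aegis128", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_aead_setkey(tfm, key, 16);		/* AEGIS128_KEY_SIZE */
	if (err)
		goto out_free_tfm;
	err = crypto_aead_setauthsize(tfm, 16);		/* full 128-bit tag */
	if (err)
		goto out_free_tfm;

	req = aead_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	sg_init_one(&sg, buf, len + 16);
	aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
				  crypto_req_done, &wait);
	aead_request_set_crypt(req, &sg, &sg, len, nonce);	/* 16-byte nonce */
	aead_request_set_ad(req, 0);				/* no associated data */

	err = crypto_wait_req(crypto_aead_encrypt(req), &wait);

	aead_request_free(req);
out_free_tfm:
	crypto_free_aead(tfm);
	return err;
}
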
| diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 5f07333bb224..a450ad573dcb 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -15,7 +15,6 @@ obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o  obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o  obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o -obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o  obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o  obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o @@ -24,7 +23,6 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o  obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o  obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o  obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o -obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o  obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o  obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o  obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o @@ -38,6 +36,16 @@ obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o  obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o  obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o +obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o +obj-$(CONFIG_CRYPTO_AEGIS128L_AESNI_SSE2) += aegis128l-aesni.o +obj-$(CONFIG_CRYPTO_AEGIS256_AESNI_SSE2) += aegis256-aesni.o + +obj-$(CONFIG_CRYPTO_MORUS640_GLUE) += morus640_glue.o +obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o + +obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o +obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o +  # These modules require assembler to support AVX.  ifeq ($(avx_supported),yes)  	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \ @@ -55,11 +63,12 @@ ifeq ($(avx2_supported),yes)  	obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb/  	obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb/  	obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb/ + +	obj-$(CONFIG_CRYPTO_MORUS1280_AVX2) += morus1280-avx2.o  endif  aes-i586-y := aes-i586-asm_32.o aes_glue.o  twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o -salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o  serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o  aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o @@ -68,10 +77,16 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o  blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o  twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o  twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o -salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o  chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o  serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o +aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o +aegis128l-aesni-y := aegis128l-aesni-asm.o aegis128l-aesni-glue.o +aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o + +morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o +morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o +  ifeq ($(avx_supported),yes)  	camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \  					camellia_aesni_avx_glue.o @@ -87,6 +102,8 @@ ifeq ($(avx2_supported),yes)  	camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o  	chacha20-x86_64-y += chacha20-avx2-x86_64.o  	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o + +	morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o  endif  aesni-intel-y := aesni-intel_asm.o 
aesni-intel_glue.o fpu.o diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S new file mode 100644 index 000000000000..9254e0b6cc06 --- /dev/null +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -0,0 +1,749 @@ +/* + * AES-NI + SSE2 implementation of AEGIS-128 + * + * Copyright (c) 2017-2018 Ondrej Mosnacek <[email protected]> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +#define STATE0	%xmm0 +#define STATE1	%xmm1 +#define STATE2	%xmm2 +#define STATE3	%xmm3 +#define STATE4	%xmm4 +#define KEY	%xmm5 +#define MSG	%xmm5 +#define T0	%xmm6 +#define T1	%xmm7 + +#define STATEP	%rdi +#define LEN	%rsi +#define SRC	%rdx +#define DST	%rcx + +.section .rodata.cst16.aegis128_const, "aM", @progbits, 32 +.align 16 +.Laegis128_const_0: +	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d +	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 +.Laegis128_const_1: +	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 +	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16 +.align 16 +.Laegis128_counter: +	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 +	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + +.text + +/* + * aegis128_update + * input: + *   STATE[0-4] - input state + * output: + *   STATE[0-4] - output state (shifted positions) + * changed: + *   T0 + */ +.macro aegis128_update +	movdqa STATE4, T0 +	aesenc STATE0, STATE4 +	aesenc STATE1, STATE0 +	aesenc STATE2, STATE1 +	aesenc STATE3, STATE2 +	aesenc T0,     STATE3 +.endm + +/* + * __load_partial: internal ABI + * input: + *   LEN - bytes + *   SRC - src + * output: + *   MSG  - message block + * changed: + *   T0 + *   %r8 + *   %r9 + */ +__load_partial: +	xor %r9, %r9 +	pxor MSG, MSG + +	mov LEN, %r8 +	and $0x1, %r8 +	jz .Lld_partial_1 + +	mov LEN, %r8 +	and $0x1E, %r8 +	add SRC, %r8 +	mov (%r8), %r9b + +.Lld_partial_1: +	mov LEN, %r8 +	and $0x2, %r8 +	jz .Lld_partial_2 + +	mov LEN, %r8 +	and $0x1C, %r8 +	add SRC, %r8 +	shl $0x10, %r9 +	mov (%r8), %r9w + +.Lld_partial_2: +	mov LEN, %r8 +	and $0x4, %r8 +	jz .Lld_partial_4 + +	mov LEN, %r8 +	and $0x18, %r8 +	add SRC, %r8 +	shl $32, %r9 +	mov (%r8), %r8d +	xor %r8, %r9 + +.Lld_partial_4: +	movq %r9, MSG + +	mov LEN, %r8 +	and $0x8, %r8 +	jz .Lld_partial_8 + +	mov LEN, %r8 +	and $0x10, %r8 +	add SRC, %r8 +	pslldq $8, MSG +	movq (%r8), T0 +	pxor T0, MSG + +.Lld_partial_8: +	ret +ENDPROC(__load_partial) + +/* + * __store_partial: internal ABI + * input: + *   LEN - bytes + *   DST - dst + * output: + *   T0   - message block + * changed: + *   %r8 + *   %r9 + *   %r10 + */ +__store_partial: +	mov LEN, %r8 +	mov DST, %r9 + +	movq T0, %r10 + +	cmp $8, %r8 +	jl .Lst_partial_8 + +	mov %r10, (%r9) +	psrldq $8, T0 +	movq T0, %r10 + +	sub $8, %r8 +	add $8, %r9 + +.Lst_partial_8: +	cmp $4, %r8 +	jl .Lst_partial_4 + +	mov %r10d, (%r9) +	shr $32, %r10 + +	sub $4, %r8 +	add $4, %r9 + +.Lst_partial_4: +	cmp $2, %r8 +	jl .Lst_partial_2 + +	mov %r10w, (%r9) +	shr $0x10, %r10 + +	sub $2, %r8 +	add $2, %r9 + +.Lst_partial_2: +	cmp $1, %r8 +	jl .Lst_partial_1 + +	mov %r10b, (%r9) + +.Lst_partial_1: +	ret +ENDPROC(__store_partial) + +/* + * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv); + */ 
+ENTRY(crypto_aegis128_aesni_init) +	FRAME_BEGIN + +	/* load IV: */ +	movdqu (%rdx), T1 + +	/* load key: */ +	movdqa (%rsi), KEY +	pxor KEY, T1 +	movdqa T1, STATE0 +	movdqa KEY, STATE3 +	movdqa KEY, STATE4 + +	/* load the constants: */ +	movdqa .Laegis128_const_0, STATE2 +	movdqa .Laegis128_const_1, STATE1 +	pxor STATE2, STATE3 +	pxor STATE1, STATE4 + +	/* update 10 times with KEY / KEY xor IV: */ +	aegis128_update; pxor KEY, STATE4 +	aegis128_update; pxor T1,  STATE3 +	aegis128_update; pxor KEY, STATE2 +	aegis128_update; pxor T1,  STATE1 +	aegis128_update; pxor KEY, STATE0 +	aegis128_update; pxor T1,  STATE4 +	aegis128_update; pxor KEY, STATE3 +	aegis128_update; pxor T1,  STATE2 +	aegis128_update; pxor KEY, STATE1 +	aegis128_update; pxor T1,  STATE0 + +	/* store the state: */ +	movdqu STATE0, 0x00(STATEP) +	movdqu STATE1, 0x10(STATEP) +	movdqu STATE2, 0x20(STATEP) +	movdqu STATE3, 0x30(STATEP) +	movdqu STATE4, 0x40(STATEP) + +	FRAME_END +	ret +ENDPROC(crypto_aegis128_aesni_init) + +/* + * void crypto_aegis128_aesni_ad(void *state, unsigned int length, + *                               const void *data); + */ +ENTRY(crypto_aegis128_aesni_ad) +	FRAME_BEGIN + +	cmp $0x10, LEN +	jb .Lad_out + +	/* load the state: */ +	movdqu 0x00(STATEP), STATE0 +	movdqu 0x10(STATEP), STATE1 +	movdqu 0x20(STATEP), STATE2 +	movdqu 0x30(STATEP), STATE3 +	movdqu 0x40(STATEP), STATE4 + +	mov SRC, %r8 +	and $0xF, %r8 +	jnz .Lad_u_loop + +.align 8 +.Lad_a_loop: +	movdqa 0x00(SRC), MSG +	aegis128_update +	pxor MSG, STATE4 +	sub $0x10, LEN +	cmp $0x10, LEN +	jl .Lad_out_1 + +	movdqa 0x10(SRC), MSG +	aegis128_update +	pxor MSG, STATE3 +	sub $0x10, LEN +	cmp $0x10, LEN +	jl .Lad_out_2 + +	movdqa 0x20(SRC), MSG +	aegis128_update +	pxor MSG, STATE2 +	sub $0x10, LEN +	cmp $0x10, LEN +	jl .Lad_out_3 + +	movdqa 0x30(SRC), MSG +	aegis128_update +	pxor MSG, STATE1 +	sub $0x10, LEN +	cmp $0x10, LEN +	jl .Lad_out_4 + +	movdqa 0x40(SRC), MSG +	aegis128_update +	pxor MSG, STATE0 +	sub $0x10, LEN +	cmp $0x10, LEN +	jl .Lad_out_0 + +	add $0x50, SRC +	jmp .Lad_a_loop + +.align 8 +.Lad_u_loop: +	movdqu 0x00(SRC), MSG +	aegis128_update +	pxor MSG, STATE4 +	sub $0x10, LEN +	cmp $0x10, LEN +	jl .Lad_out_1 + +	movdqu 0x10(SRC), MSG +	aegis128_update +	pxor MSG, STATE3 +	sub $0x10, LEN +	cmp $0x10, LEN +	jl .Lad_out_2 + +	movdqu 0x20(SRC), MSG +	aegis128_update +	pxor MSG, STATE2 +	sub $0x10, LEN +	cmp $0x10, LEN +	jl .Lad_out_3 + +	movdqu 0x30(SRC), MSG +	aegis128_update +	pxor MSG, STATE1 +	sub $0x10, LEN +	cmp $0x10, LEN +	jl .Lad_out_4 + +	movdqu 0x40(SRC), MSG +	aegis128_update +	pxor MSG, STATE0 +	sub $0x10, LEN +	cmp $0x10, LEN +	jl .Lad_out_0 + +	add $0x50, SRC +	jmp .Lad_u_loop + +	/* store the state: */ +.Lad_out_0: +	movdqu STATE0, 0x00(STATEP) +	movdqu STATE1, 0x10(STATEP) +	movdqu STATE2, 0x20(STATEP) +	movdqu STATE3, 0x30(STATEP) +	movdqu STATE4, 0x40(STATEP) +	FRAME_END +	ret + +.Lad_out_1: +	movdqu STATE4, 0x00(STATEP) +	movdqu STATE0, 0x10(STATEP) +	movdqu STATE1, 0x20(STATEP) +	movdqu STATE2, 0x30(STATEP) +	movdqu STATE3, 0x40(STATEP) +	FRAME_END +	ret + +.Lad_out_2: +	movdqu STATE3, 0x00(STATEP) +	movdqu STATE4, 0x10(STATEP) +	movdqu STATE0, 0x20(STATEP) +	movdqu STATE1, 0x30(STATEP) +	movdqu STATE2, 0x40(STATEP) +	FRAME_END +	ret + +.Lad_out_3: +	movdqu STATE2, 0x00(STATEP) +	movdqu STATE3, 0x10(STATEP) +	movdqu STATE4, 0x20(STATEP) +	movdqu STATE0, 0x30(STATEP) +	movdqu STATE1, 0x40(STATEP) +	FRAME_END +	ret + +.Lad_out_4: +	movdqu STATE1, 0x00(STATEP) +	movdqu STATE2, 0x10(STATEP) +	movdqu STATE3, 0x20(STATEP) +	
movdqu STATE4, 0x30(STATEP) +	movdqu STATE0, 0x40(STATEP) +	FRAME_END +	ret + +.Lad_out: +	FRAME_END +	ret +ENDPROC(crypto_aegis128_aesni_ad) + +.macro encrypt_block a s0 s1 s2 s3 s4 i +	movdq\a (\i * 0x10)(SRC), MSG +	movdqa MSG, T0 +	pxor \s1, T0 +	pxor \s4, T0 +	movdqa \s2, T1 +	pand \s3, T1 +	pxor T1, T0 +	movdq\a T0, (\i * 0x10)(DST) + +	aegis128_update +	pxor MSG, \s4 + +	sub $0x10, LEN +	cmp $0x10, LEN +	jl .Lenc_out_\i +.endm + +/* + * void crypto_aegis128_aesni_enc(void *state, unsigned int length, + *                                const void *src, void *dst); + */ +ENTRY(crypto_aegis128_aesni_enc) +	FRAME_BEGIN + +	cmp $0x10, LEN +	jb .Lenc_out + +	/* load the state: */ +	movdqu 0x00(STATEP), STATE0 +	movdqu 0x10(STATEP), STATE1 +	movdqu 0x20(STATEP), STATE2 +	movdqu 0x30(STATEP), STATE3 +	movdqu 0x40(STATEP), STATE4 + +	mov  SRC,  %r8 +	or   DST,  %r8 +	and $0xF, %r8 +	jnz .Lenc_u_loop + +.align 8 +.Lenc_a_loop: +	encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 +	encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 +	encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 +	encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 +	encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 + +	add $0x50, SRC +	add $0x50, DST +	jmp .Lenc_a_loop + +.align 8 +.Lenc_u_loop: +	encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 +	encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 +	encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 +	encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 +	encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4 + +	add $0x50, SRC +	add $0x50, DST +	jmp .Lenc_u_loop + +	/* store the state: */ +.Lenc_out_0: +	movdqu STATE4, 0x00(STATEP) +	movdqu STATE0, 0x10(STATEP) +	movdqu STATE1, 0x20(STATEP) +	movdqu STATE2, 0x30(STATEP) +	movdqu STATE3, 0x40(STATEP) +	FRAME_END +	ret + +.Lenc_out_1: +	movdqu STATE3, 0x00(STATEP) +	movdqu STATE4, 0x10(STATEP) +	movdqu STATE0, 0x20(STATEP) +	movdqu STATE1, 0x30(STATEP) +	movdqu STATE2, 0x40(STATEP) +	FRAME_END +	ret + +.Lenc_out_2: +	movdqu STATE2, 0x00(STATEP) +	movdqu STATE3, 0x10(STATEP) +	movdqu STATE4, 0x20(STATEP) +	movdqu STATE0, 0x30(STATEP) +	movdqu STATE1, 0x40(STATEP) +	FRAME_END +	ret + +.Lenc_out_3: +	movdqu STATE1, 0x00(STATEP) +	movdqu STATE2, 0x10(STATEP) +	movdqu STATE3, 0x20(STATEP) +	movdqu STATE4, 0x30(STATEP) +	movdqu STATE0, 0x40(STATEP) +	FRAME_END +	ret + +.Lenc_out_4: +	movdqu STATE0, 0x00(STATEP) +	movdqu STATE1, 0x10(STATEP) +	movdqu STATE2, 0x20(STATEP) +	movdqu STATE3, 0x30(STATEP) +	movdqu STATE4, 0x40(STATEP) +	FRAME_END +	ret + +.Lenc_out: +	FRAME_END +	ret +ENDPROC(crypto_aegis128_aesni_enc) + +/* + * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length, + *                                     const void *src, void *dst); + */ +ENTRY(crypto_aegis128_aesni_enc_tail) +	FRAME_BEGIN + +	/* load the state: */ +	movdqu 0x00(STATEP), STATE0 +	movdqu 0x10(STATEP), STATE1 +	movdqu 0x20(STATEP), STATE2 +	movdqu 0x30(STATEP), STATE3 +	movdqu 0x40(STATEP), STATE4 + +	/* encrypt message: */ +	call __load_partial + +	movdqa MSG, T0 +	pxor STATE1, T0 +	pxor STATE4, T0 +	movdqa STATE2, T1 +	pand STATE3, T1 +	pxor T1, T0 + +	call __store_partial + +	aegis128_update +	pxor MSG, STATE4 + +	/* store the state: */ +	movdqu STATE4, 0x00(STATEP) +	movdqu STATE0, 0x10(STATEP) +	movdqu STATE1, 0x20(STATEP) +	movdqu STATE2, 0x30(STATEP) +	movdqu STATE3, 0x40(STATEP) + +	FRAME_END +ENDPROC(crypto_aegis128_aesni_enc_tail) + +.macro decrypt_block a s0 s1 s2 s3 s4 i +	movdq\a (\i * 0x10)(SRC), MSG +	pxor 
\s1, MSG +	pxor \s4, MSG +	movdqa \s2, T1 +	pand \s3, T1 +	pxor T1, MSG +	movdq\a MSG, (\i * 0x10)(DST) + +	aegis128_update +	pxor MSG, \s4 + +	sub $0x10, LEN +	cmp $0x10, LEN +	jl .Ldec_out_\i +.endm + +/* + * void crypto_aegis128_aesni_dec(void *state, unsigned int length, + *                                const void *src, void *dst); + */ +ENTRY(crypto_aegis128_aesni_dec) +	FRAME_BEGIN + +	cmp $0x10, LEN +	jb .Ldec_out + +	/* load the state: */ +	movdqu 0x00(STATEP), STATE0 +	movdqu 0x10(STATEP), STATE1 +	movdqu 0x20(STATEP), STATE2 +	movdqu 0x30(STATEP), STATE3 +	movdqu 0x40(STATEP), STATE4 + +	mov  SRC, %r8 +	or   DST, %r8 +	and $0xF, %r8 +	jnz .Ldec_u_loop + +.align 8 +.Ldec_a_loop: +	decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 +	decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 +	decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 +	decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 +	decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 + +	add $0x50, SRC +	add $0x50, DST +	jmp .Ldec_a_loop + +.align 8 +.Ldec_u_loop: +	decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 +	decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 +	decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 +	decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 +	decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4 + +	add $0x50, SRC +	add $0x50, DST +	jmp .Ldec_u_loop + +	/* store the state: */ +.Ldec_out_0: +	movdqu STATE4, 0x00(STATEP) +	movdqu STATE0, 0x10(STATEP) +	movdqu STATE1, 0x20(STATEP) +	movdqu STATE2, 0x30(STATEP) +	movdqu STATE3, 0x40(STATEP) +	FRAME_END +	ret + +.Ldec_out_1: +	movdqu STATE3, 0x00(STATEP) +	movdqu STATE4, 0x10(STATEP) +	movdqu STATE0, 0x20(STATEP) +	movdqu STATE1, 0x30(STATEP) +	movdqu STATE2, 0x40(STATEP) +	FRAME_END +	ret + +.Ldec_out_2: +	movdqu STATE2, 0x00(STATEP) +	movdqu STATE3, 0x10(STATEP) +	movdqu STATE4, 0x20(STATEP) +	movdqu STATE0, 0x30(STATEP) +	movdqu STATE1, 0x40(STATEP) +	FRAME_END +	ret + +.Ldec_out_3: +	movdqu STATE1, 0x00(STATEP) +	movdqu STATE2, 0x10(STATEP) +	movdqu STATE3, 0x20(STATEP) +	movdqu STATE4, 0x30(STATEP) +	movdqu STATE0, 0x40(STATEP) +	FRAME_END +	ret + +.Ldec_out_4: +	movdqu STATE0, 0x00(STATEP) +	movdqu STATE1, 0x10(STATEP) +	movdqu STATE2, 0x20(STATEP) +	movdqu STATE3, 0x30(STATEP) +	movdqu STATE4, 0x40(STATEP) +	FRAME_END +	ret + +.Ldec_out: +	FRAME_END +	ret +ENDPROC(crypto_aegis128_aesni_dec) + +/* + * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length, + *                                     const void *src, void *dst); + */ +ENTRY(crypto_aegis128_aesni_dec_tail) +	FRAME_BEGIN + +	/* load the state: */ +	movdqu 0x00(STATEP), STATE0 +	movdqu 0x10(STATEP), STATE1 +	movdqu 0x20(STATEP), STATE2 +	movdqu 0x30(STATEP), STATE3 +	movdqu 0x40(STATEP), STATE4 + +	/* decrypt message: */ +	call __load_partial + +	pxor STATE1, MSG +	pxor STATE4, MSG +	movdqa STATE2, T1 +	pand STATE3, T1 +	pxor T1, MSG + +	movdqa MSG, T0 +	call __store_partial + +	/* mask with byte count: */ +	movq LEN, T0 +	punpcklbw T0, T0 +	punpcklbw T0, T0 +	punpcklbw T0, T0 +	punpcklbw T0, T0 +	movdqa .Laegis128_counter, T1 +	pcmpgtb T1, T0 +	pand T0, MSG + +	aegis128_update +	pxor MSG, STATE4 + +	/* store the state: */ +	movdqu STATE4, 0x00(STATEP) +	movdqu STATE0, 0x10(STATEP) +	movdqu STATE1, 0x20(STATEP) +	movdqu STATE2, 0x30(STATEP) +	movdqu STATE3, 0x40(STATEP) + +	FRAME_END +	ret +ENDPROC(crypto_aegis128_aesni_dec_tail) + +/* + * void crypto_aegis128_aesni_final(void *state, void *tag_xor, + *                                  u64 assoclen, 
u64 cryptlen); + */ +ENTRY(crypto_aegis128_aesni_final) +	FRAME_BEGIN + +	/* load the state: */ +	movdqu 0x00(STATEP), STATE0 +	movdqu 0x10(STATEP), STATE1 +	movdqu 0x20(STATEP), STATE2 +	movdqu 0x30(STATEP), STATE3 +	movdqu 0x40(STATEP), STATE4 + +	/* prepare length block: */ +	movq %rdx, MSG +	movq %rcx, T0 +	pslldq $8, T0 +	pxor T0, MSG +	psllq $3, MSG /* multiply by 8 (to get bit count) */ + +	pxor STATE3, MSG + +	/* update state: */ +	aegis128_update; pxor MSG, STATE4 +	aegis128_update; pxor MSG, STATE3 +	aegis128_update; pxor MSG, STATE2 +	aegis128_update; pxor MSG, STATE1 +	aegis128_update; pxor MSG, STATE0 +	aegis128_update; pxor MSG, STATE4 +	aegis128_update; pxor MSG, STATE3 + +	/* xor tag: */ +	movdqu (%rsi), MSG + +	pxor STATE0, MSG +	pxor STATE1, MSG +	pxor STATE2, MSG +	pxor STATE3, MSG +	pxor STATE4, MSG + +	movdqu MSG, (%rsi) + +	FRAME_END +	ret +ENDPROC(crypto_aegis128_aesni_final) diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c new file mode 100644 index 000000000000..5de7c0d46edf --- /dev/null +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -0,0 +1,407 @@ +/* + * The AEGIS-128 Authenticated-Encryption Algorithm + *   Glue for AES-NI + SSE2 implementation + * + * Copyright (c) 2017-2018 Ondrej Mosnacek <[email protected]> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <crypto/cryptd.h> +#include <crypto/internal/aead.h> +#include <crypto/internal/skcipher.h> +#include <crypto/scatterwalk.h> +#include <linux/module.h> +#include <asm/fpu/api.h> +#include <asm/cpu_device_id.h> + +#define AEGIS128_BLOCK_ALIGN 16 +#define AEGIS128_BLOCK_SIZE 16 +#define AEGIS128_NONCE_SIZE 16 +#define AEGIS128_STATE_BLOCKS 5 +#define AEGIS128_KEY_SIZE 16 +#define AEGIS128_MIN_AUTH_SIZE 8 +#define AEGIS128_MAX_AUTH_SIZE 16 + +asmlinkage void crypto_aegis128_aesni_init(void *state, void *key, void *iv); + +asmlinkage void crypto_aegis128_aesni_ad( +		void *state, unsigned int length, const void *data); + +asmlinkage void crypto_aegis128_aesni_enc( +		void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128_aesni_dec( +		void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128_aesni_enc_tail( +		void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128_aesni_dec_tail( +		void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128_aesni_final( +		void *state, void *tag_xor, unsigned int cryptlen, +		unsigned int assoclen); + +struct aegis_block { +	u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN); +}; + +struct aegis_state { +	struct aegis_block blocks[AEGIS128_STATE_BLOCKS]; +}; + +struct aegis_ctx { +	struct aegis_block key; +}; + +struct aegis_crypt_ops { +	int (*skcipher_walk_init)(struct skcipher_walk *walk, +				  struct aead_request *req, bool atomic); + +	void (*crypt_blocks)(void *state, unsigned int length, const void *src, +			     void *dst); +	void (*crypt_tail)(void *state, unsigned int length, const void *src, +			   void *dst); +}; + +static void crypto_aegis128_aesni_process_ad( +		struct aegis_state *state, struct scatterlist *sg_src, +		unsigned int assoclen) +{ +	struct 
scatter_walk walk; +	struct aegis_block buf; +	unsigned int pos = 0; + +	scatterwalk_start(&walk, sg_src); +	while (assoclen != 0) { +		unsigned int size = scatterwalk_clamp(&walk, assoclen); +		unsigned int left = size; +		void *mapped = scatterwalk_map(&walk); +		const u8 *src = (const u8 *)mapped; + +		if (pos + size >= AEGIS128_BLOCK_SIZE) { +			if (pos > 0) { +				unsigned int fill = AEGIS128_BLOCK_SIZE - pos; +				memcpy(buf.bytes + pos, src, fill); +				crypto_aegis128_aesni_ad(state, +							 AEGIS128_BLOCK_SIZE, +							 buf.bytes); +				pos = 0; +				left -= fill; +				src += fill; +			} + +			crypto_aegis128_aesni_ad(state, left, src); + +			src += left & ~(AEGIS128_BLOCK_SIZE - 1); +			left &= AEGIS128_BLOCK_SIZE - 1; +		} + +		memcpy(buf.bytes + pos, src, left); +		pos += left; +		assoclen -= size; + +		scatterwalk_unmap(mapped); +		scatterwalk_advance(&walk, size); +		scatterwalk_done(&walk, 0, assoclen); +	} + +	if (pos > 0) { +		memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos); +		crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes); +	} +} + +static void crypto_aegis128_aesni_process_crypt( +		struct aegis_state *state, struct aead_request *req, +		const struct aegis_crypt_ops *ops) +{ +	struct skcipher_walk walk; +	u8 *src, *dst; +	unsigned int chunksize, base; + +	ops->skcipher_walk_init(&walk, req, false); + +	while (walk.nbytes) { +		src = walk.src.virt.addr; +		dst = walk.dst.virt.addr; +		chunksize = walk.nbytes; + +		ops->crypt_blocks(state, chunksize, src, dst); + +		base = chunksize & ~(AEGIS128_BLOCK_SIZE - 1); +		src += base; +		dst += base; +		chunksize &= AEGIS128_BLOCK_SIZE - 1; + +		if (chunksize > 0) +			ops->crypt_tail(state, chunksize, src, dst); + +		skcipher_walk_done(&walk, 0); +	} +} + +static struct aegis_ctx *crypto_aegis128_aesni_ctx(struct crypto_aead *aead) +{ +	u8 *ctx = crypto_aead_ctx(aead); +	ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx)); +	return (void *)ctx; +} + +static int crypto_aegis128_aesni_setkey(struct crypto_aead *aead, const u8 *key, +					unsigned int keylen) +{ +	struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(aead); + +	if (keylen != AEGIS128_KEY_SIZE) { +		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); +		return -EINVAL; +	} + +	memcpy(ctx->key.bytes, key, AEGIS128_KEY_SIZE); + +	return 0; +} + +static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm, +						unsigned int authsize) +{ +	if (authsize > AEGIS128_MAX_AUTH_SIZE) +		return -EINVAL; +	if (authsize < AEGIS128_MIN_AUTH_SIZE) +		return -EINVAL; +	return 0; +} + +static void crypto_aegis128_aesni_crypt(struct aead_request *req, +					struct aegis_block *tag_xor, +					unsigned int cryptlen, +					const struct aegis_crypt_ops *ops) +{ +	struct crypto_aead *tfm = crypto_aead_reqtfm(req); +	struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm); +	struct aegis_state state; + +	kernel_fpu_begin(); + +	crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv); +	crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen); +	crypto_aegis128_aesni_process_crypt(&state, req, ops); +	crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); + +	kernel_fpu_end(); +} + +static int crypto_aegis128_aesni_encrypt(struct aead_request *req) +{ +	static const struct aegis_crypt_ops OPS = { +		.skcipher_walk_init = skcipher_walk_aead_encrypt, +		.crypt_blocks = crypto_aegis128_aesni_enc, +		.crypt_tail = crypto_aegis128_aesni_enc_tail, +	}; + +	struct crypto_aead *tfm = crypto_aead_reqtfm(req); +	struct aegis_block tag = 
{}; +	unsigned int authsize = crypto_aead_authsize(tfm); +	unsigned int cryptlen = req->cryptlen; + +	crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); + +	scatterwalk_map_and_copy(tag.bytes, req->dst, +				 req->assoclen + cryptlen, authsize, 1); +	return 0; +} + +static int crypto_aegis128_aesni_decrypt(struct aead_request *req) +{ +	static const struct aegis_block zeros = {}; + +	static const struct aegis_crypt_ops OPS = { +		.skcipher_walk_init = skcipher_walk_aead_decrypt, +		.crypt_blocks = crypto_aegis128_aesni_dec, +		.crypt_tail = crypto_aegis128_aesni_dec_tail, +	}; + +	struct crypto_aead *tfm = crypto_aead_reqtfm(req); +	struct aegis_block tag; +	unsigned int authsize = crypto_aead_authsize(tfm); +	unsigned int cryptlen = req->cryptlen - authsize; + +	scatterwalk_map_and_copy(tag.bytes, req->src, +				 req->assoclen + cryptlen, authsize, 0); + +	crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); + +	return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0; +} + +static int crypto_aegis128_aesni_init_tfm(struct crypto_aead *aead) +{ +	return 0; +} + +static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead) +{ +} + +static int cryptd_aegis128_aesni_setkey(struct crypto_aead *aead, +					const u8 *key, unsigned int keylen) +{ +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); +} + +static int cryptd_aegis128_aesni_setauthsize(struct crypto_aead *aead, +					     unsigned int authsize) +{ +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); +} + +static int cryptd_aegis128_aesni_encrypt(struct aead_request *req) +{ +	struct crypto_aead *aead = crypto_aead_reqtfm(req); +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	aead = &cryptd_tfm->base; +	if (irq_fpu_usable() && (!in_atomic() || +				 !cryptd_aead_queued(cryptd_tfm))) +		aead = cryptd_aead_child(cryptd_tfm); + +	aead_request_set_tfm(req, aead); + +	return crypto_aead_encrypt(req); +} + +static int cryptd_aegis128_aesni_decrypt(struct aead_request *req) +{ +	struct crypto_aead *aead = crypto_aead_reqtfm(req); +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	aead = &cryptd_tfm->base; +	if (irq_fpu_usable() && (!in_atomic() || +				 !cryptd_aead_queued(cryptd_tfm))) +		aead = cryptd_aead_child(cryptd_tfm); + +	aead_request_set_tfm(req, aead); + +	return crypto_aead_decrypt(req); +} + +static int cryptd_aegis128_aesni_init_tfm(struct crypto_aead *aead) +{ +	struct cryptd_aead *cryptd_tfm; +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); + +	cryptd_tfm = cryptd_alloc_aead("__aegis128-aesni", CRYPTO_ALG_INTERNAL, +				       CRYPTO_ALG_INTERNAL); +	if (IS_ERR(cryptd_tfm)) +		return PTR_ERR(cryptd_tfm); + +	*ctx = cryptd_tfm; +	crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); +	return 0; +} + +static void cryptd_aegis128_aesni_exit_tfm(struct crypto_aead *aead) +{ +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); + +	cryptd_free_aead(*ctx); +} + +static struct aead_alg crypto_aegis128_aesni_alg[] = { +	{ +		.setkey = crypto_aegis128_aesni_setkey, +		.setauthsize = crypto_aegis128_aesni_setauthsize, +		.encrypt = crypto_aegis128_aesni_encrypt, +		.decrypt = crypto_aegis128_aesni_decrypt, +		.init = crypto_aegis128_aesni_init_tfm, +		.exit = crypto_aegis128_aesni_exit_tfm, + 
+		.ivsize = AEGIS128_NONCE_SIZE, +		.maxauthsize = AEGIS128_MAX_AUTH_SIZE, +		.chunksize = AEGIS128_BLOCK_SIZE, + +		.base = { +			.cra_flags = CRYPTO_ALG_INTERNAL, +			.cra_blocksize = 1, +			.cra_ctxsize = sizeof(struct aegis_ctx) + +				__alignof__(struct aegis_ctx), +			.cra_alignmask = 0, + +			.cra_name = "__aegis128", +			.cra_driver_name = "__aegis128-aesni", + +			.cra_module = THIS_MODULE, +		} +	}, { +		.setkey = cryptd_aegis128_aesni_setkey, +		.setauthsize = cryptd_aegis128_aesni_setauthsize, +		.encrypt = cryptd_aegis128_aesni_encrypt, +		.decrypt = cryptd_aegis128_aesni_decrypt, +		.init = cryptd_aegis128_aesni_init_tfm, +		.exit = cryptd_aegis128_aesni_exit_tfm, + +		.ivsize = AEGIS128_NONCE_SIZE, +		.maxauthsize = AEGIS128_MAX_AUTH_SIZE, +		.chunksize = AEGIS128_BLOCK_SIZE, + +		.base = { +			.cra_flags = CRYPTO_ALG_ASYNC, +			.cra_blocksize = 1, +			.cra_ctxsize = sizeof(struct cryptd_aead *), +			.cra_alignmask = 0, + +			.cra_priority = 400, + +			.cra_name = "aegis128", +			.cra_driver_name = "aegis128-aesni", + +			.cra_module = THIS_MODULE, +		} +	} +}; + +static const struct x86_cpu_id aesni_cpu_id[] = { +	X86_FEATURE_MATCH(X86_FEATURE_AES), +	X86_FEATURE_MATCH(X86_FEATURE_XMM2), +	{} +}; +MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); + +static int __init crypto_aegis128_aesni_module_init(void) +{ +	if (!x86_match_cpu(aesni_cpu_id)) +		return -ENODEV; + +	return crypto_register_aeads(crypto_aegis128_aesni_alg, +				     ARRAY_SIZE(crypto_aegis128_aesni_alg)); +} + +static void __exit crypto_aegis128_aesni_module_exit(void) +{ +	crypto_unregister_aeads(crypto_aegis128_aesni_alg, +				ARRAY_SIZE(crypto_aegis128_aesni_alg)); +} + +module_init(crypto_aegis128_aesni_module_init); +module_exit(crypto_aegis128_aesni_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek <[email protected]>"); +MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE2 implementation"); +MODULE_ALIAS_CRYPTO("aegis128"); +MODULE_ALIAS_CRYPTO("aegis128-aesni"); diff --git a/arch/x86/crypto/aegis128l-aesni-asm.S b/arch/x86/crypto/aegis128l-aesni-asm.S new file mode 100644 index 000000000000..9263c344f2c7 --- /dev/null +++ b/arch/x86/crypto/aegis128l-aesni-asm.S @@ -0,0 +1,825 @@ +/* + * AES-NI + SSE2 implementation of AEGIS-128L + * + * Copyright (c) 2017-2018 Ondrej Mosnacek <[email protected]> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. 
+ */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +#define STATE0	%xmm0 +#define STATE1	%xmm1 +#define STATE2	%xmm2 +#define STATE3	%xmm3 +#define STATE4	%xmm4 +#define STATE5	%xmm5 +#define STATE6	%xmm6 +#define STATE7	%xmm7 +#define MSG0	%xmm8 +#define MSG1	%xmm9 +#define T0	%xmm10 +#define T1	%xmm11 +#define T2	%xmm12 +#define T3	%xmm13 + +#define STATEP	%rdi +#define LEN	%rsi +#define SRC	%rdx +#define DST	%rcx + +.section .rodata.cst16.aegis128l_const, "aM", @progbits, 32 +.align 16 +.Laegis128l_const_0: +	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d +	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 +.Laegis128l_const_1: +	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 +	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +.section .rodata.cst16.aegis128l_counter, "aM", @progbits, 16 +.align 16 +.Laegis128l_counter0: +	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 +	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +.Laegis128l_counter1: +	.byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 +	.byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f + +.text + +/* + * __load_partial: internal ABI + * input: + *   LEN - bytes + *   SRC - src + * output: + *   MSG0 - first message block + *   MSG1 - second message block + * changed: + *   T0 + *   %r8 + *   %r9 + */ +__load_partial: +	xor %r9, %r9 +	pxor MSG0, MSG0 +	pxor MSG1, MSG1 + +	mov LEN, %r8 +	and $0x1, %r8 +	jz .Lld_partial_1 + +	mov LEN, %r8 +	and $0x1E, %r8 +	add SRC, %r8 +	mov (%r8), %r9b + +.Lld_partial_1: +	mov LEN, %r8 +	and $0x2, %r8 +	jz .Lld_partial_2 + +	mov LEN, %r8 +	and $0x1C, %r8 +	add SRC, %r8 +	shl $0x10, %r9 +	mov (%r8), %r9w + +.Lld_partial_2: +	mov LEN, %r8 +	and $0x4, %r8 +	jz .Lld_partial_4 + +	mov LEN, %r8 +	and $0x18, %r8 +	add SRC, %r8 +	shl $32, %r9 +	mov (%r8), %r8d +	xor %r8, %r9 + +.Lld_partial_4: +	movq %r9, MSG0 + +	mov LEN, %r8 +	and $0x8, %r8 +	jz .Lld_partial_8 + +	mov LEN, %r8 +	and $0x10, %r8 +	add SRC, %r8 +	pslldq $8, MSG0 +	movq (%r8), T0 +	pxor T0, MSG0 + +.Lld_partial_8: +	mov LEN, %r8 +	and $0x10, %r8 +	jz .Lld_partial_16 + +	movdqa MSG0, MSG1 +	movdqu (SRC), MSG0 + +.Lld_partial_16: +	ret +ENDPROC(__load_partial) + +/* + * __store_partial: internal ABI + * input: + *   LEN - bytes + *   DST - dst + * output: + *   T0   - first message block + *   T1   - second message block + * changed: + *   %r8 + *   %r9 + *   %r10 + */ +__store_partial: +	mov LEN, %r8 +	mov DST, %r9 + +	cmp $16, %r8 +	jl .Lst_partial_16 + +	movdqu T0, (%r9) +	movdqa T1, T0 + +	sub $16, %r8 +	add $16, %r9 + +.Lst_partial_16: +	movq T0, %r10 + +	cmp $8, %r8 +	jl .Lst_partial_8 + +	mov %r10, (%r9) +	psrldq $8, T0 +	movq T0, %r10 + +	sub $8, %r8 +	add $8, %r9 + +.Lst_partial_8: +	cmp $4, %r8 +	jl .Lst_partial_4 + +	mov %r10d, (%r9) +	shr $32, %r10 + +	sub $4, %r8 +	add $4, %r9 + +.Lst_partial_4: +	cmp $2, %r8 +	jl .Lst_partial_2 + +	mov %r10w, (%r9) +	shr $0x10, %r10 + +	sub $2, %r8 +	add $2, %r9 + +.Lst_partial_2: +	cmp $1, %r8 +	jl .Lst_partial_1 + +	mov %r10b, (%r9) + +.Lst_partial_1: +	ret +ENDPROC(__store_partial) + +.macro update +	movdqa STATE7, T0 +	aesenc STATE0, STATE7 +	aesenc STATE1, STATE0 +	aesenc STATE2, STATE1 +	aesenc STATE3, STATE2 +	aesenc STATE4, STATE3 +	aesenc STATE5, STATE4 +	aesenc STATE6, STATE5 +	aesenc T0,     STATE6 +.endm + +.macro update0 +	update +	pxor MSG0, STATE7 +	pxor MSG1, STATE3 +.endm + +.macro update1 +	update +	pxor MSG0, STATE6 +	pxor MSG1, STATE2 +.endm + +.macro update2 +	update +	pxor MSG0, STATE5 +	pxor MSG1, STATE1 +.endm + +.macro 
update3 +	update +	pxor MSG0, STATE4 +	pxor MSG1, STATE0 +.endm + +.macro update4 +	update +	pxor MSG0, STATE3 +	pxor MSG1, STATE7 +.endm + +.macro update5 +	update +	pxor MSG0, STATE2 +	pxor MSG1, STATE6 +.endm + +.macro update6 +	update +	pxor MSG0, STATE1 +	pxor MSG1, STATE5 +.endm + +.macro update7 +	update +	pxor MSG0, STATE0 +	pxor MSG1, STATE4 +.endm + +.macro state_load +	movdqu 0x00(STATEP), STATE0 +	movdqu 0x10(STATEP), STATE1 +	movdqu 0x20(STATEP), STATE2 +	movdqu 0x30(STATEP), STATE3 +	movdqu 0x40(STATEP), STATE4 +	movdqu 0x50(STATEP), STATE5 +	movdqu 0x60(STATEP), STATE6 +	movdqu 0x70(STATEP), STATE7 +.endm + +.macro state_store s0 s1 s2 s3 s4 s5 s6 s7 +	movdqu \s7, 0x00(STATEP) +	movdqu \s0, 0x10(STATEP) +	movdqu \s1, 0x20(STATEP) +	movdqu \s2, 0x30(STATEP) +	movdqu \s3, 0x40(STATEP) +	movdqu \s4, 0x50(STATEP) +	movdqu \s5, 0x60(STATEP) +	movdqu \s6, 0x70(STATEP) +.endm + +.macro state_store0 +	state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 +.endm + +.macro state_store1 +	state_store STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 +.endm + +.macro state_store2 +	state_store STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 +.endm + +.macro state_store3 +	state_store STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 +.endm + +.macro state_store4 +	state_store STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 +.endm + +.macro state_store5 +	state_store STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 +.endm + +.macro state_store6 +	state_store STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 +.endm + +.macro state_store7 +	state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 +.endm + +/* + * void crypto_aegis128l_aesni_init(void *state, const void *key, const void *iv); + */ +ENTRY(crypto_aegis128l_aesni_init) +	FRAME_BEGIN + +	/* load key: */ +	movdqa (%rsi), MSG1 +	movdqa MSG1, STATE0 +	movdqa MSG1, STATE4 +	movdqa MSG1, STATE5 +	movdqa MSG1, STATE6 +	movdqa MSG1, STATE7 + +	/* load IV: */ +	movdqu (%rdx), MSG0 +	pxor MSG0, STATE0 +	pxor MSG0, STATE4 + +	/* load the constants: */ +	movdqa .Laegis128l_const_0, STATE2 +	movdqa .Laegis128l_const_1, STATE1 +	movdqa STATE1, STATE3 +	pxor STATE2, STATE5 +	pxor STATE1, STATE6 +	pxor STATE2, STATE7 + +	/* update 10 times with IV and KEY: */ +	update0 +	update1 +	update2 +	update3 +	update4 +	update5 +	update6 +	update7 +	update0 +	update1 + +	state_store1 + +	FRAME_END +	ret +ENDPROC(crypto_aegis128l_aesni_init) + +.macro ad_block a i +	movdq\a (\i * 0x20 + 0x00)(SRC), MSG0 +	movdq\a (\i * 0x20 + 0x10)(SRC), MSG1 +	update\i +	sub $0x20, LEN +	cmp $0x20, LEN +	jl .Lad_out_\i +.endm + +/* + * void crypto_aegis128l_aesni_ad(void *state, unsigned int length, + *                                const void *data); + */ +ENTRY(crypto_aegis128l_aesni_ad) +	FRAME_BEGIN + +	cmp $0x20, LEN +	jb .Lad_out + +	state_load + +	mov  SRC, %r8 +	and $0xf, %r8 +	jnz .Lad_u_loop + +.align 8 +.Lad_a_loop: +	ad_block a 0 +	ad_block a 1 +	ad_block a 2 +	ad_block a 3 +	ad_block a 4 +	ad_block a 5 +	ad_block a 6 +	ad_block a 7 + +	add $0x100, SRC +	jmp .Lad_a_loop + +.align 8 +.Lad_u_loop: +	ad_block u 0 +	ad_block u 1 +	ad_block u 2 +	ad_block u 3 +	ad_block u 4 +	ad_block u 5 +	ad_block u 6 +	ad_block u 7 + +	add $0x100, SRC +	jmp .Lad_u_loop + +.Lad_out_0: +	state_store0 +	FRAME_END +	ret + +.Lad_out_1: +	state_store1 +	FRAME_END +	ret + +.Lad_out_2: +	state_store2 +	FRAME_END +	ret + +.Lad_out_3: +	state_store3 +	FRAME_END +	ret + +.Lad_out_4: +	state_store4 +	FRAME_END +	ret + 
+.Lad_out_5: +	state_store5 +	FRAME_END +	ret + +.Lad_out_6: +	state_store6 +	FRAME_END +	ret + +.Lad_out_7: +	state_store7 +	FRAME_END +	ret + +.Lad_out: +	FRAME_END +	ret +ENDPROC(crypto_aegis128l_aesni_ad) + +.macro crypt m0 m1 s0 s1 s2 s3 s4 s5 s6 s7 +	pxor \s1, \m0 +	pxor \s6, \m0 +	movdqa \s2, T3 +	pand \s3, T3 +	pxor T3, \m0 + +	pxor \s2, \m1 +	pxor \s5, \m1 +	movdqa \s6, T3 +	pand \s7, T3 +	pxor T3, \m1 +.endm + +.macro crypt0 m0 m1 +	crypt \m0 \m1 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 +.endm + +.macro crypt1 m0 m1 +	crypt \m0 \m1 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 +.endm + +.macro crypt2 m0 m1 +	crypt \m0 \m1 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 +.endm + +.macro crypt3 m0 m1 +	crypt \m0 \m1 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 +.endm + +.macro crypt4 m0 m1 +	crypt \m0 \m1 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 +.endm + +.macro crypt5 m0 m1 +	crypt \m0 \m1 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 +.endm + +.macro crypt6 m0 m1 +	crypt \m0 \m1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 +.endm + +.macro crypt7 m0 m1 +	crypt \m0 \m1 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 +.endm + +.macro encrypt_block a i +	movdq\a (\i * 0x20 + 0x00)(SRC), MSG0 +	movdq\a (\i * 0x20 + 0x10)(SRC), MSG1 +	movdqa MSG0, T0 +	movdqa MSG1, T1 +	crypt\i T0, T1 +	movdq\a T0, (\i * 0x20 + 0x00)(DST) +	movdq\a T1, (\i * 0x20 + 0x10)(DST) + +	update\i + +	sub $0x20, LEN +	cmp $0x20, LEN +	jl .Lenc_out_\i +.endm + +.macro decrypt_block a i +	movdq\a (\i * 0x20 + 0x00)(SRC), MSG0 +	movdq\a (\i * 0x20 + 0x10)(SRC), MSG1 +	crypt\i MSG0, MSG1 +	movdq\a MSG0, (\i * 0x20 + 0x00)(DST) +	movdq\a MSG1, (\i * 0x20 + 0x10)(DST) + +	update\i + +	sub $0x20, LEN +	cmp $0x20, LEN +	jl .Ldec_out_\i +.endm + +/* + * void crypto_aegis128l_aesni_enc(void *state, unsigned int length, + *                                 const void *src, void *dst); + */ +ENTRY(crypto_aegis128l_aesni_enc) +	FRAME_BEGIN + +	cmp $0x20, LEN +	jb .Lenc_out + +	state_load + +	mov  SRC, %r8 +	or   DST, %r8 +	and $0xf, %r8 +	jnz .Lenc_u_loop + +.align 8 +.Lenc_a_loop: +	encrypt_block a 0 +	encrypt_block a 1 +	encrypt_block a 2 +	encrypt_block a 3 +	encrypt_block a 4 +	encrypt_block a 5 +	encrypt_block a 6 +	encrypt_block a 7 + +	add $0x100, SRC +	add $0x100, DST +	jmp .Lenc_a_loop + +.align 8 +.Lenc_u_loop: +	encrypt_block u 0 +	encrypt_block u 1 +	encrypt_block u 2 +	encrypt_block u 3 +	encrypt_block u 4 +	encrypt_block u 5 +	encrypt_block u 6 +	encrypt_block u 7 + +	add $0x100, SRC +	add $0x100, DST +	jmp .Lenc_u_loop + +.Lenc_out_0: +	state_store0 +	FRAME_END +	ret + +.Lenc_out_1: +	state_store1 +	FRAME_END +	ret + +.Lenc_out_2: +	state_store2 +	FRAME_END +	ret + +.Lenc_out_3: +	state_store3 +	FRAME_END +	ret + +.Lenc_out_4: +	state_store4 +	FRAME_END +	ret + +.Lenc_out_5: +	state_store5 +	FRAME_END +	ret + +.Lenc_out_6: +	state_store6 +	FRAME_END +	ret + +.Lenc_out_7: +	state_store7 +	FRAME_END +	ret + +.Lenc_out: +	FRAME_END +	ret +ENDPROC(crypto_aegis128l_aesni_enc) + +/* + * void crypto_aegis128l_aesni_enc_tail(void *state, unsigned int length, + *                                      const void *src, void *dst); + */ +ENTRY(crypto_aegis128l_aesni_enc_tail) +	FRAME_BEGIN + +	state_load + +	/* encrypt message: */ +	call __load_partial + +	movdqa MSG0, T0 +	movdqa MSG1, T1 +	crypt0 T0, T1 + +	call __store_partial + +	update0 + +	state_store0 + +	FRAME_END +ENDPROC(crypto_aegis128l_aesni_enc_tail) + +/* + * void 
crypto_aegis128l_aesni_dec(void *state, unsigned int length, + *                                 const void *src, void *dst); + */ +ENTRY(crypto_aegis128l_aesni_dec) +	FRAME_BEGIN + +	cmp $0x20, LEN +	jb .Ldec_out + +	state_load + +	mov  SRC, %r8 +	or   DST, %r8 +	and $0xF, %r8 +	jnz .Ldec_u_loop + +.align 8 +.Ldec_a_loop: +	decrypt_block a 0 +	decrypt_block a 1 +	decrypt_block a 2 +	decrypt_block a 3 +	decrypt_block a 4 +	decrypt_block a 5 +	decrypt_block a 6 +	decrypt_block a 7 + +	add $0x100, SRC +	add $0x100, DST +	jmp .Ldec_a_loop + +.align 8 +.Ldec_u_loop: +	decrypt_block u 0 +	decrypt_block u 1 +	decrypt_block u 2 +	decrypt_block u 3 +	decrypt_block u 4 +	decrypt_block u 5 +	decrypt_block u 6 +	decrypt_block u 7 + +	add $0x100, SRC +	add $0x100, DST +	jmp .Ldec_u_loop + +.Ldec_out_0: +	state_store0 +	FRAME_END +	ret + +.Ldec_out_1: +	state_store1 +	FRAME_END +	ret + +.Ldec_out_2: +	state_store2 +	FRAME_END +	ret + +.Ldec_out_3: +	state_store3 +	FRAME_END +	ret + +.Ldec_out_4: +	state_store4 +	FRAME_END +	ret + +.Ldec_out_5: +	state_store5 +	FRAME_END +	ret + +.Ldec_out_6: +	state_store6 +	FRAME_END +	ret + +.Ldec_out_7: +	state_store7 +	FRAME_END +	ret + +.Ldec_out: +	FRAME_END +	ret +ENDPROC(crypto_aegis128l_aesni_dec) + +/* + * void crypto_aegis128l_aesni_dec_tail(void *state, unsigned int length, + *                                      const void *src, void *dst); + */ +ENTRY(crypto_aegis128l_aesni_dec_tail) +	FRAME_BEGIN + +	state_load + +	/* decrypt message: */ +	call __load_partial + +	crypt0 MSG0, MSG1 + +	movdqa MSG0, T0 +	movdqa MSG1, T1 +	call __store_partial + +	/* mask with byte count: */ +	movq LEN, T0 +	punpcklbw T0, T0 +	punpcklbw T0, T0 +	punpcklbw T0, T0 +	punpcklbw T0, T0 +	movdqa T0, T1 +	movdqa .Laegis128l_counter0, T2 +	movdqa .Laegis128l_counter1, T3 +	pcmpgtb T2, T0 +	pcmpgtb T3, T1 +	pand T0, MSG0 +	pand T1, MSG1 + +	update0 + +	state_store0 + +	FRAME_END +	ret +ENDPROC(crypto_aegis128l_aesni_dec_tail) + +/* + * void crypto_aegis128l_aesni_final(void *state, void *tag_xor, + *                                   u64 assoclen, u64 cryptlen); + */ +ENTRY(crypto_aegis128l_aesni_final) +	FRAME_BEGIN + +	state_load + +	/* prepare length block: */ +	movq %rdx, MSG0 +	movq %rcx, T0 +	pslldq $8, T0 +	pxor T0, MSG0 +	psllq $3, MSG0 /* multiply by 8 (to get bit count) */ + +	pxor STATE2, MSG0 +	movdqa MSG0, MSG1 + +	/* update state: */ +	update0 +	update1 +	update2 +	update3 +	update4 +	update5 +	update6 + +	/* xor tag: */ +	movdqu (%rsi), T0 + +	pxor STATE1, T0 +	pxor STATE2, T0 +	pxor STATE3, T0 +	pxor STATE4, T0 +	pxor STATE5, T0 +	pxor STATE6, T0 +	pxor STATE7, T0 + +	movdqu T0, (%rsi) + +	FRAME_END +	ret +ENDPROC(crypto_aegis128l_aesni_final) diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c new file mode 100644 index 000000000000..876e4866e633 --- /dev/null +++ b/arch/x86/crypto/aegis128l-aesni-glue.c @@ -0,0 +1,407 @@ +/* + * The AEGIS-128L Authenticated-Encryption Algorithm + *   Glue for AES-NI + SSE2 implementation + * + * Copyright (c) 2017-2018 Ondrej Mosnacek <[email protected]> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ */ + +#include <crypto/cryptd.h> +#include <crypto/internal/aead.h> +#include <crypto/internal/skcipher.h> +#include <crypto/scatterwalk.h> +#include <linux/module.h> +#include <asm/fpu/api.h> +#include <asm/cpu_device_id.h> + +#define AEGIS128L_BLOCK_ALIGN 16 +#define AEGIS128L_BLOCK_SIZE 32 +#define AEGIS128L_NONCE_SIZE 16 +#define AEGIS128L_STATE_BLOCKS 8 +#define AEGIS128L_KEY_SIZE 16 +#define AEGIS128L_MIN_AUTH_SIZE 8 +#define AEGIS128L_MAX_AUTH_SIZE 16 + +asmlinkage void crypto_aegis128l_aesni_init(void *state, void *key, void *iv); + +asmlinkage void crypto_aegis128l_aesni_ad( +		void *state, unsigned int length, const void *data); + +asmlinkage void crypto_aegis128l_aesni_enc( +		void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128l_aesni_dec( +		void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128l_aesni_enc_tail( +		void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128l_aesni_dec_tail( +		void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis128l_aesni_final( +		void *state, void *tag_xor, unsigned int cryptlen, +		unsigned int assoclen); + +struct aegis_block { +	u8 bytes[AEGIS128L_BLOCK_SIZE] __aligned(AEGIS128L_BLOCK_ALIGN); +}; + +struct aegis_state { +	struct aegis_block blocks[AEGIS128L_STATE_BLOCKS]; +}; + +struct aegis_ctx { +	struct aegis_block key; +}; + +struct aegis_crypt_ops { +	int (*skcipher_walk_init)(struct skcipher_walk *walk, +				  struct aead_request *req, bool atomic); + +	void (*crypt_blocks)(void *state, unsigned int length, const void *src, +			     void *dst); +	void (*crypt_tail)(void *state, unsigned int length, const void *src, +			   void *dst); +}; + +static void crypto_aegis128l_aesni_process_ad( +		struct aegis_state *state, struct scatterlist *sg_src, +		unsigned int assoclen) +{ +	struct scatter_walk walk; +	struct aegis_block buf; +	unsigned int pos = 0; + +	scatterwalk_start(&walk, sg_src); +	while (assoclen != 0) { +		unsigned int size = scatterwalk_clamp(&walk, assoclen); +		unsigned int left = size; +		void *mapped = scatterwalk_map(&walk); +		const u8 *src = (const u8 *)mapped; + +		if (pos + size >= AEGIS128L_BLOCK_SIZE) { +			if (pos > 0) { +				unsigned int fill = AEGIS128L_BLOCK_SIZE - pos; +				memcpy(buf.bytes + pos, src, fill); +				crypto_aegis128l_aesni_ad(state, +							  AEGIS128L_BLOCK_SIZE, +							  buf.bytes); +				pos = 0; +				left -= fill; +				src += fill; +			} + +			crypto_aegis128l_aesni_ad(state, left, src); + +			src += left & ~(AEGIS128L_BLOCK_SIZE - 1); +			left &= AEGIS128L_BLOCK_SIZE - 1; +		} + +		memcpy(buf.bytes + pos, src, left); +		pos += left; +		assoclen -= size; + +		scatterwalk_unmap(mapped); +		scatterwalk_advance(&walk, size); +		scatterwalk_done(&walk, 0, assoclen); +	} + +	if (pos > 0) { +		memset(buf.bytes + pos, 0, AEGIS128L_BLOCK_SIZE - pos); +		crypto_aegis128l_aesni_ad(state, AEGIS128L_BLOCK_SIZE, buf.bytes); +	} +} + +static void crypto_aegis128l_aesni_process_crypt( +		struct aegis_state *state, struct aead_request *req, +		const struct aegis_crypt_ops *ops) +{ +	struct skcipher_walk walk; +	u8 *src, *dst; +	unsigned int chunksize, base; + +	ops->skcipher_walk_init(&walk, req, false); + +	while (walk.nbytes) { +		src = walk.src.virt.addr; +		dst = walk.dst.virt.addr; +		chunksize = walk.nbytes; + +		ops->crypt_blocks(state, chunksize, src, dst); + +		base = chunksize & ~(AEGIS128L_BLOCK_SIZE - 1); +		src += base; +		dst 
+= base; +		chunksize &= AEGIS128L_BLOCK_SIZE - 1; + +		if (chunksize > 0) +			ops->crypt_tail(state, chunksize, src, dst); + +		skcipher_walk_done(&walk, 0); +	} +} + +static struct aegis_ctx *crypto_aegis128l_aesni_ctx(struct crypto_aead *aead) +{ +	u8 *ctx = crypto_aead_ctx(aead); +	ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx)); +	return (void *)ctx; +} + +static int crypto_aegis128l_aesni_setkey(struct crypto_aead *aead, +					 const u8 *key, unsigned int keylen) +{ +	struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(aead); + +	if (keylen != AEGIS128L_KEY_SIZE) { +		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); +		return -EINVAL; +	} + +	memcpy(ctx->key.bytes, key, AEGIS128L_KEY_SIZE); + +	return 0; +} + +static int crypto_aegis128l_aesni_setauthsize(struct crypto_aead *tfm, +					      unsigned int authsize) +{ +	if (authsize > AEGIS128L_MAX_AUTH_SIZE) +		return -EINVAL; +	if (authsize < AEGIS128L_MIN_AUTH_SIZE) +		return -EINVAL; +	return 0; +} + +static void crypto_aegis128l_aesni_crypt(struct aead_request *req, +					 struct aegis_block *tag_xor, +					 unsigned int cryptlen, +					 const struct aegis_crypt_ops *ops) +{ +	struct crypto_aead *tfm = crypto_aead_reqtfm(req); +	struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(tfm); +	struct aegis_state state; + +	kernel_fpu_begin(); + +	crypto_aegis128l_aesni_init(&state, ctx->key.bytes, req->iv); +	crypto_aegis128l_aesni_process_ad(&state, req->src, req->assoclen); +	crypto_aegis128l_aesni_process_crypt(&state, req, ops); +	crypto_aegis128l_aesni_final(&state, tag_xor, req->assoclen, cryptlen); + +	kernel_fpu_end(); +} + +static int crypto_aegis128l_aesni_encrypt(struct aead_request *req) +{ +	static const struct aegis_crypt_ops OPS = { +		.skcipher_walk_init = skcipher_walk_aead_encrypt, +		.crypt_blocks = crypto_aegis128l_aesni_enc, +		.crypt_tail = crypto_aegis128l_aesni_enc_tail, +	}; + +	struct crypto_aead *tfm = crypto_aead_reqtfm(req); +	struct aegis_block tag = {}; +	unsigned int authsize = crypto_aead_authsize(tfm); +	unsigned int cryptlen = req->cryptlen; + +	crypto_aegis128l_aesni_crypt(req, &tag, cryptlen, &OPS); + +	scatterwalk_map_and_copy(tag.bytes, req->dst, +				 req->assoclen + cryptlen, authsize, 1); +	return 0; +} + +static int crypto_aegis128l_aesni_decrypt(struct aead_request *req) +{ +	static const struct aegis_block zeros = {}; + +	static const struct aegis_crypt_ops OPS = { +		.skcipher_walk_init = skcipher_walk_aead_decrypt, +		.crypt_blocks = crypto_aegis128l_aesni_dec, +		.crypt_tail = crypto_aegis128l_aesni_dec_tail, +	}; + +	struct crypto_aead *tfm = crypto_aead_reqtfm(req); +	struct aegis_block tag; +	unsigned int authsize = crypto_aead_authsize(tfm); +	unsigned int cryptlen = req->cryptlen - authsize; + +	scatterwalk_map_and_copy(tag.bytes, req->src, +				 req->assoclen + cryptlen, authsize, 0); + +	crypto_aegis128l_aesni_crypt(req, &tag, cryptlen, &OPS); + +	return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? 
-EBADMSG : 0; +} + +static int crypto_aegis128l_aesni_init_tfm(struct crypto_aead *aead) +{ +	return 0; +} + +static void crypto_aegis128l_aesni_exit_tfm(struct crypto_aead *aead) +{ +} + +static int cryptd_aegis128l_aesni_setkey(struct crypto_aead *aead, +					 const u8 *key, unsigned int keylen) +{ +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); +} + +static int cryptd_aegis128l_aesni_setauthsize(struct crypto_aead *aead, +					      unsigned int authsize) +{ +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); +} + +static int cryptd_aegis128l_aesni_encrypt(struct aead_request *req) +{ +	struct crypto_aead *aead = crypto_aead_reqtfm(req); +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	aead = &cryptd_tfm->base; +	if (irq_fpu_usable() && (!in_atomic() || +				 !cryptd_aead_queued(cryptd_tfm))) +		aead = cryptd_aead_child(cryptd_tfm); + +	aead_request_set_tfm(req, aead); + +	return crypto_aead_encrypt(req); +} + +static int cryptd_aegis128l_aesni_decrypt(struct aead_request *req) +{ +	struct crypto_aead *aead = crypto_aead_reqtfm(req); +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	aead = &cryptd_tfm->base; +	if (irq_fpu_usable() && (!in_atomic() || +				 !cryptd_aead_queued(cryptd_tfm))) +		aead = cryptd_aead_child(cryptd_tfm); + +	aead_request_set_tfm(req, aead); + +	return crypto_aead_decrypt(req); +} + +static int cryptd_aegis128l_aesni_init_tfm(struct crypto_aead *aead) +{ +	struct cryptd_aead *cryptd_tfm; +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); + +	cryptd_tfm = cryptd_alloc_aead("__aegis128l-aesni", CRYPTO_ALG_INTERNAL, +				       CRYPTO_ALG_INTERNAL); +	if (IS_ERR(cryptd_tfm)) +		return PTR_ERR(cryptd_tfm); + +	*ctx = cryptd_tfm; +	crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); +	return 0; +} + +static void cryptd_aegis128l_aesni_exit_tfm(struct crypto_aead *aead) +{ +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); + +	cryptd_free_aead(*ctx); +} + +static struct aead_alg crypto_aegis128l_aesni_alg[] = { +	{ +		.setkey = crypto_aegis128l_aesni_setkey, +		.setauthsize = crypto_aegis128l_aesni_setauthsize, +		.encrypt = crypto_aegis128l_aesni_encrypt, +		.decrypt = crypto_aegis128l_aesni_decrypt, +		.init = crypto_aegis128l_aesni_init_tfm, +		.exit = crypto_aegis128l_aesni_exit_tfm, + +		.ivsize = AEGIS128L_NONCE_SIZE, +		.maxauthsize = AEGIS128L_MAX_AUTH_SIZE, +		.chunksize = AEGIS128L_BLOCK_SIZE, + +		.base = { +			.cra_flags = CRYPTO_ALG_INTERNAL, +			.cra_blocksize = 1, +			.cra_ctxsize = sizeof(struct aegis_ctx) + +				__alignof__(struct aegis_ctx), +			.cra_alignmask = 0, + +			.cra_name = "__aegis128l", +			.cra_driver_name = "__aegis128l-aesni", + +			.cra_module = THIS_MODULE, +		} +	}, { +		.setkey = cryptd_aegis128l_aesni_setkey, +		.setauthsize = cryptd_aegis128l_aesni_setauthsize, +		.encrypt = cryptd_aegis128l_aesni_encrypt, +		.decrypt = cryptd_aegis128l_aesni_decrypt, +		.init = cryptd_aegis128l_aesni_init_tfm, +		.exit = cryptd_aegis128l_aesni_exit_tfm, + +		.ivsize = AEGIS128L_NONCE_SIZE, +		.maxauthsize = AEGIS128L_MAX_AUTH_SIZE, +		.chunksize = AEGIS128L_BLOCK_SIZE, + +		.base = { +			.cra_flags = CRYPTO_ALG_ASYNC, +			.cra_blocksize = 1, +			.cra_ctxsize = sizeof(struct cryptd_aead *), +			.cra_alignmask = 0, + +			
.cra_priority = 400, + +			.cra_name = "aegis128l", +			.cra_driver_name = "aegis128l-aesni", + +			.cra_module = THIS_MODULE, +		} +	} +}; + +static const struct x86_cpu_id aesni_cpu_id[] = { +	X86_FEATURE_MATCH(X86_FEATURE_AES), +	X86_FEATURE_MATCH(X86_FEATURE_XMM2), +	{} +}; +MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); + +static int __init crypto_aegis128l_aesni_module_init(void) +{ +	if (!x86_match_cpu(aesni_cpu_id)) +		return -ENODEV; + +	return crypto_register_aeads(crypto_aegis128l_aesni_alg, +				     ARRAY_SIZE(crypto_aegis128l_aesni_alg)); +} + +static void __exit crypto_aegis128l_aesni_module_exit(void) +{ +	crypto_unregister_aeads(crypto_aegis128l_aesni_alg, +				ARRAY_SIZE(crypto_aegis128l_aesni_alg)); +} + +module_init(crypto_aegis128l_aesni_module_init); +module_exit(crypto_aegis128l_aesni_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek <[email protected]>"); +MODULE_DESCRIPTION("AEGIS-128L AEAD algorithm -- AESNI+SSE2 implementation"); +MODULE_ALIAS_CRYPTO("aegis128l"); +MODULE_ALIAS_CRYPTO("aegis128l-aesni"); diff --git a/arch/x86/crypto/aegis256-aesni-asm.S b/arch/x86/crypto/aegis256-aesni-asm.S new file mode 100644 index 000000000000..1d977d515bf9 --- /dev/null +++ b/arch/x86/crypto/aegis256-aesni-asm.S @@ -0,0 +1,702 @@ +/* + * AES-NI + SSE2 implementation of AEGIS-128L + * + * Copyright (c) 2017-2018 Ondrej Mosnacek <[email protected]> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +#define STATE0	%xmm0 +#define STATE1	%xmm1 +#define STATE2	%xmm2 +#define STATE3	%xmm3 +#define STATE4	%xmm4 +#define STATE5	%xmm5 +#define MSG	%xmm6 +#define T0	%xmm7 +#define T1	%xmm8 +#define T2	%xmm9 +#define T3	%xmm10 + +#define STATEP	%rdi +#define LEN	%rsi +#define SRC	%rdx +#define DST	%rcx + +.section .rodata.cst16.aegis256_const, "aM", @progbits, 32 +.align 16 +.Laegis256_const_0: +	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d +	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 +.Laegis256_const_1: +	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 +	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +.section .rodata.cst16.aegis256_counter, "aM", @progbits, 16 +.align 16 +.Laegis256_counter: +	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 +	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + +.text + +/* + * __load_partial: internal ABI + * input: + *   LEN - bytes + *   SRC - src + * output: + *   MSG  - message block + * changed: + *   T0 + *   %r8 + *   %r9 + */ +__load_partial: +	xor %r9, %r9 +	pxor MSG, MSG + +	mov LEN, %r8 +	and $0x1, %r8 +	jz .Lld_partial_1 + +	mov LEN, %r8 +	and $0x1E, %r8 +	add SRC, %r8 +	mov (%r8), %r9b + +.Lld_partial_1: +	mov LEN, %r8 +	and $0x2, %r8 +	jz .Lld_partial_2 + +	mov LEN, %r8 +	and $0x1C, %r8 +	add SRC, %r8 +	shl $0x10, %r9 +	mov (%r8), %r9w + +.Lld_partial_2: +	mov LEN, %r8 +	and $0x4, %r8 +	jz .Lld_partial_4 + +	mov LEN, %r8 +	and $0x18, %r8 +	add SRC, %r8 +	shl $32, %r9 +	mov (%r8), %r8d +	xor %r8, %r9 + +.Lld_partial_4: +	movq %r9, MSG + +	mov LEN, %r8 +	and $0x8, %r8 +	jz .Lld_partial_8 + +	mov LEN, %r8 +	and $0x10, %r8 +	add SRC, %r8 +	pslldq $8, MSG +	movq (%r8), T0 +	pxor T0, MSG + +.Lld_partial_8: +	ret +ENDPROC(__load_partial) + +/* + * __store_partial: internal ABI + * input: + *   LEN - bytes + *   DST - dst + * 
output: + *   T0   - message block + * changed: + *   %r8 + *   %r9 + *   %r10 + */ +__store_partial: +	mov LEN, %r8 +	mov DST, %r9 + +	movq T0, %r10 + +	cmp $8, %r8 +	jl .Lst_partial_8 + +	mov %r10, (%r9) +	psrldq $8, T0 +	movq T0, %r10 + +	sub $8, %r8 +	add $8, %r9 + +.Lst_partial_8: +	cmp $4, %r8 +	jl .Lst_partial_4 + +	mov %r10d, (%r9) +	shr $32, %r10 + +	sub $4, %r8 +	add $4, %r9 + +.Lst_partial_4: +	cmp $2, %r8 +	jl .Lst_partial_2 + +	mov %r10w, (%r9) +	shr $0x10, %r10 + +	sub $2, %r8 +	add $2, %r9 + +.Lst_partial_2: +	cmp $1, %r8 +	jl .Lst_partial_1 + +	mov %r10b, (%r9) + +.Lst_partial_1: +	ret +ENDPROC(__store_partial) + +.macro update +	movdqa STATE5, T0 +	aesenc STATE0, STATE5 +	aesenc STATE1, STATE0 +	aesenc STATE2, STATE1 +	aesenc STATE3, STATE2 +	aesenc STATE4, STATE3 +	aesenc T0,     STATE4 +.endm + +.macro update0 m +	update +	pxor \m, STATE5 +.endm + +.macro update1 m +	update +	pxor \m, STATE4 +.endm + +.macro update2 m +	update +	pxor \m, STATE3 +.endm + +.macro update3 m +	update +	pxor \m, STATE2 +.endm + +.macro update4 m +	update +	pxor \m, STATE1 +.endm + +.macro update5 m +	update +	pxor \m, STATE0 +.endm + +.macro state_load +	movdqu 0x00(STATEP), STATE0 +	movdqu 0x10(STATEP), STATE1 +	movdqu 0x20(STATEP), STATE2 +	movdqu 0x30(STATEP), STATE3 +	movdqu 0x40(STATEP), STATE4 +	movdqu 0x50(STATEP), STATE5 +.endm + +.macro state_store s0 s1 s2 s3 s4 s5 +	movdqu \s5, 0x00(STATEP) +	movdqu \s0, 0x10(STATEP) +	movdqu \s1, 0x20(STATEP) +	movdqu \s2, 0x30(STATEP) +	movdqu \s3, 0x40(STATEP) +	movdqu \s4, 0x50(STATEP) +.endm + +.macro state_store0 +	state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 +.endm + +.macro state_store1 +	state_store STATE5 STATE0 STATE1 STATE2 STATE3 STATE4 +.endm + +.macro state_store2 +	state_store STATE4 STATE5 STATE0 STATE1 STATE2 STATE3 +.endm + +.macro state_store3 +	state_store STATE3 STATE4 STATE5 STATE0 STATE1 STATE2 +.endm + +.macro state_store4 +	state_store STATE2 STATE3 STATE4 STATE5 STATE0 STATE1 +.endm + +.macro state_store5 +	state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE0 +.endm + +/* + * void crypto_aegis256_aesni_init(void *state, const void *key, const void *iv); + */ +ENTRY(crypto_aegis256_aesni_init) +	FRAME_BEGIN + +	/* load key: */ +	movdqa 0x00(%rsi), MSG +	movdqa 0x10(%rsi), T1 +	movdqa MSG, STATE4 +	movdqa T1, STATE5 + +	/* load IV: */ +	movdqu 0x00(%rdx), T2 +	movdqu 0x10(%rdx), T3 +	pxor MSG, T2 +	pxor T1, T3 +	movdqa T2, STATE0 +	movdqa T3, STATE1 + +	/* load the constants: */ +	movdqa .Laegis256_const_0, STATE3 +	movdqa .Laegis256_const_1, STATE2 +	pxor STATE3, STATE4 +	pxor STATE2, STATE5 + +	/* update 10 times with IV and KEY: */ +	update0 MSG +	update1 T1 +	update2 T2 +	update3 T3 +	update4 MSG +	update5 T1 +	update0 T2 +	update1 T3 +	update2 MSG +	update3 T1 +	update4 T2 +	update5 T3 +	update0 MSG +	update1 T1 +	update2 T2 +	update3 T3 + +	state_store3 + +	FRAME_END +	ret +ENDPROC(crypto_aegis256_aesni_init) + +.macro ad_block a i +	movdq\a (\i * 0x10)(SRC), MSG +	update\i MSG +	sub $0x10, LEN +	cmp $0x10, LEN +	jl .Lad_out_\i +.endm + +/* + * void crypto_aegis256_aesni_ad(void *state, unsigned int length, + *                               const void *data); + */ +ENTRY(crypto_aegis256_aesni_ad) +	FRAME_BEGIN + +	cmp $0x10, LEN +	jb .Lad_out + +	state_load + +	mov  SRC, %r8 +	and $0xf, %r8 +	jnz .Lad_u_loop + +.align 8 +.Lad_a_loop: +	ad_block a 0 +	ad_block a 1 +	ad_block a 2 +	ad_block a 3 +	ad_block a 4 +	ad_block a 5 + +	add $0x60, SRC +	jmp .Lad_a_loop + +.align 8 +.Lad_u_loop: +	ad_block u 0 +	
ad_block u 1 +	ad_block u 2 +	ad_block u 3 +	ad_block u 4 +	ad_block u 5 + +	add $0x60, SRC +	jmp .Lad_u_loop + +.Lad_out_0: +	state_store0 +	FRAME_END +	ret + +.Lad_out_1: +	state_store1 +	FRAME_END +	ret + +.Lad_out_2: +	state_store2 +	FRAME_END +	ret + +.Lad_out_3: +	state_store3 +	FRAME_END +	ret + +.Lad_out_4: +	state_store4 +	FRAME_END +	ret + +.Lad_out_5: +	state_store5 +	FRAME_END +	ret + +.Lad_out: +	FRAME_END +	ret +ENDPROC(crypto_aegis256_aesni_ad) + +.macro crypt m s0 s1 s2 s3 s4 s5 +	pxor \s1, \m +	pxor \s4, \m +	pxor \s5, \m +	movdqa \s2, T3 +	pand \s3, T3 +	pxor T3, \m +.endm + +.macro crypt0 m +	crypt \m STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 +.endm + +.macro crypt1 m +	crypt \m STATE5 STATE0 STATE1 STATE2 STATE3 STATE4 +.endm + +.macro crypt2 m +	crypt \m STATE4 STATE5 STATE0 STATE1 STATE2 STATE3 +.endm + +.macro crypt3 m +	crypt \m STATE3 STATE4 STATE5 STATE0 STATE1 STATE2 +.endm + +.macro crypt4 m +	crypt \m STATE2 STATE3 STATE4 STATE5 STATE0 STATE1 +.endm + +.macro crypt5 m +	crypt \m STATE1 STATE2 STATE3 STATE4 STATE5 STATE0 +.endm + +.macro encrypt_block a i +	movdq\a (\i * 0x10)(SRC), MSG +	movdqa MSG, T0 +	crypt\i T0 +	movdq\a T0, (\i * 0x10)(DST) + +	update\i MSG + +	sub $0x10, LEN +	cmp $0x10, LEN +	jl .Lenc_out_\i +.endm + +.macro decrypt_block a i +	movdq\a (\i * 0x10)(SRC), MSG +	crypt\i MSG +	movdq\a MSG, (\i * 0x10)(DST) + +	update\i MSG + +	sub $0x10, LEN +	cmp $0x10, LEN +	jl .Ldec_out_\i +.endm + +/* + * void crypto_aegis256_aesni_enc(void *state, unsigned int length, + *                                const void *src, void *dst); + */ +ENTRY(crypto_aegis256_aesni_enc) +	FRAME_BEGIN + +	cmp $0x10, LEN +	jb .Lenc_out + +	state_load + +	mov  SRC, %r8 +	or   DST, %r8 +	and $0xf, %r8 +	jnz .Lenc_u_loop + +.align 8 +.Lenc_a_loop: +	encrypt_block a 0 +	encrypt_block a 1 +	encrypt_block a 2 +	encrypt_block a 3 +	encrypt_block a 4 +	encrypt_block a 5 + +	add $0x60, SRC +	add $0x60, DST +	jmp .Lenc_a_loop + +.align 8 +.Lenc_u_loop: +	encrypt_block u 0 +	encrypt_block u 1 +	encrypt_block u 2 +	encrypt_block u 3 +	encrypt_block u 4 +	encrypt_block u 5 + +	add $0x60, SRC +	add $0x60, DST +	jmp .Lenc_u_loop + +.Lenc_out_0: +	state_store0 +	FRAME_END +	ret + +.Lenc_out_1: +	state_store1 +	FRAME_END +	ret + +.Lenc_out_2: +	state_store2 +	FRAME_END +	ret + +.Lenc_out_3: +	state_store3 +	FRAME_END +	ret + +.Lenc_out_4: +	state_store4 +	FRAME_END +	ret + +.Lenc_out_5: +	state_store5 +	FRAME_END +	ret + +.Lenc_out: +	FRAME_END +	ret +ENDPROC(crypto_aegis256_aesni_enc) + +/* + * void crypto_aegis256_aesni_enc_tail(void *state, unsigned int length, + *                                     const void *src, void *dst); + */ +ENTRY(crypto_aegis256_aesni_enc_tail) +	FRAME_BEGIN + +	state_load + +	/* encrypt message: */ +	call __load_partial + +	movdqa MSG, T0 +	crypt0 T0 + +	call __store_partial + +	update0 MSG + +	state_store0 + +	FRAME_END +ENDPROC(crypto_aegis256_aesni_enc_tail) + +/* + * void crypto_aegis256_aesni_dec(void *state, unsigned int length, + *                                const void *src, void *dst); + */ +ENTRY(crypto_aegis256_aesni_dec) +	FRAME_BEGIN + +	cmp $0x10, LEN +	jb .Ldec_out + +	state_load + +	mov  SRC, %r8 +	or   DST, %r8 +	and $0xF, %r8 +	jnz .Ldec_u_loop + +.align 8 +.Ldec_a_loop: +	decrypt_block a 0 +	decrypt_block a 1 +	decrypt_block a 2 +	decrypt_block a 3 +	decrypt_block a 4 +	decrypt_block a 5 + +	add $0x60, SRC +	add $0x60, DST +	jmp .Ldec_a_loop + +.align 8 +.Ldec_u_loop: +	decrypt_block u 0 +	decrypt_block u 1 +	decrypt_block u 2 +	
decrypt_block u 3 +	decrypt_block u 4 +	decrypt_block u 5 + +	add $0x60, SRC +	add $0x60, DST +	jmp .Ldec_u_loop + +.Ldec_out_0: +	state_store0 +	FRAME_END +	ret + +.Ldec_out_1: +	state_store1 +	FRAME_END +	ret + +.Ldec_out_2: +	state_store2 +	FRAME_END +	ret + +.Ldec_out_3: +	state_store3 +	FRAME_END +	ret + +.Ldec_out_4: +	state_store4 +	FRAME_END +	ret + +.Ldec_out_5: +	state_store5 +	FRAME_END +	ret + +.Ldec_out: +	FRAME_END +	ret +ENDPROC(crypto_aegis256_aesni_dec) + +/* + * void crypto_aegis256_aesni_dec_tail(void *state, unsigned int length, + *                                     const void *src, void *dst); + */ +ENTRY(crypto_aegis256_aesni_dec_tail) +	FRAME_BEGIN + +	state_load + +	/* decrypt message: */ +	call __load_partial + +	crypt0 MSG + +	movdqa MSG, T0 +	call __store_partial + +	/* mask with byte count: */ +	movq LEN, T0 +	punpcklbw T0, T0 +	punpcklbw T0, T0 +	punpcklbw T0, T0 +	punpcklbw T0, T0 +	movdqa .Laegis256_counter, T1 +	pcmpgtb T1, T0 +	pand T0, MSG + +	update0 MSG + +	state_store0 + +	FRAME_END +	ret +ENDPROC(crypto_aegis256_aesni_dec_tail) + +/* + * void crypto_aegis256_aesni_final(void *state, void *tag_xor, + *                                  u64 assoclen, u64 cryptlen); + */ +ENTRY(crypto_aegis256_aesni_final) +	FRAME_BEGIN + +	state_load + +	/* prepare length block: */ +	movq %rdx, MSG +	movq %rcx, T0 +	pslldq $8, T0 +	pxor T0, MSG +	psllq $3, MSG /* multiply by 8 (to get bit count) */ + +	pxor STATE3, MSG + +	/* update state: */ +	update0 MSG +	update1 MSG +	update2 MSG +	update3 MSG +	update4 MSG +	update5 MSG +	update0 MSG + +	/* xor tag: */ +	movdqu (%rsi), MSG + +	pxor STATE0, MSG +	pxor STATE1, MSG +	pxor STATE2, MSG +	pxor STATE3, MSG +	pxor STATE4, MSG +	pxor STATE5, MSG + +	movdqu MSG, (%rsi) + +	FRAME_END +	ret +ENDPROC(crypto_aegis256_aesni_final) diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c new file mode 100644 index 000000000000..2b5dd3af8f4d --- /dev/null +++ b/arch/x86/crypto/aegis256-aesni-glue.c @@ -0,0 +1,407 @@ +/* + * The AEGIS-256 Authenticated-Encryption Algorithm + *   Glue for AES-NI + SSE2 implementation + * + * Copyright (c) 2017-2018 Ondrej Mosnacek <[email protected]> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ */ + +#include <crypto/cryptd.h> +#include <crypto/internal/aead.h> +#include <crypto/internal/skcipher.h> +#include <crypto/scatterwalk.h> +#include <linux/module.h> +#include <asm/fpu/api.h> +#include <asm/cpu_device_id.h> + +#define AEGIS256_BLOCK_ALIGN 16 +#define AEGIS256_BLOCK_SIZE 16 +#define AEGIS256_NONCE_SIZE 32 +#define AEGIS256_STATE_BLOCKS 6 +#define AEGIS256_KEY_SIZE 32 +#define AEGIS256_MIN_AUTH_SIZE 8 +#define AEGIS256_MAX_AUTH_SIZE 16 + +asmlinkage void crypto_aegis256_aesni_init(void *state, void *key, void *iv); + +asmlinkage void crypto_aegis256_aesni_ad( +		void *state, unsigned int length, const void *data); + +asmlinkage void crypto_aegis256_aesni_enc( +		void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis256_aesni_dec( +		void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis256_aesni_enc_tail( +		void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis256_aesni_dec_tail( +		void *state, unsigned int length, const void *src, void *dst); + +asmlinkage void crypto_aegis256_aesni_final( +		void *state, void *tag_xor, unsigned int cryptlen, +		unsigned int assoclen); + +struct aegis_block { +	u8 bytes[AEGIS256_BLOCK_SIZE] __aligned(AEGIS256_BLOCK_ALIGN); +}; + +struct aegis_state { +	struct aegis_block blocks[AEGIS256_STATE_BLOCKS]; +}; + +struct aegis_ctx { +	struct aegis_block key[AEGIS256_KEY_SIZE / AEGIS256_BLOCK_SIZE]; +}; + +struct aegis_crypt_ops { +	int (*skcipher_walk_init)(struct skcipher_walk *walk, +				  struct aead_request *req, bool atomic); + +	void (*crypt_blocks)(void *state, unsigned int length, const void *src, +			     void *dst); +	void (*crypt_tail)(void *state, unsigned int length, const void *src, +			   void *dst); +}; + +static void crypto_aegis256_aesni_process_ad( +		struct aegis_state *state, struct scatterlist *sg_src, +		unsigned int assoclen) +{ +	struct scatter_walk walk; +	struct aegis_block buf; +	unsigned int pos = 0; + +	scatterwalk_start(&walk, sg_src); +	while (assoclen != 0) { +		unsigned int size = scatterwalk_clamp(&walk, assoclen); +		unsigned int left = size; +		void *mapped = scatterwalk_map(&walk); +		const u8 *src = (const u8 *)mapped; + +		if (pos + size >= AEGIS256_BLOCK_SIZE) { +			if (pos > 0) { +				unsigned int fill = AEGIS256_BLOCK_SIZE - pos; +				memcpy(buf.bytes + pos, src, fill); +				crypto_aegis256_aesni_ad(state, +							 AEGIS256_BLOCK_SIZE, +							 buf.bytes); +				pos = 0; +				left -= fill; +				src += fill; +			} + +			crypto_aegis256_aesni_ad(state, left, src); + +			src += left & ~(AEGIS256_BLOCK_SIZE - 1); +			left &= AEGIS256_BLOCK_SIZE - 1; +		} + +		memcpy(buf.bytes + pos, src, left); +		pos += left; +		assoclen -= size; + +		scatterwalk_unmap(mapped); +		scatterwalk_advance(&walk, size); +		scatterwalk_done(&walk, 0, assoclen); +	} + +	if (pos > 0) { +		memset(buf.bytes + pos, 0, AEGIS256_BLOCK_SIZE - pos); +		crypto_aegis256_aesni_ad(state, AEGIS256_BLOCK_SIZE, buf.bytes); +	} +} + +static void crypto_aegis256_aesni_process_crypt( +		struct aegis_state *state, struct aead_request *req, +		const struct aegis_crypt_ops *ops) +{ +	struct skcipher_walk walk; +	u8 *src, *dst; +	unsigned int chunksize, base; + +	ops->skcipher_walk_init(&walk, req, false); + +	while (walk.nbytes) { +		src = walk.src.virt.addr; +		dst = walk.dst.virt.addr; +		chunksize = walk.nbytes; + +		ops->crypt_blocks(state, chunksize, src, dst); + +		base = chunksize & ~(AEGIS256_BLOCK_SIZE - 1); +		src += 
base; +		dst += base; +		chunksize &= AEGIS256_BLOCK_SIZE - 1; + +		if (chunksize > 0) +			ops->crypt_tail(state, chunksize, src, dst); + +		skcipher_walk_done(&walk, 0); +	} +} + +static struct aegis_ctx *crypto_aegis256_aesni_ctx(struct crypto_aead *aead) +{ +	u8 *ctx = crypto_aead_ctx(aead); +	ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx)); +	return (void *)ctx; +} + +static int crypto_aegis256_aesni_setkey(struct crypto_aead *aead, const u8 *key, +					unsigned int keylen) +{ +	struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(aead); + +	if (keylen != AEGIS256_KEY_SIZE) { +		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); +		return -EINVAL; +	} + +	memcpy(ctx->key, key, AEGIS256_KEY_SIZE); + +	return 0; +} + +static int crypto_aegis256_aesni_setauthsize(struct crypto_aead *tfm, +						unsigned int authsize) +{ +	if (authsize > AEGIS256_MAX_AUTH_SIZE) +		return -EINVAL; +	if (authsize < AEGIS256_MIN_AUTH_SIZE) +		return -EINVAL; +	return 0; +} + +static void crypto_aegis256_aesni_crypt(struct aead_request *req, +					struct aegis_block *tag_xor, +					unsigned int cryptlen, +					const struct aegis_crypt_ops *ops) +{ +	struct crypto_aead *tfm = crypto_aead_reqtfm(req); +	struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(tfm); +	struct aegis_state state; + +	kernel_fpu_begin(); + +	crypto_aegis256_aesni_init(&state, ctx->key, req->iv); +	crypto_aegis256_aesni_process_ad(&state, req->src, req->assoclen); +	crypto_aegis256_aesni_process_crypt(&state, req, ops); +	crypto_aegis256_aesni_final(&state, tag_xor, req->assoclen, cryptlen); + +	kernel_fpu_end(); +} + +static int crypto_aegis256_aesni_encrypt(struct aead_request *req) +{ +	static const struct aegis_crypt_ops OPS = { +		.skcipher_walk_init = skcipher_walk_aead_encrypt, +		.crypt_blocks = crypto_aegis256_aesni_enc, +		.crypt_tail = crypto_aegis256_aesni_enc_tail, +	}; + +	struct crypto_aead *tfm = crypto_aead_reqtfm(req); +	struct aegis_block tag = {}; +	unsigned int authsize = crypto_aead_authsize(tfm); +	unsigned int cryptlen = req->cryptlen; + +	crypto_aegis256_aesni_crypt(req, &tag, cryptlen, &OPS); + +	scatterwalk_map_and_copy(tag.bytes, req->dst, +				 req->assoclen + cryptlen, authsize, 1); +	return 0; +} + +static int crypto_aegis256_aesni_decrypt(struct aead_request *req) +{ +	static const struct aegis_block zeros = {}; + +	static const struct aegis_crypt_ops OPS = { +		.skcipher_walk_init = skcipher_walk_aead_decrypt, +		.crypt_blocks = crypto_aegis256_aesni_dec, +		.crypt_tail = crypto_aegis256_aesni_dec_tail, +	}; + +	struct crypto_aead *tfm = crypto_aead_reqtfm(req); +	struct aegis_block tag; +	unsigned int authsize = crypto_aead_authsize(tfm); +	unsigned int cryptlen = req->cryptlen - authsize; + +	scatterwalk_map_and_copy(tag.bytes, req->src, +				 req->assoclen + cryptlen, authsize, 0); + +	crypto_aegis256_aesni_crypt(req, &tag, cryptlen, &OPS); + +	return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? 
-EBADMSG : 0; +} + +static int crypto_aegis256_aesni_init_tfm(struct crypto_aead *aead) +{ +	return 0; +} + +static void crypto_aegis256_aesni_exit_tfm(struct crypto_aead *aead) +{ +} + +static int cryptd_aegis256_aesni_setkey(struct crypto_aead *aead, +					const u8 *key, unsigned int keylen) +{ +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); +} + +static int cryptd_aegis256_aesni_setauthsize(struct crypto_aead *aead, +					     unsigned int authsize) +{ +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); +} + +static int cryptd_aegis256_aesni_encrypt(struct aead_request *req) +{ +	struct crypto_aead *aead = crypto_aead_reqtfm(req); +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	aead = &cryptd_tfm->base; +	if (irq_fpu_usable() && (!in_atomic() || +				 !cryptd_aead_queued(cryptd_tfm))) +		aead = cryptd_aead_child(cryptd_tfm); + +	aead_request_set_tfm(req, aead); + +	return crypto_aead_encrypt(req); +} + +static int cryptd_aegis256_aesni_decrypt(struct aead_request *req) +{ +	struct crypto_aead *aead = crypto_aead_reqtfm(req); +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	aead = &cryptd_tfm->base; +	if (irq_fpu_usable() && (!in_atomic() || +				 !cryptd_aead_queued(cryptd_tfm))) +		aead = cryptd_aead_child(cryptd_tfm); + +	aead_request_set_tfm(req, aead); + +	return crypto_aead_decrypt(req); +} + +static int cryptd_aegis256_aesni_init_tfm(struct crypto_aead *aead) +{ +	struct cryptd_aead *cryptd_tfm; +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); + +	cryptd_tfm = cryptd_alloc_aead("__aegis256-aesni", CRYPTO_ALG_INTERNAL, +				       CRYPTO_ALG_INTERNAL); +	if (IS_ERR(cryptd_tfm)) +		return PTR_ERR(cryptd_tfm); + +	*ctx = cryptd_tfm; +	crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); +	return 0; +} + +static void cryptd_aegis256_aesni_exit_tfm(struct crypto_aead *aead) +{ +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); + +	cryptd_free_aead(*ctx); +} + +static struct aead_alg crypto_aegis256_aesni_alg[] = { +	{ +		.setkey = crypto_aegis256_aesni_setkey, +		.setauthsize = crypto_aegis256_aesni_setauthsize, +		.encrypt = crypto_aegis256_aesni_encrypt, +		.decrypt = crypto_aegis256_aesni_decrypt, +		.init = crypto_aegis256_aesni_init_tfm, +		.exit = crypto_aegis256_aesni_exit_tfm, + +		.ivsize = AEGIS256_NONCE_SIZE, +		.maxauthsize = AEGIS256_MAX_AUTH_SIZE, +		.chunksize = AEGIS256_BLOCK_SIZE, + +		.base = { +			.cra_flags = CRYPTO_ALG_INTERNAL, +			.cra_blocksize = 1, +			.cra_ctxsize = sizeof(struct aegis_ctx) + +				__alignof__(struct aegis_ctx), +			.cra_alignmask = 0, + +			.cra_name = "__aegis256", +			.cra_driver_name = "__aegis256-aesni", + +			.cra_module = THIS_MODULE, +		} +	}, { +		.setkey = cryptd_aegis256_aesni_setkey, +		.setauthsize = cryptd_aegis256_aesni_setauthsize, +		.encrypt = cryptd_aegis256_aesni_encrypt, +		.decrypt = cryptd_aegis256_aesni_decrypt, +		.init = cryptd_aegis256_aesni_init_tfm, +		.exit = cryptd_aegis256_aesni_exit_tfm, + +		.ivsize = AEGIS256_NONCE_SIZE, +		.maxauthsize = AEGIS256_MAX_AUTH_SIZE, +		.chunksize = AEGIS256_BLOCK_SIZE, + +		.base = { +			.cra_flags = CRYPTO_ALG_ASYNC, +			.cra_blocksize = 1, +			.cra_ctxsize = sizeof(struct cryptd_aead *), +			.cra_alignmask = 0, + +			.cra_priority = 400, + +			
.cra_name = "aegis256", +			.cra_driver_name = "aegis256-aesni", + +			.cra_module = THIS_MODULE, +		} +	} +}; + +static const struct x86_cpu_id aesni_cpu_id[] = { +	X86_FEATURE_MATCH(X86_FEATURE_AES), +	X86_FEATURE_MATCH(X86_FEATURE_XMM2), +	{} +}; +MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); + +static int __init crypto_aegis256_aesni_module_init(void) +{ +	if (!x86_match_cpu(aesni_cpu_id)) +		return -ENODEV; + +	return crypto_register_aeads(crypto_aegis256_aesni_alg, +				    ARRAY_SIZE(crypto_aegis256_aesni_alg)); +} + +static void __exit crypto_aegis256_aesni_module_exit(void) +{ +	crypto_unregister_aeads(crypto_aegis256_aesni_alg, +				ARRAY_SIZE(crypto_aegis256_aesni_alg)); +} + +module_init(crypto_aegis256_aesni_module_init); +module_exit(crypto_aegis256_aesni_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek <[email protected]>"); +MODULE_DESCRIPTION("AEGIS-256 AEAD algorithm -- AESNI+SSE2 implementation"); +MODULE_ALIAS_CRYPTO("aegis256"); +MODULE_ALIAS_CRYPTO("aegis256-aesni"); diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 0420bab19efb..2ddbe3a1868b 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -364,5 +364,5 @@ module_exit(ghash_pclmulqdqni_mod_exit);  MODULE_LICENSE("GPL");  MODULE_DESCRIPTION("GHASH Message Digest Algorithm, " -		   "acclerated by PCLMULQDQ-NI"); +		   "accelerated by PCLMULQDQ-NI");  MODULE_ALIAS_CRYPTO("ghash"); diff --git a/arch/x86/crypto/morus1280-avx2-asm.S b/arch/x86/crypto/morus1280-avx2-asm.S new file mode 100644 index 000000000000..37d422e77931 --- /dev/null +++ b/arch/x86/crypto/morus1280-avx2-asm.S @@ -0,0 +1,621 @@ +/* + * AVX2 implementation of MORUS-1280 + * + * Copyright (c) 2017-2018 Ondrej Mosnacek <[email protected]> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. 
+ */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +#define SHUFFLE_MASK(i0, i1, i2, i3) \ +	(i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) + +#define MASK1 SHUFFLE_MASK(3, 0, 1, 2) +#define MASK2 SHUFFLE_MASK(2, 3, 0, 1) +#define MASK3 SHUFFLE_MASK(1, 2, 3, 0) + +#define STATE0		%ymm0 +#define STATE0_LOW	%xmm0 +#define STATE1		%ymm1 +#define STATE2		%ymm2 +#define STATE3		%ymm3 +#define STATE4		%ymm4 +#define KEY		%ymm5 +#define MSG		%ymm5 +#define MSG_LOW		%xmm5 +#define T0		%ymm6 +#define T0_LOW		%xmm6 +#define T1		%ymm7 + +.section .rodata.cst32.morus1280_const, "aM", @progbits, 32 +.align 32 +.Lmorus1280_const: +	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d +	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 +	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 +	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +.section .rodata.cst32.morus1280_counter, "aM", @progbits, 32 +.align 32 +.Lmorus1280_counter: +	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 +	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +	.byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 +	.byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f + +.text + +.macro morus1280_round s0, s1, s2, s3, s4, b, w +	vpand \s1, \s2, T0 +	vpxor T0, \s0, \s0 +	vpxor \s3, \s0, \s0 +	vpsllq $\b, \s0, T0 +	vpsrlq $(64 - \b), \s0, \s0 +	vpxor T0, \s0, \s0 +	vpermq $\w, \s3, \s3 +.endm + +/* + * __morus1280_update: internal ABI + * input: + *   STATE[0-4] - input state + *   MSG        - message block + * output: + *   STATE[0-4] - output state + * changed: + *   T0 + */ +__morus1280_update: +	morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1 +	vpxor MSG, STATE1, STATE1 +	morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2 +	vpxor MSG, STATE2, STATE2 +	morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3 +	vpxor MSG, STATE3, STATE3 +	morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2,  7, MASK2 +	vpxor MSG, STATE4, STATE4 +	morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3,  4, MASK1 +	ret +ENDPROC(__morus1280_update) + +/* + * __morus1280_update_zero: internal ABI + * input: + *   STATE[0-4] - input state + * output: + *   STATE[0-4] - output state + * changed: + *   T0 + */ +__morus1280_update_zero: +	morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1 +	morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2 +	morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3 +	morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2,  7, MASK2 +	morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3,  4, MASK1 +	ret +ENDPROC(__morus1280_update_zero) + +/* + * __load_partial: internal ABI + * input: + *   %rsi - src + *   %rcx - bytes + * output: + *   MSG  - message block + * changed: + *   %r8 + *   %r9 + */ +__load_partial: +	xor %r9, %r9 +	vpxor MSG, MSG, MSG + +	mov %rcx, %r8 +	and $0x1, %r8 +	jz .Lld_partial_1 + +	mov %rcx, %r8 +	and $0x1E, %r8 +	add %rsi, %r8 +	mov (%r8), %r9b + +.Lld_partial_1: +	mov %rcx, %r8 +	and $0x2, %r8 +	jz .Lld_partial_2 + +	mov %rcx, %r8 +	and $0x1C, %r8 +	add %rsi, %r8 +	shl $16, %r9 +	mov (%r8), %r9w + +.Lld_partial_2: +	mov %rcx, %r8 +	and $0x4, %r8 +	jz .Lld_partial_4 + +	mov %rcx, %r8 +	and $0x18, %r8 +	add %rsi, %r8 +	shl $32, %r9 +	mov (%r8), %r8d +	xor %r8, %r9 + +.Lld_partial_4: +	movq %r9, MSG_LOW + +	mov %rcx, %r8 +	and $0x8, %r8 +	jz .Lld_partial_8 + +	mov %rcx, %r8 +	and $0x10, %r8 +	add %rsi, %r8 +	pshufd $MASK2, MSG_LOW, MSG_LOW +	pinsrq $0, (%r8), MSG_LOW + +.Lld_partial_8: +	mov 
%rcx, %r8 +	and $0x10, %r8 +	jz .Lld_partial_16 + +	vpermq $MASK2, MSG, MSG +	movdqu (%rsi), MSG_LOW + +.Lld_partial_16: +	ret +ENDPROC(__load_partial) + +/* + * __store_partial: internal ABI + * input: + *   %rdx - dst + *   %rcx - bytes + * output: + *   T0   - message block + * changed: + *   %r8 + *   %r9 + *   %r10 + */ +__store_partial: +	mov %rcx, %r8 +	mov %rdx, %r9 + +	cmp $16, %r8 +	jl .Lst_partial_16 + +	movdqu T0_LOW, (%r9) +	vpermq $MASK2, T0, T0 + +	sub $16, %r8 +	add $16, %r9 + +.Lst_partial_16: +	movq T0_LOW, %r10 + +	cmp $8, %r8 +	jl .Lst_partial_8 + +	mov %r10, (%r9) +	pextrq $1, T0_LOW, %r10 + +	sub $8, %r8 +	add $8, %r9 + +.Lst_partial_8: +	cmp $4, %r8 +	jl .Lst_partial_4 + +	mov %r10d, (%r9) +	shr $32, %r10 + +	sub $4, %r8 +	add $4, %r9 + +.Lst_partial_4: +	cmp $2, %r8 +	jl .Lst_partial_2 + +	mov %r10w, (%r9) +	shr $16, %r10 + +	sub $2, %r8 +	add $2, %r9 + +.Lst_partial_2: +	cmp $1, %r8 +	jl .Lst_partial_1 + +	mov %r10b, (%r9) + +.Lst_partial_1: +	ret +ENDPROC(__store_partial) + +/* + * void crypto_morus1280_avx2_init(void *state, const void *key, + *                                 const void *iv); + */ +ENTRY(crypto_morus1280_avx2_init) +	FRAME_BEGIN + +	/* load IV: */ +	vpxor STATE0, STATE0, STATE0 +	movdqu (%rdx), STATE0_LOW +	/* load key: */ +	vmovdqu (%rsi), KEY +	vmovdqa KEY, STATE1 +	/* load all ones: */ +	vpcmpeqd STATE2, STATE2, STATE2 +	/* load all zeros: */ +	vpxor STATE3, STATE3, STATE3 +	/* load the constant: */ +	vmovdqa .Lmorus1280_const, STATE4 + +	/* update 16 times with zero: */ +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero + +	/* xor-in the key again after updates: */ +	vpxor KEY, STATE1, STATE1 + +	/* store the state: */ +	vmovdqu STATE0, (0 * 32)(%rdi) +	vmovdqu STATE1, (1 * 32)(%rdi) +	vmovdqu STATE2, (2 * 32)(%rdi) +	vmovdqu STATE3, (3 * 32)(%rdi) +	vmovdqu STATE4, (4 * 32)(%rdi) + +	FRAME_END +	ret +ENDPROC(crypto_morus1280_avx2_init) + +/* + * void crypto_morus1280_avx2_ad(void *state, const void *data, + *                               unsigned int length); + */ +ENTRY(crypto_morus1280_avx2_ad) +	FRAME_BEGIN + +	cmp $32, %rdx +	jb .Lad_out + +	/* load the state: */ +	vmovdqu (0 * 32)(%rdi), STATE0 +	vmovdqu (1 * 32)(%rdi), STATE1 +	vmovdqu (2 * 32)(%rdi), STATE2 +	vmovdqu (3 * 32)(%rdi), STATE3 +	vmovdqu (4 * 32)(%rdi), STATE4 + +	mov %rsi,  %r8 +	and $0x1F, %r8 +	jnz .Lad_u_loop + +.align 4 +.Lad_a_loop: +	vmovdqa (%rsi), MSG +	call __morus1280_update +	sub $32, %rdx +	add $32, %rsi +	cmp $32, %rdx +	jge .Lad_a_loop + +	jmp .Lad_cont +.align 4 +.Lad_u_loop: +	vmovdqu (%rsi), MSG +	call __morus1280_update +	sub $32, %rdx +	add $32, %rsi +	cmp $32, %rdx +	jge .Lad_u_loop + +.Lad_cont: +	/* store the state: */ +	vmovdqu STATE0, (0 * 32)(%rdi) +	vmovdqu STATE1, (1 * 32)(%rdi) +	vmovdqu STATE2, (2 * 32)(%rdi) +	vmovdqu STATE3, (3 * 32)(%rdi) +	vmovdqu STATE4, (4 * 32)(%rdi) + +.Lad_out: +	FRAME_END +	ret +ENDPROC(crypto_morus1280_avx2_ad) + +/* + * void crypto_morus1280_avx2_enc(void *state, const void *src, void *dst, + *                                unsigned int length); + */ 
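The encryption loops that follow compute, for each 32-byte block, ciphertext = plaintext XOR STATE0 XOR lane-rotated STATE1 XOR (STATE2 AND STATE3), and only then absorb the plaintext into the state. As a reading aid only, here is a rough C sketch of that per-block step; the struct and function names below are invented for this sketch and are not part of the kernel sources, and the __morus1280_update step is omitted.

#include <stdint.h>
#include <string.h>

/* Illustrative stand-in for one 256-bit MORUS-1280 state row (one ymm register). */
struct morus1280_row { uint64_t q[4]; };

/* Rotate the four 64-bit lanes by one position, as vpermq $MASK3 does in the loop below. */
static struct morus1280_row rotate_lanes(struct morus1280_row w)
{
	struct morus1280_row r = { { w.q[1], w.q[2], w.q[3], w.q[0] } };
	return r;
}

/*
 * One 32-byte encryption step from .Lenc_a_loop/.Lenc_u_loop:
 * c = m ^ s[0] ^ rotate_lanes(s[1]) ^ (s[2] & s[3]).
 * The state update that absorbs m is not shown here.
 */
static void morus1280_enc_block_sketch(const struct morus1280_row s[5],
				       const uint8_t *src, uint8_t *dst)
{
	struct morus1280_row m, s1r, c;
	int i;

	memcpy(&m, src, sizeof(m));
	s1r = rotate_lanes(s[1]);
	for (i = 0; i < 4; i++)
		c.q[i] = m.q[i] ^ s[0].q[i] ^ s1r.q[i] ^ (s[2].q[i] & s[3].q[i]);
	memcpy(dst, &c, sizeof(c));
}
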
+ENTRY(crypto_morus1280_avx2_enc) +	FRAME_BEGIN + +	cmp $32, %rcx +	jb .Lenc_out + +	/* load the state: */ +	vmovdqu (0 * 32)(%rdi), STATE0 +	vmovdqu (1 * 32)(%rdi), STATE1 +	vmovdqu (2 * 32)(%rdi), STATE2 +	vmovdqu (3 * 32)(%rdi), STATE3 +	vmovdqu (4 * 32)(%rdi), STATE4 + +	mov %rsi,  %r8 +	or  %rdx,  %r8 +	and $0x1F, %r8 +	jnz .Lenc_u_loop + +.align 4 +.Lenc_a_loop: +	vmovdqa (%rsi), MSG +	vmovdqa MSG, T0 +	vpxor STATE0, T0, T0 +	vpermq $MASK3, STATE1, T1 +	vpxor T1, T0, T0 +	vpand STATE2, STATE3, T1 +	vpxor T1, T0, T0 +	vmovdqa T0, (%rdx) + +	call __morus1280_update +	sub $32, %rcx +	add $32, %rsi +	add $32, %rdx +	cmp $32, %rcx +	jge .Lenc_a_loop + +	jmp .Lenc_cont +.align 4 +.Lenc_u_loop: +	vmovdqu (%rsi), MSG +	vmovdqa MSG, T0 +	vpxor STATE0, T0, T0 +	vpermq $MASK3, STATE1, T1 +	vpxor T1, T0, T0 +	vpand STATE2, STATE3, T1 +	vpxor T1, T0, T0 +	vmovdqu T0, (%rdx) + +	call __morus1280_update +	sub $32, %rcx +	add $32, %rsi +	add $32, %rdx +	cmp $32, %rcx +	jge .Lenc_u_loop + +.Lenc_cont: +	/* store the state: */ +	vmovdqu STATE0, (0 * 32)(%rdi) +	vmovdqu STATE1, (1 * 32)(%rdi) +	vmovdqu STATE2, (2 * 32)(%rdi) +	vmovdqu STATE3, (3 * 32)(%rdi) +	vmovdqu STATE4, (4 * 32)(%rdi) + +.Lenc_out: +	FRAME_END +	ret +ENDPROC(crypto_morus1280_avx2_enc) + +/* + * void crypto_morus1280_avx2_enc_tail(void *state, const void *src, void *dst, + *                                     unsigned int length); + */ +ENTRY(crypto_morus1280_avx2_enc_tail) +	FRAME_BEGIN + +	/* load the state: */ +	vmovdqu (0 * 32)(%rdi), STATE0 +	vmovdqu (1 * 32)(%rdi), STATE1 +	vmovdqu (2 * 32)(%rdi), STATE2 +	vmovdqu (3 * 32)(%rdi), STATE3 +	vmovdqu (4 * 32)(%rdi), STATE4 + +	/* encrypt message: */ +	call __load_partial + +	vmovdqa MSG, T0 +	vpxor STATE0, T0, T0 +	vpermq $MASK3, STATE1, T1 +	vpxor T1, T0, T0 +	vpand STATE2, STATE3, T1 +	vpxor T1, T0, T0 + +	call __store_partial + +	call __morus1280_update + +	/* store the state: */ +	vmovdqu STATE0, (0 * 32)(%rdi) +	vmovdqu STATE1, (1 * 32)(%rdi) +	vmovdqu STATE2, (2 * 32)(%rdi) +	vmovdqu STATE3, (3 * 32)(%rdi) +	vmovdqu STATE4, (4 * 32)(%rdi) + +	FRAME_END +ENDPROC(crypto_morus1280_avx2_enc_tail) + +/* + * void crypto_morus1280_avx2_dec(void *state, const void *src, void *dst, + *                                unsigned int length); + */ +ENTRY(crypto_morus1280_avx2_dec) +	FRAME_BEGIN + +	cmp $32, %rcx +	jb .Ldec_out + +	/* load the state: */ +	vmovdqu (0 * 32)(%rdi), STATE0 +	vmovdqu (1 * 32)(%rdi), STATE1 +	vmovdqu (2 * 32)(%rdi), STATE2 +	vmovdqu (3 * 32)(%rdi), STATE3 +	vmovdqu (4 * 32)(%rdi), STATE4 + +	mov %rsi,  %r8 +	or  %rdx,  %r8 +	and $0x1F, %r8 +	jnz .Ldec_u_loop + +.align 4 +.Ldec_a_loop: +	vmovdqa (%rsi), MSG +	vpxor STATE0, MSG, MSG +	vpermq $MASK3, STATE1, T0 +	vpxor T0, MSG, MSG +	vpand STATE2, STATE3, T0 +	vpxor T0, MSG, MSG +	vmovdqa MSG, (%rdx) + +	call __morus1280_update +	sub $32, %rcx +	add $32, %rsi +	add $32, %rdx +	cmp $32, %rcx +	jge .Ldec_a_loop + +	jmp .Ldec_cont +.align 4 +.Ldec_u_loop: +	vmovdqu (%rsi), MSG +	vpxor STATE0, MSG, MSG +	vpermq $MASK3, STATE1, T0 +	vpxor T0, MSG, MSG +	vpand STATE2, STATE3, T0 +	vpxor T0, MSG, MSG +	vmovdqu MSG, (%rdx) + +	call __morus1280_update +	sub $32, %rcx +	add $32, %rsi +	add $32, %rdx +	cmp $32, %rcx +	jge .Ldec_u_loop + +.Ldec_cont: +	/* store the state: */ +	vmovdqu STATE0, (0 * 32)(%rdi) +	vmovdqu STATE1, (1 * 32)(%rdi) +	vmovdqu STATE2, (2 * 32)(%rdi) +	vmovdqu STATE3, (3 * 32)(%rdi) +	vmovdqu STATE4, (4 * 32)(%rdi) + +.Ldec_out: +	FRAME_END +	ret +ENDPROC(crypto_morus1280_avx2_dec) + +/* + * void 
crypto_morus1280_avx2_dec_tail(void *state, const void *src, void *dst, + *                                     unsigned int length); + */ +ENTRY(crypto_morus1280_avx2_dec_tail) +	FRAME_BEGIN + +	/* load the state: */ +	vmovdqu (0 * 32)(%rdi), STATE0 +	vmovdqu (1 * 32)(%rdi), STATE1 +	vmovdqu (2 * 32)(%rdi), STATE2 +	vmovdqu (3 * 32)(%rdi), STATE3 +	vmovdqu (4 * 32)(%rdi), STATE4 + +	/* decrypt message: */ +	call __load_partial + +	vpxor STATE0, MSG, MSG +	vpermq $MASK3, STATE1, T0 +	vpxor T0, MSG, MSG +	vpand STATE2, STATE3, T0 +	vpxor T0, MSG, MSG +	vmovdqa MSG, T0 + +	call __store_partial + +	/* mask with byte count: */ +	movq %rcx, T0_LOW +	vpbroadcastb T0_LOW, T0 +	vmovdqa .Lmorus1280_counter, T1 +	vpcmpgtb T1, T0, T0 +	vpand T0, MSG, MSG + +	call __morus1280_update + +	/* store the state: */ +	vmovdqu STATE0, (0 * 32)(%rdi) +	vmovdqu STATE1, (1 * 32)(%rdi) +	vmovdqu STATE2, (2 * 32)(%rdi) +	vmovdqu STATE3, (3 * 32)(%rdi) +	vmovdqu STATE4, (4 * 32)(%rdi) + +	FRAME_END +	ret +ENDPROC(crypto_morus1280_avx2_dec_tail) + +/* + * void crypto_morus1280_avx2_final(void *state, void *tag_xor, + *                                  u64 assoclen, u64 cryptlen); + */ +ENTRY(crypto_morus1280_avx2_final) +	FRAME_BEGIN + +	/* load the state: */ +	vmovdqu (0 * 32)(%rdi), STATE0 +	vmovdqu (1 * 32)(%rdi), STATE1 +	vmovdqu (2 * 32)(%rdi), STATE2 +	vmovdqu (3 * 32)(%rdi), STATE3 +	vmovdqu (4 * 32)(%rdi), STATE4 + +	/* xor state[0] into state[4]: */ +	vpxor STATE0, STATE4, STATE4 + +	/* prepare length block: */ +	vpxor MSG, MSG, MSG +	vpinsrq $0, %rdx, MSG_LOW, MSG_LOW +	vpinsrq $1, %rcx, MSG_LOW, MSG_LOW +	vpsllq $3, MSG, MSG /* multiply by 8 (to get bit count) */ + +	/* update state: */ +	call __morus1280_update +	call __morus1280_update +	call __morus1280_update +	call __morus1280_update +	call __morus1280_update +	call __morus1280_update +	call __morus1280_update +	call __morus1280_update +	call __morus1280_update +	call __morus1280_update + +	/* xor tag: */ +	vmovdqu (%rsi), MSG + +	vpxor STATE0, MSG, MSG +	vpermq $MASK3, STATE1, T0 +	vpxor T0, MSG, MSG +	vpand STATE2, STATE3, T0 +	vpxor T0, MSG, MSG +	vmovdqu MSG, (%rsi) + +	FRAME_END +	ret +ENDPROC(crypto_morus1280_avx2_final) diff --git a/arch/x86/crypto/morus1280-avx2-glue.c b/arch/x86/crypto/morus1280-avx2-glue.c new file mode 100644 index 000000000000..f111f36d26dc --- /dev/null +++ b/arch/x86/crypto/morus1280-avx2-glue.c @@ -0,0 +1,68 @@ +/* + * The MORUS-1280 Authenticated-Encryption Algorithm + *   Glue for AVX2 implementation + * + * Copyright (c) 2016-2018 Ondrej Mosnacek <[email protected]> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ */ + +#include <crypto/internal/aead.h> +#include <crypto/morus1280_glue.h> +#include <linux/module.h> +#include <asm/fpu/api.h> +#include <asm/cpu_device_id.h> + +asmlinkage void crypto_morus1280_avx2_init(void *state, const void *key, +					   const void *iv); +asmlinkage void crypto_morus1280_avx2_ad(void *state, const void *data, +					 unsigned int length); + +asmlinkage void crypto_morus1280_avx2_enc(void *state, const void *src, +					  void *dst, unsigned int length); +asmlinkage void crypto_morus1280_avx2_dec(void *state, const void *src, +					  void *dst, unsigned int length); + +asmlinkage void crypto_morus1280_avx2_enc_tail(void *state, const void *src, +					       void *dst, unsigned int length); +asmlinkage void crypto_morus1280_avx2_dec_tail(void *state, const void *src, +					       void *dst, unsigned int length); + +asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor, +					    u64 assoclen, u64 cryptlen); + +MORUS1280_DECLARE_ALGS(avx2, "morus1280-avx2", 400); + +static const struct x86_cpu_id avx2_cpu_id[] = { +    X86_FEATURE_MATCH(X86_FEATURE_AVX2), +    {} +}; +MODULE_DEVICE_TABLE(x86cpu, avx2_cpu_id); + +static int __init crypto_morus1280_avx2_module_init(void) +{ +	if (!x86_match_cpu(avx2_cpu_id)) +		return -ENODEV; + +	return crypto_register_aeads(crypto_morus1280_avx2_algs, +				     ARRAY_SIZE(crypto_morus1280_avx2_algs)); +} + +static void __exit crypto_morus1280_avx2_module_exit(void) +{ +	crypto_unregister_aeads(crypto_morus1280_avx2_algs, +				ARRAY_SIZE(crypto_morus1280_avx2_algs)); +} + +module_init(crypto_morus1280_avx2_module_init); +module_exit(crypto_morus1280_avx2_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek <[email protected]>"); +MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- AVX2 implementation"); +MODULE_ALIAS_CRYPTO("morus1280"); +MODULE_ALIAS_CRYPTO("morus1280-avx2"); diff --git a/arch/x86/crypto/morus1280-sse2-asm.S b/arch/x86/crypto/morus1280-sse2-asm.S new file mode 100644 index 000000000000..1fe637c7be9d --- /dev/null +++ b/arch/x86/crypto/morus1280-sse2-asm.S @@ -0,0 +1,895 @@ +/* + * SSE2 implementation of MORUS-1280 + * + * Copyright (c) 2017-2018 Ondrej Mosnacek <[email protected]> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. 
+ */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +#define SHUFFLE_MASK(i0, i1, i2, i3) \ +	(i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) + +#define MASK2 SHUFFLE_MASK(2, 3, 0, 1) + +#define STATE0_LO	%xmm0 +#define STATE0_HI	%xmm1 +#define STATE1_LO	%xmm2 +#define STATE1_HI	%xmm3 +#define STATE2_LO	%xmm4 +#define STATE2_HI	%xmm5 +#define STATE3_LO	%xmm6 +#define STATE3_HI	%xmm7 +#define STATE4_LO	%xmm8 +#define STATE4_HI	%xmm9 +#define KEY_LO		%xmm10 +#define KEY_HI		%xmm11 +#define MSG_LO		%xmm10 +#define MSG_HI		%xmm11 +#define T0_LO		%xmm12 +#define T0_HI		%xmm13 +#define T1_LO		%xmm14 +#define T1_HI		%xmm15 + +.section .rodata.cst16.morus640_const, "aM", @progbits, 16 +.align 16 +.Lmorus640_const_0: +	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d +	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 +.Lmorus640_const_1: +	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 +	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +.section .rodata.cst16.morus640_counter, "aM", @progbits, 16 +.align 16 +.Lmorus640_counter_0: +	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 +	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +.Lmorus640_counter_1: +	.byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 +	.byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f + +.text + +.macro rol1 hi, lo +	/* +	 * HI_1 | HI_0 || LO_1 | LO_0 +	 *  ==> +	 * HI_0 | HI_1 || LO_1 | LO_0 +	 *  ==> +	 * HI_0 | LO_1 || LO_0 | HI_1 +	 */ +	pshufd $MASK2, \hi, \hi +	movdqa \hi, T0_LO +	punpcklqdq \lo, T0_LO +	punpckhqdq \hi, \lo +	movdqa \lo, \hi +	movdqa T0_LO, \lo +.endm + +.macro rol2 hi, lo +	movdqa \lo, T0_LO +	movdqa \hi, \lo +	movdqa T0_LO, \hi +.endm + +.macro rol3 hi, lo +	/* +	 * HI_1 | HI_0 || LO_1 | LO_0 +	 *  ==> +	 * HI_0 | HI_1 || LO_1 | LO_0 +	 *  ==> +	 * LO_0 | HI_1 || HI_0 | LO_1 +	 */ +	pshufd $MASK2, \hi, \hi +	movdqa \lo, T0_LO +	punpckhqdq \hi, T0_LO +	punpcklqdq \lo, \hi +	movdqa T0_LO, \lo +.endm + +.macro morus1280_round s0_l, s0_h, s1_l, s1_h, s2_l, s2_h, s3_l, s3_h, s4_l, s4_h, b, w +	movdqa \s1_l, T0_LO +	pand \s2_l, T0_LO +	pxor T0_LO, \s0_l + +	movdqa \s1_h, T0_LO +	pand \s2_h, T0_LO +	pxor T0_LO, \s0_h + +	pxor \s3_l, \s0_l +	pxor \s3_h, \s0_h + +	movdqa \s0_l, T0_LO +	psllq $\b, T0_LO +	psrlq $(64 - \b), \s0_l +	pxor T0_LO, \s0_l + +	movdqa \s0_h, T0_LO +	psllq $\b, T0_LO +	psrlq $(64 - \b), \s0_h +	pxor T0_LO, \s0_h + +	\w \s3_h, \s3_l +.endm + +/* + * __morus1280_update: internal ABI + * input: + *   STATE[0-4] - input state + *   MSG        - message block + * output: + *   STATE[0-4] - output state + * changed: + *   T0 + */ +__morus1280_update: +	morus1280_round \ +		STATE0_LO, STATE0_HI, \ +		STATE1_LO, STATE1_HI, \ +		STATE2_LO, STATE2_HI, \ +		STATE3_LO, STATE3_HI, \ +		STATE4_LO, STATE4_HI, \ +		13, rol1 +	pxor MSG_LO, STATE1_LO +	pxor MSG_HI, STATE1_HI +	morus1280_round \ +		STATE1_LO, STATE1_HI, \ +		STATE2_LO, STATE2_HI, \ +		STATE3_LO, STATE3_HI, \ +		STATE4_LO, STATE4_HI, \ +		STATE0_LO, STATE0_HI, \ +		46, rol2 +	pxor MSG_LO, STATE2_LO +	pxor MSG_HI, STATE2_HI +	morus1280_round \ +		STATE2_LO, STATE2_HI, \ +		STATE3_LO, STATE3_HI, \ +		STATE4_LO, STATE4_HI, \ +		STATE0_LO, STATE0_HI, \ +		STATE1_LO, STATE1_HI, \ +		38, rol3 +	pxor MSG_LO, STATE3_LO +	pxor MSG_HI, STATE3_HI +	morus1280_round \ +		STATE3_LO, STATE3_HI, \ +		STATE4_LO, STATE4_HI, \ +		STATE0_LO, STATE0_HI, \ +		STATE1_LO, STATE1_HI, \ +		STATE2_LO, STATE2_HI, \ +		7, rol2 +	pxor MSG_LO, STATE4_LO +	pxor MSG_HI, STATE4_HI +	morus1280_round \ +		STATE4_LO, STATE4_HI, \ +		STATE0_LO, 
STATE0_HI, \ +		STATE1_LO, STATE1_HI, \ +		STATE2_LO, STATE2_HI, \ +		STATE3_LO, STATE3_HI, \ +		4, rol1 +	ret +ENDPROC(__morus1280_update) + +/* + * __morus1280_update_zero: internal ABI + * input: + *   STATE[0-4] - input state + * output: + *   STATE[0-4] - output state + * changed: + *   T0 + */ +__morus1280_update_zero: +	morus1280_round \ +		STATE0_LO, STATE0_HI, \ +		STATE1_LO, STATE1_HI, \ +		STATE2_LO, STATE2_HI, \ +		STATE3_LO, STATE3_HI, \ +		STATE4_LO, STATE4_HI, \ +		13, rol1 +	morus1280_round \ +		STATE1_LO, STATE1_HI, \ +		STATE2_LO, STATE2_HI, \ +		STATE3_LO, STATE3_HI, \ +		STATE4_LO, STATE4_HI, \ +		STATE0_LO, STATE0_HI, \ +		46, rol2 +	morus1280_round \ +		STATE2_LO, STATE2_HI, \ +		STATE3_LO, STATE3_HI, \ +		STATE4_LO, STATE4_HI, \ +		STATE0_LO, STATE0_HI, \ +		STATE1_LO, STATE1_HI, \ +		38, rol3 +	morus1280_round \ +		STATE3_LO, STATE3_HI, \ +		STATE4_LO, STATE4_HI, \ +		STATE0_LO, STATE0_HI, \ +		STATE1_LO, STATE1_HI, \ +		STATE2_LO, STATE2_HI, \ +		7, rol2 +	morus1280_round \ +		STATE4_LO, STATE4_HI, \ +		STATE0_LO, STATE0_HI, \ +		STATE1_LO, STATE1_HI, \ +		STATE2_LO, STATE2_HI, \ +		STATE3_LO, STATE3_HI, \ +		4, rol1 +	ret +ENDPROC(__morus1280_update_zero) + +/* + * __load_partial: internal ABI + * input: + *   %rsi - src + *   %rcx - bytes + * output: + *   MSG  - message block + * changed: + *   %r8 + *   %r9 + */ +__load_partial: +	xor %r9, %r9 +	pxor MSG_LO, MSG_LO +	pxor MSG_HI, MSG_HI + +	mov %rcx, %r8 +	and $0x1, %r8 +	jz .Lld_partial_1 + +	mov %rcx, %r8 +	and $0x1E, %r8 +	add %rsi, %r8 +	mov (%r8), %r9b + +.Lld_partial_1: +	mov %rcx, %r8 +	and $0x2, %r8 +	jz .Lld_partial_2 + +	mov %rcx, %r8 +	and $0x1C, %r8 +	add %rsi, %r8 +	shl $16, %r9 +	mov (%r8), %r9w + +.Lld_partial_2: +	mov %rcx, %r8 +	and $0x4, %r8 +	jz .Lld_partial_4 + +	mov %rcx, %r8 +	and $0x18, %r8 +	add %rsi, %r8 +	shl $32, %r9 +	mov (%r8), %r8d +	xor %r8, %r9 + +.Lld_partial_4: +	movq %r9, MSG_LO + +	mov %rcx, %r8 +	and $0x8, %r8 +	jz .Lld_partial_8 + +	mov %rcx, %r8 +	and $0x10, %r8 +	add %rsi, %r8 +	pslldq $8, MSG_LO +	movq (%r8), T0_LO +	pxor T0_LO, MSG_LO + +.Lld_partial_8: +	mov %rcx, %r8 +	and $0x10, %r8 +	jz .Lld_partial_16 + +	movdqa MSG_LO, MSG_HI +	movdqu (%rsi), MSG_LO + +.Lld_partial_16: +	ret +ENDPROC(__load_partial) + +/* + * __store_partial: internal ABI + * input: + *   %rdx - dst + *   %rcx - bytes + * output: + *   T0   - message block + * changed: + *   %r8 + *   %r9 + *   %r10 + */ +__store_partial: +	mov %rcx, %r8 +	mov %rdx, %r9 + +	cmp $16, %r8 +	jl .Lst_partial_16 + +	movdqu T0_LO, (%r9) +	movdqa T0_HI, T0_LO + +	sub $16, %r8 +	add $16, %r9 + +.Lst_partial_16: +	movq T0_LO, %r10 + +	cmp $8, %r8 +	jl .Lst_partial_8 + +	mov %r10, (%r9) +	psrldq $8, T0_LO +	movq T0_LO, %r10 + +	sub $8, %r8 +	add $8, %r9 + +.Lst_partial_8: +	cmp $4, %r8 +	jl .Lst_partial_4 + +	mov %r10d, (%r9) +	shr $32, %r10 + +	sub $4, %r8 +	add $4, %r9 + +.Lst_partial_4: +	cmp $2, %r8 +	jl .Lst_partial_2 + +	mov %r10w, (%r9) +	shr $16, %r10 + +	sub $2, %r8 +	add $2, %r9 + +.Lst_partial_2: +	cmp $1, %r8 +	jl .Lst_partial_1 + +	mov %r10b, (%r9) + +.Lst_partial_1: +	ret +ENDPROC(__store_partial) + +/* + * void crypto_morus1280_sse2_init(void *state, const void *key, + *                                 const void *iv); + */ +ENTRY(crypto_morus1280_sse2_init) +	FRAME_BEGIN + +	/* load IV: */ +	pxor STATE0_HI, STATE0_HI +	movdqu (%rdx), STATE0_LO +	/* load key: */ +	movdqu  0(%rsi), KEY_LO +	movdqu 16(%rsi), KEY_HI +	movdqa KEY_LO, STATE1_LO +	movdqa KEY_HI, STATE1_HI +	/* load all ones: */ +	pcmpeqd STATE2_LO, 
STATE2_LO +	pcmpeqd STATE2_HI, STATE2_HI +	/* load all zeros: */ +	pxor STATE3_LO, STATE3_LO +	pxor STATE3_HI, STATE3_HI +	/* load the constant: */ +	movdqa .Lmorus640_const_0, STATE4_LO +	movdqa .Lmorus640_const_1, STATE4_HI + +	/* update 16 times with zero: */ +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero +	call __morus1280_update_zero + +	/* xor-in the key again after updates: */ +	pxor KEY_LO, STATE1_LO +	pxor KEY_HI, STATE1_HI + +	/* store the state: */ +	movdqu STATE0_LO, (0 * 16)(%rdi) +	movdqu STATE0_HI, (1 * 16)(%rdi) +	movdqu STATE1_LO, (2 * 16)(%rdi) +	movdqu STATE1_HI, (3 * 16)(%rdi) +	movdqu STATE2_LO, (4 * 16)(%rdi) +	movdqu STATE2_HI, (5 * 16)(%rdi) +	movdqu STATE3_LO, (6 * 16)(%rdi) +	movdqu STATE3_HI, (7 * 16)(%rdi) +	movdqu STATE4_LO, (8 * 16)(%rdi) +	movdqu STATE4_HI, (9 * 16)(%rdi) + +	FRAME_END +	ret +ENDPROC(crypto_morus1280_sse2_init) + +/* + * void crypto_morus1280_sse2_ad(void *state, const void *data, + *                               unsigned int length); + */ +ENTRY(crypto_morus1280_sse2_ad) +	FRAME_BEGIN + +	cmp $32, %rdx +	jb .Lad_out + +	/* load the state: */ +	movdqu (0 * 16)(%rdi), STATE0_LO +	movdqu (1 * 16)(%rdi), STATE0_HI +	movdqu (2 * 16)(%rdi), STATE1_LO +	movdqu (3 * 16)(%rdi), STATE1_HI +	movdqu (4 * 16)(%rdi), STATE2_LO +	movdqu (5 * 16)(%rdi), STATE2_HI +	movdqu (6 * 16)(%rdi), STATE3_LO +	movdqu (7 * 16)(%rdi), STATE3_HI +	movdqu (8 * 16)(%rdi), STATE4_LO +	movdqu (9 * 16)(%rdi), STATE4_HI + +	mov %rsi, %r8 +	and $0xF, %r8 +	jnz .Lad_u_loop + +.align 4 +.Lad_a_loop: +	movdqa  0(%rsi), MSG_LO +	movdqa 16(%rsi), MSG_HI +	call __morus1280_update +	sub $32, %rdx +	add $32, %rsi +	cmp $32, %rdx +	jge .Lad_a_loop + +	jmp .Lad_cont +.align 4 +.Lad_u_loop: +	movdqu  0(%rsi), MSG_LO +	movdqu 16(%rsi), MSG_HI +	call __morus1280_update +	sub $32, %rdx +	add $32, %rsi +	cmp $32, %rdx +	jge .Lad_u_loop + +.Lad_cont: +	/* store the state: */ +	movdqu STATE0_LO, (0 * 16)(%rdi) +	movdqu STATE0_HI, (1 * 16)(%rdi) +	movdqu STATE1_LO, (2 * 16)(%rdi) +	movdqu STATE1_HI, (3 * 16)(%rdi) +	movdqu STATE2_LO, (4 * 16)(%rdi) +	movdqu STATE2_HI, (5 * 16)(%rdi) +	movdqu STATE3_LO, (6 * 16)(%rdi) +	movdqu STATE3_HI, (7 * 16)(%rdi) +	movdqu STATE4_LO, (8 * 16)(%rdi) +	movdqu STATE4_HI, (9 * 16)(%rdi) + +.Lad_out: +	FRAME_END +	ret +ENDPROC(crypto_morus1280_sse2_ad) + +/* + * void crypto_morus1280_sse2_enc(void *state, const void *src, void *dst, + *                                unsigned int length); + */ +ENTRY(crypto_morus1280_sse2_enc) +	FRAME_BEGIN + +	cmp $32, %rcx +	jb .Lenc_out + +	/* load the state: */ +	movdqu (0 * 16)(%rdi), STATE0_LO +	movdqu (1 * 16)(%rdi), STATE0_HI +	movdqu (2 * 16)(%rdi), STATE1_LO +	movdqu (3 * 16)(%rdi), STATE1_HI +	movdqu (4 * 16)(%rdi), STATE2_LO +	movdqu (5 * 16)(%rdi), STATE2_HI +	movdqu (6 * 16)(%rdi), STATE3_LO +	movdqu (7 * 16)(%rdi), STATE3_HI +	movdqu (8 * 16)(%rdi), STATE4_LO +	movdqu (9 * 16)(%rdi), STATE4_HI + +	mov %rsi, %r8 +	or  %rdx, %r8 +	and $0xF, %r8 +	jnz .Lenc_u_loop + +.align 4 +.Lenc_a_loop: +	movdqa  0(%rsi), MSG_LO +	movdqa 16(%rsi), MSG_HI +	movdqa STATE1_LO, T1_LO +	movdqa STATE1_HI, T1_HI +	rol3 T1_HI, 
T1_LO +	movdqa MSG_LO, T0_LO +	movdqa MSG_HI, T0_HI +	pxor T1_LO, T0_LO +	pxor T1_HI, T0_HI +	pxor STATE0_LO, T0_LO +	pxor STATE0_HI, T0_HI +	movdqa STATE2_LO, T1_LO +	movdqa STATE2_HI, T1_HI +	pand STATE3_LO, T1_LO +	pand STATE3_HI, T1_HI +	pxor T1_LO, T0_LO +	pxor T1_HI, T0_HI +	movdqa T0_LO,  0(%rdx) +	movdqa T0_HI, 16(%rdx) + +	call __morus1280_update +	sub $32, %rcx +	add $32, %rsi +	add $32, %rdx +	cmp $32, %rcx +	jge .Lenc_a_loop + +	jmp .Lenc_cont +.align 4 +.Lenc_u_loop: +	movdqu  0(%rsi), MSG_LO +	movdqu 16(%rsi), MSG_HI +	movdqa STATE1_LO, T1_LO +	movdqa STATE1_HI, T1_HI +	rol3 T1_HI, T1_LO +	movdqa MSG_LO, T0_LO +	movdqa MSG_HI, T0_HI +	pxor T1_LO, T0_LO +	pxor T1_HI, T0_HI +	pxor STATE0_LO, T0_LO +	pxor STATE0_HI, T0_HI +	movdqa STATE2_LO, T1_LO +	movdqa STATE2_HI, T1_HI +	pand STATE3_LO, T1_LO +	pand STATE3_HI, T1_HI +	pxor T1_LO, T0_LO +	pxor T1_HI, T0_HI +	movdqu T0_LO,  0(%rdx) +	movdqu T0_HI, 16(%rdx) + +	call __morus1280_update +	sub $32, %rcx +	add $32, %rsi +	add $32, %rdx +	cmp $32, %rcx +	jge .Lenc_u_loop + +.Lenc_cont: +	/* store the state: */ +	movdqu STATE0_LO, (0 * 16)(%rdi) +	movdqu STATE0_HI, (1 * 16)(%rdi) +	movdqu STATE1_LO, (2 * 16)(%rdi) +	movdqu STATE1_HI, (3 * 16)(%rdi) +	movdqu STATE2_LO, (4 * 16)(%rdi) +	movdqu STATE2_HI, (5 * 16)(%rdi) +	movdqu STATE3_LO, (6 * 16)(%rdi) +	movdqu STATE3_HI, (7 * 16)(%rdi) +	movdqu STATE4_LO, (8 * 16)(%rdi) +	movdqu STATE4_HI, (9 * 16)(%rdi) + +.Lenc_out: +	FRAME_END +	ret +ENDPROC(crypto_morus1280_sse2_enc) + +/* + * void crypto_morus1280_sse2_enc_tail(void *state, const void *src, void *dst, + *                                     unsigned int length); + */ +ENTRY(crypto_morus1280_sse2_enc_tail) +	FRAME_BEGIN + +	/* load the state: */ +	movdqu (0 * 16)(%rdi), STATE0_LO +	movdqu (1 * 16)(%rdi), STATE0_HI +	movdqu (2 * 16)(%rdi), STATE1_LO +	movdqu (3 * 16)(%rdi), STATE1_HI +	movdqu (4 * 16)(%rdi), STATE2_LO +	movdqu (5 * 16)(%rdi), STATE2_HI +	movdqu (6 * 16)(%rdi), STATE3_LO +	movdqu (7 * 16)(%rdi), STATE3_HI +	movdqu (8 * 16)(%rdi), STATE4_LO +	movdqu (9 * 16)(%rdi), STATE4_HI + +	/* encrypt message: */ +	call __load_partial + +	movdqa STATE1_LO, T1_LO +	movdqa STATE1_HI, T1_HI +	rol3 T1_HI, T1_LO +	movdqa MSG_LO, T0_LO +	movdqa MSG_HI, T0_HI +	pxor T1_LO, T0_LO +	pxor T1_HI, T0_HI +	pxor STATE0_LO, T0_LO +	pxor STATE0_HI, T0_HI +	movdqa STATE2_LO, T1_LO +	movdqa STATE2_HI, T1_HI +	pand STATE3_LO, T1_LO +	pand STATE3_HI, T1_HI +	pxor T1_LO, T0_LO +	pxor T1_HI, T0_HI + +	call __store_partial + +	call __morus1280_update + +	/* store the state: */ +	movdqu STATE0_LO, (0 * 16)(%rdi) +	movdqu STATE0_HI, (1 * 16)(%rdi) +	movdqu STATE1_LO, (2 * 16)(%rdi) +	movdqu STATE1_HI, (3 * 16)(%rdi) +	movdqu STATE2_LO, (4 * 16)(%rdi) +	movdqu STATE2_HI, (5 * 16)(%rdi) +	movdqu STATE3_LO, (6 * 16)(%rdi) +	movdqu STATE3_HI, (7 * 16)(%rdi) +	movdqu STATE4_LO, (8 * 16)(%rdi) +	movdqu STATE4_HI, (9 * 16)(%rdi) + +	FRAME_END +ENDPROC(crypto_morus1280_sse2_enc_tail) + +/* + * void crypto_morus1280_sse2_dec(void *state, const void *src, void *dst, + *                                unsigned int length); + */ +ENTRY(crypto_morus1280_sse2_dec) +	FRAME_BEGIN + +	cmp $32, %rcx +	jb .Ldec_out + +	/* load the state: */ +	movdqu (0 * 16)(%rdi), STATE0_LO +	movdqu (1 * 16)(%rdi), STATE0_HI +	movdqu (2 * 16)(%rdi), STATE1_LO +	movdqu (3 * 16)(%rdi), STATE1_HI +	movdqu (4 * 16)(%rdi), STATE2_LO +	movdqu (5 * 16)(%rdi), STATE2_HI +	movdqu (6 * 16)(%rdi), STATE3_LO +	movdqu (7 * 16)(%rdi), STATE3_HI +	movdqu (8 * 16)(%rdi), STATE4_LO +	movdqu (9 * 
16)(%rdi), STATE4_HI + +	mov %rsi, %r8 +	or  %rdx, %r8 +	and $0xF, %r8 +	jnz .Ldec_u_loop + +.align 4 +.Ldec_a_loop: +	movdqa  0(%rsi), MSG_LO +	movdqa 16(%rsi), MSG_HI +	pxor STATE0_LO, MSG_LO +	pxor STATE0_HI, MSG_HI +	movdqa STATE1_LO, T1_LO +	movdqa STATE1_HI, T1_HI +	rol3 T1_HI, T1_LO +	pxor T1_LO, MSG_LO +	pxor T1_HI, MSG_HI +	movdqa STATE2_LO, T1_LO +	movdqa STATE2_HI, T1_HI +	pand STATE3_LO, T1_LO +	pand STATE3_HI, T1_HI +	pxor T1_LO, MSG_LO +	pxor T1_HI, MSG_HI +	movdqa MSG_LO,  0(%rdx) +	movdqa MSG_HI, 16(%rdx) + +	call __morus1280_update +	sub $32, %rcx +	add $32, %rsi +	add $32, %rdx +	cmp $32, %rcx +	jge .Ldec_a_loop + +	jmp .Ldec_cont +.align 4 +.Ldec_u_loop: +	movdqu  0(%rsi), MSG_LO +	movdqu 16(%rsi), MSG_HI +	pxor STATE0_LO, MSG_LO +	pxor STATE0_HI, MSG_HI +	movdqa STATE1_LO, T1_LO +	movdqa STATE1_HI, T1_HI +	rol3 T1_HI, T1_LO +	pxor T1_LO, MSG_LO +	pxor T1_HI, MSG_HI +	movdqa STATE2_LO, T1_LO +	movdqa STATE2_HI, T1_HI +	pand STATE3_LO, T1_LO +	pand STATE3_HI, T1_HI +	pxor T1_LO, MSG_LO +	pxor T1_HI, MSG_HI +	movdqu MSG_LO,  0(%rdx) +	movdqu MSG_HI, 16(%rdx) + +	call __morus1280_update +	sub $32, %rcx +	add $32, %rsi +	add $32, %rdx +	cmp $32, %rcx +	jge .Ldec_u_loop + +.Ldec_cont: +	/* store the state: */ +	movdqu STATE0_LO, (0 * 16)(%rdi) +	movdqu STATE0_HI, (1 * 16)(%rdi) +	movdqu STATE1_LO, (2 * 16)(%rdi) +	movdqu STATE1_HI, (3 * 16)(%rdi) +	movdqu STATE2_LO, (4 * 16)(%rdi) +	movdqu STATE2_HI, (5 * 16)(%rdi) +	movdqu STATE3_LO, (6 * 16)(%rdi) +	movdqu STATE3_HI, (7 * 16)(%rdi) +	movdqu STATE4_LO, (8 * 16)(%rdi) +	movdqu STATE4_HI, (9 * 16)(%rdi) + +.Ldec_out: +	FRAME_END +	ret +ENDPROC(crypto_morus1280_sse2_dec) + +/* + * void crypto_morus1280_sse2_dec_tail(void *state, const void *src, void *dst, + *                                     unsigned int length); + */ +ENTRY(crypto_morus1280_sse2_dec_tail) +	FRAME_BEGIN + +	/* load the state: */ +	movdqu (0 * 16)(%rdi), STATE0_LO +	movdqu (1 * 16)(%rdi), STATE0_HI +	movdqu (2 * 16)(%rdi), STATE1_LO +	movdqu (3 * 16)(%rdi), STATE1_HI +	movdqu (4 * 16)(%rdi), STATE2_LO +	movdqu (5 * 16)(%rdi), STATE2_HI +	movdqu (6 * 16)(%rdi), STATE3_LO +	movdqu (7 * 16)(%rdi), STATE3_HI +	movdqu (8 * 16)(%rdi), STATE4_LO +	movdqu (9 * 16)(%rdi), STATE4_HI + +	/* decrypt message: */ +	call __load_partial + +	pxor STATE0_LO, MSG_LO +	pxor STATE0_HI, MSG_HI +	movdqa STATE1_LO, T1_LO +	movdqa STATE1_HI, T1_HI +	rol3 T1_HI, T1_LO +	pxor T1_LO, MSG_LO +	pxor T1_HI, MSG_HI +	movdqa STATE2_LO, T1_LO +	movdqa STATE2_HI, T1_HI +	pand STATE3_LO, T1_LO +	pand STATE3_HI, T1_HI +	pxor T1_LO, MSG_LO +	pxor T1_HI, MSG_HI +	movdqa MSG_LO, T0_LO +	movdqa MSG_HI, T0_HI + +	call __store_partial + +	/* mask with byte count: */ +	movq %rcx, T0_LO +	punpcklbw T0_LO, T0_LO +	punpcklbw T0_LO, T0_LO +	punpcklbw T0_LO, T0_LO +	punpcklbw T0_LO, T0_LO +	movdqa T0_LO, T0_HI +	movdqa .Lmorus640_counter_0, T1_LO +	movdqa .Lmorus640_counter_1, T1_HI +	pcmpgtb T1_LO, T0_LO +	pcmpgtb T1_HI, T0_HI +	pand T0_LO, MSG_LO +	pand T0_HI, MSG_HI + +	call __morus1280_update + +	/* store the state: */ +	movdqu STATE0_LO, (0 * 16)(%rdi) +	movdqu STATE0_HI, (1 * 16)(%rdi) +	movdqu STATE1_LO, (2 * 16)(%rdi) +	movdqu STATE1_HI, (3 * 16)(%rdi) +	movdqu STATE2_LO, (4 * 16)(%rdi) +	movdqu STATE2_HI, (5 * 16)(%rdi) +	movdqu STATE3_LO, (6 * 16)(%rdi) +	movdqu STATE3_HI, (7 * 16)(%rdi) +	movdqu STATE4_LO, (8 * 16)(%rdi) +	movdqu STATE4_HI, (9 * 16)(%rdi) + +	FRAME_END +	ret +ENDPROC(crypto_morus1280_sse2_dec_tail) + +/* + * void crypto_morus1280_sse2_final(void *state, void *tag_xor, + *        
                          u64 assoclen, u64 cryptlen); + */ +ENTRY(crypto_morus1280_sse2_final) +	FRAME_BEGIN + +	/* load the state: */ +	movdqu (0 * 16)(%rdi), STATE0_LO +	movdqu (1 * 16)(%rdi), STATE0_HI +	movdqu (2 * 16)(%rdi), STATE1_LO +	movdqu (3 * 16)(%rdi), STATE1_HI +	movdqu (4 * 16)(%rdi), STATE2_LO +	movdqu (5 * 16)(%rdi), STATE2_HI +	movdqu (6 * 16)(%rdi), STATE3_LO +	movdqu (7 * 16)(%rdi), STATE3_HI +	movdqu (8 * 16)(%rdi), STATE4_LO +	movdqu (9 * 16)(%rdi), STATE4_HI + +	/* xor state[0] into state[4]: */ +	pxor STATE0_LO, STATE4_LO +	pxor STATE0_HI, STATE4_HI + +	/* prepare length block: */ +	movq %rdx, MSG_LO +	movq %rcx, T0_LO +	pslldq $8, T0_LO +	pxor T0_LO, MSG_LO +	psllq $3, MSG_LO /* multiply by 8 (to get bit count) */ +	pxor MSG_HI, MSG_HI + +	/* update state: */ +	call __morus1280_update +	call __morus1280_update +	call __morus1280_update +	call __morus1280_update +	call __morus1280_update +	call __morus1280_update +	call __morus1280_update +	call __morus1280_update +	call __morus1280_update +	call __morus1280_update + +	/* xor tag: */ +	movdqu  0(%rsi), MSG_LO +	movdqu 16(%rsi), MSG_HI + +	pxor STATE0_LO, MSG_LO +	pxor STATE0_HI, MSG_HI +	movdqa STATE1_LO, T0_LO +	movdqa STATE1_HI, T0_HI +	rol3 T0_HI, T0_LO +	pxor T0_LO, MSG_LO +	pxor T0_HI, MSG_HI +	movdqa STATE2_LO, T0_LO +	movdqa STATE2_HI, T0_HI +	pand STATE3_LO, T0_LO +	pand STATE3_HI, T0_HI +	pxor T0_LO, MSG_LO +	pxor T0_HI, MSG_HI + +	movdqu MSG_LO,  0(%rsi) +	movdqu MSG_HI, 16(%rsi) + +	FRAME_END +	ret +ENDPROC(crypto_morus1280_sse2_final) diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c new file mode 100644 index 000000000000..839270aa713c --- /dev/null +++ b/arch/x86/crypto/morus1280-sse2-glue.c @@ -0,0 +1,68 @@ +/* + * The MORUS-1280 Authenticated-Encryption Algorithm + *   Glue for SSE2 implementation + * + * Copyright (c) 2016-2018 Ondrej Mosnacek <[email protected]> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ */ + +#include <crypto/internal/aead.h> +#include <crypto/morus1280_glue.h> +#include <linux/module.h> +#include <asm/fpu/api.h> +#include <asm/cpu_device_id.h> + +asmlinkage void crypto_morus1280_sse2_init(void *state, const void *key, +					   const void *iv); +asmlinkage void crypto_morus1280_sse2_ad(void *state, const void *data, +					 unsigned int length); + +asmlinkage void crypto_morus1280_sse2_enc(void *state, const void *src, +					  void *dst, unsigned int length); +asmlinkage void crypto_morus1280_sse2_dec(void *state, const void *src, +					  void *dst, unsigned int length); + +asmlinkage void crypto_morus1280_sse2_enc_tail(void *state, const void *src, +					       void *dst, unsigned int length); +asmlinkage void crypto_morus1280_sse2_dec_tail(void *state, const void *src, +					       void *dst, unsigned int length); + +asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor, +					    u64 assoclen, u64 cryptlen); + +MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350); + +static const struct x86_cpu_id sse2_cpu_id[] = { +    X86_FEATURE_MATCH(X86_FEATURE_XMM2), +    {} +}; +MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id); + +static int __init crypto_morus1280_sse2_module_init(void) +{ +	if (!x86_match_cpu(sse2_cpu_id)) +		return -ENODEV; + +	return crypto_register_aeads(crypto_morus1280_sse2_algs, +				     ARRAY_SIZE(crypto_morus1280_sse2_algs)); +} + +static void __exit crypto_morus1280_sse2_module_exit(void) +{ +	crypto_unregister_aeads(crypto_morus1280_sse2_algs, +				ARRAY_SIZE(crypto_morus1280_sse2_algs)); +} + +module_init(crypto_morus1280_sse2_module_init); +module_exit(crypto_morus1280_sse2_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek <[email protected]>"); +MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- SSE2 implementation"); +MODULE_ALIAS_CRYPTO("morus1280"); +MODULE_ALIAS_CRYPTO("morus1280-sse2"); diff --git a/arch/x86/crypto/morus1280_glue.c b/arch/x86/crypto/morus1280_glue.c new file mode 100644 index 000000000000..0dccdda1eb3a --- /dev/null +++ b/arch/x86/crypto/morus1280_glue.c @@ -0,0 +1,302 @@ +/* + * The MORUS-1280 Authenticated-Encryption Algorithm + *   Common x86 SIMD glue skeleton + * + * Copyright (c) 2016-2018 Ondrej Mosnacek <[email protected]> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
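The module above registers the "morus1280" / "morus1280-sse2" AEADs only when SSE2 is available (the X86_FEATURE_XMM2 match). A rough usage sketch of the registered algorithm through the normal kernel AEAD API follows; it is illustrative only and not part of the patch, and the 16-byte nonce and 16-byte tag are the MORUS parameters rather than anything stated in this hunk:

#include <crypto/aead.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/scatterlist.h>

/* Illustrative in-place encryption of ptlen bytes in buf; buf must also have
 * room for the 16-byte tag that encryption appends. */
static int morus1280_encrypt_example(const u8 *key, unsigned int keylen,
				     u8 *iv, u8 *buf, unsigned int ptlen)
{
	DECLARE_CRYPTO_WAIT(wait);
	struct crypto_aead *tfm;
	struct aead_request *req;
	struct scatterlist sg;
	int err;

	tfm = crypto_alloc_aead("morus1280", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_aead_setkey(tfm, key, keylen);	/* 16 or 32 bytes */
	if (err)
		goto out_free_tfm;
	err = crypto_aead_setauthsize(tfm, 16);
	if (err)
		goto out_free_tfm;

	req = aead_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	sg_init_one(&sg, buf, ptlen + 16);
	aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
				  crypto_req_done, &wait);
	aead_request_set_ad(req, 0);			/* no associated data */
	aead_request_set_crypt(req, &sg, &sg, ptlen, iv);	/* 16-byte IV */

	err = crypto_wait_req(crypto_aead_encrypt(req), &wait);

	aead_request_free(req);
out_free_tfm:
	crypto_free_aead(tfm);
	return err;
}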
+ */ + +#include <crypto/cryptd.h> +#include <crypto/internal/aead.h> +#include <crypto/internal/skcipher.h> +#include <crypto/morus1280_glue.h> +#include <crypto/scatterwalk.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/scatterlist.h> +#include <asm/fpu/api.h> + +struct morus1280_state { +	struct morus1280_block s[MORUS_STATE_BLOCKS]; +}; + +struct morus1280_ops { +	int (*skcipher_walk_init)(struct skcipher_walk *walk, +				  struct aead_request *req, bool atomic); + +	void (*crypt_blocks)(void *state, const void *src, void *dst, +			     unsigned int length); +	void (*crypt_tail)(void *state, const void *src, void *dst, +			   unsigned int length); +}; + +static void crypto_morus1280_glue_process_ad( +		struct morus1280_state *state, +		const struct morus1280_glue_ops *ops, +		struct scatterlist *sg_src, unsigned int assoclen) +{ +	struct scatter_walk walk; +	struct morus1280_block buf; +	unsigned int pos = 0; + +	scatterwalk_start(&walk, sg_src); +	while (assoclen != 0) { +		unsigned int size = scatterwalk_clamp(&walk, assoclen); +		unsigned int left = size; +		void *mapped = scatterwalk_map(&walk); +		const u8 *src = (const u8 *)mapped; + +		if (pos + size >= MORUS1280_BLOCK_SIZE) { +			if (pos > 0) { +				unsigned int fill = MORUS1280_BLOCK_SIZE - pos; +				memcpy(buf.bytes + pos, src, fill); +				ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE); +				pos = 0; +				left -= fill; +				src += fill; +			} + +			ops->ad(state, src, left); +			src += left & ~(MORUS1280_BLOCK_SIZE - 1); +			left &= MORUS1280_BLOCK_SIZE - 1; +		} + +		memcpy(buf.bytes + pos, src, left); + +		pos += left; +		assoclen -= size; +		scatterwalk_unmap(mapped); +		scatterwalk_advance(&walk, size); +		scatterwalk_done(&walk, 0, assoclen); +	} + +	if (pos > 0) { +		memset(buf.bytes + pos, 0, MORUS1280_BLOCK_SIZE - pos); +		ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE); +	} +} + +static void crypto_morus1280_glue_process_crypt(struct morus1280_state *state, +						struct morus1280_ops ops, +						struct aead_request *req) +{ +	struct skcipher_walk walk; +	u8 *cursor_src, *cursor_dst; +	unsigned int chunksize, base; + +	ops.skcipher_walk_init(&walk, req, false); + +	while (walk.nbytes) { +		cursor_src = walk.src.virt.addr; +		cursor_dst = walk.dst.virt.addr; +		chunksize = walk.nbytes; + +		ops.crypt_blocks(state, cursor_src, cursor_dst, chunksize); + +		base = chunksize & ~(MORUS1280_BLOCK_SIZE - 1); +		cursor_src += base; +		cursor_dst += base; +		chunksize &= MORUS1280_BLOCK_SIZE - 1; + +		if (chunksize > 0) +			ops.crypt_tail(state, cursor_src, cursor_dst, +				       chunksize); + +		skcipher_walk_done(&walk, 0); +	} +} + +int crypto_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key, +				 unsigned int keylen) +{ +	struct morus1280_ctx *ctx = crypto_aead_ctx(aead); + +	if (keylen == MORUS1280_BLOCK_SIZE) { +		memcpy(ctx->key.bytes, key, MORUS1280_BLOCK_SIZE); +	} else if (keylen == MORUS1280_BLOCK_SIZE / 2) { +		memcpy(ctx->key.bytes, key, keylen); +		memcpy(ctx->key.bytes + keylen, key, keylen); +	} else { +		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); +		return -EINVAL; +	} + +	return 0; +} +EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setkey); + +int crypto_morus1280_glue_setauthsize(struct crypto_aead *tfm, +				      unsigned int authsize) +{ +	return (authsize <= MORUS_MAX_AUTH_SIZE) ? 
0 : -EINVAL; +} +EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setauthsize); + +static void crypto_morus1280_glue_crypt(struct aead_request *req, +					struct morus1280_ops ops, +					unsigned int cryptlen, +					struct morus1280_block *tag_xor) +{ +	struct crypto_aead *tfm = crypto_aead_reqtfm(req); +	struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); +	struct morus1280_state state; + +	kernel_fpu_begin(); + +	ctx->ops->init(&state, &ctx->key, req->iv); +	crypto_morus1280_glue_process_ad(&state, ctx->ops, req->src, req->assoclen); +	crypto_morus1280_glue_process_crypt(&state, ops, req); +	ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen); + +	kernel_fpu_end(); +} + +int crypto_morus1280_glue_encrypt(struct aead_request *req) +{ +	struct crypto_aead *tfm = crypto_aead_reqtfm(req); +	struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); +	struct morus1280_ops OPS = { +		.skcipher_walk_init = skcipher_walk_aead_encrypt, +		.crypt_blocks = ctx->ops->enc, +		.crypt_tail = ctx->ops->enc_tail, +	}; + +	struct morus1280_block tag = {}; +	unsigned int authsize = crypto_aead_authsize(tfm); +	unsigned int cryptlen = req->cryptlen; + +	crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag); + +	scatterwalk_map_and_copy(tag.bytes, req->dst, +				 req->assoclen + cryptlen, authsize, 1); +	return 0; +} +EXPORT_SYMBOL_GPL(crypto_morus1280_glue_encrypt); + +int crypto_morus1280_glue_decrypt(struct aead_request *req) +{ +	static const u8 zeros[MORUS1280_BLOCK_SIZE] = {}; + +	struct crypto_aead *tfm = crypto_aead_reqtfm(req); +	struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); +	struct morus1280_ops OPS = { +		.skcipher_walk_init = skcipher_walk_aead_decrypt, +		.crypt_blocks = ctx->ops->dec, +		.crypt_tail = ctx->ops->dec_tail, +	}; + +	struct morus1280_block tag; +	unsigned int authsize = crypto_aead_authsize(tfm); +	unsigned int cryptlen = req->cryptlen - authsize; + +	scatterwalk_map_and_copy(tag.bytes, req->src, +				 req->assoclen + cryptlen, authsize, 0); + +	crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag); + +	return crypto_memneq(tag.bytes, zeros, authsize) ? 
-EBADMSG : 0; +} +EXPORT_SYMBOL_GPL(crypto_morus1280_glue_decrypt); + +void crypto_morus1280_glue_init_ops(struct crypto_aead *aead, +				    const struct morus1280_glue_ops *ops) +{ +	struct morus1280_ctx *ctx = crypto_aead_ctx(aead); +	ctx->ops = ops; +} +EXPORT_SYMBOL_GPL(crypto_morus1280_glue_init_ops); + +int cryptd_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key, +				 unsigned int keylen) +{ +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); +} +EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setkey); + +int cryptd_morus1280_glue_setauthsize(struct crypto_aead *aead, +				      unsigned int authsize) +{ +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); +} +EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setauthsize); + +int cryptd_morus1280_glue_encrypt(struct aead_request *req) +{ +	struct crypto_aead *aead = crypto_aead_reqtfm(req); +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	aead = &cryptd_tfm->base; +	if (irq_fpu_usable() && (!in_atomic() || +				 !cryptd_aead_queued(cryptd_tfm))) +		aead = cryptd_aead_child(cryptd_tfm); + +	aead_request_set_tfm(req, aead); + +	return crypto_aead_encrypt(req); +} +EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_encrypt); + +int cryptd_morus1280_glue_decrypt(struct aead_request *req) +{ +	struct crypto_aead *aead = crypto_aead_reqtfm(req); +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	aead = &cryptd_tfm->base; +	if (irq_fpu_usable() && (!in_atomic() || +				 !cryptd_aead_queued(cryptd_tfm))) +		aead = cryptd_aead_child(cryptd_tfm); + +	aead_request_set_tfm(req, aead); + +	return crypto_aead_decrypt(req); +} +EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_decrypt); + +int cryptd_morus1280_glue_init_tfm(struct crypto_aead *aead) +{ +	struct cryptd_aead *cryptd_tfm; +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	const char *name = crypto_aead_alg(aead)->base.cra_driver_name; +	char internal_name[CRYPTO_MAX_ALG_NAME]; + +	if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name) +			>= CRYPTO_MAX_ALG_NAME) +		return -ENAMETOOLONG; + +	cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL, +				       CRYPTO_ALG_INTERNAL); +	if (IS_ERR(cryptd_tfm)) +		return PTR_ERR(cryptd_tfm); + +	*ctx = cryptd_tfm; +	crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); +	return 0; +} +EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_init_tfm); + +void cryptd_morus1280_glue_exit_tfm(struct crypto_aead *aead) +{ +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); + +	cryptd_free_aead(*ctx); +} +EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_exit_tfm); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek <[email protected]>"); +MODULE_DESCRIPTION("MORUS-1280 AEAD mode -- glue for x86 optimizations"); diff --git a/arch/x86/crypto/morus640-sse2-asm.S b/arch/x86/crypto/morus640-sse2-asm.S new file mode 100644 index 000000000000..71c72a0a0862 --- /dev/null +++ b/arch/x86/crypto/morus640-sse2-asm.S @@ -0,0 +1,614 @@ +/* + * SSE2 implementation of MORUS-640 + * + * Copyright (c) 2017-2018 Ondrej Mosnacek <[email protected]> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. 
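In crypto_morus1280_glue_decrypt above, the tag copied out of the ciphertext is handed to final() as tag_xor, so the assembly XORs the freshly computed tag into it; authentication succeeds exactly when the result is all zero bytes, which is what the constant-time crypto_memneq() comparison against a zero block verifies. A minimal model of that check, with a made-up helper name, purely for illustration:

#include <linux/errno.h>
#include <linux/types.h>

/* tag_xored holds expected_tag XOR computed_tag after final(). */
static int morus_tag_check_model(const u8 *tag_xored, unsigned int authsize)
{
	u8 acc = 0;
	unsigned int i;

	for (i = 0; i < authsize; i++)	/* constant time, like crypto_memneq() */
		acc |= tag_xored[i];

	return acc ? -EBADMSG : 0;
}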
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +#define SHUFFLE_MASK(i0, i1, i2, i3) \ +	(i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) + +#define MASK1 SHUFFLE_MASK(3, 0, 1, 2) +#define MASK2 SHUFFLE_MASK(2, 3, 0, 1) +#define MASK3 SHUFFLE_MASK(1, 2, 3, 0) + +#define STATE0	%xmm0 +#define STATE1	%xmm1 +#define STATE2	%xmm2 +#define STATE3	%xmm3 +#define STATE4	%xmm4 +#define KEY	%xmm5 +#define MSG	%xmm5 +#define T0	%xmm6 +#define T1	%xmm7 + +.section .rodata.cst16.morus640_const, "aM", @progbits, 32 +.align 16 +.Lmorus640_const_0: +	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d +	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 +.Lmorus640_const_1: +	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 +	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +.section .rodata.cst16.morus640_counter, "aM", @progbits, 16 +.align 16 +.Lmorus640_counter: +	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 +	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + +.text + +.macro morus640_round s0, s1, s2, s3, s4, b, w +	movdqa \s1, T0 +	pand \s2, T0 +	pxor T0, \s0 +	pxor \s3, \s0 +	movdqa \s0, T0 +	pslld $\b, T0 +	psrld $(32 - \b), \s0 +	pxor T0, \s0 +	pshufd $\w, \s3, \s3 +.endm + +/* + * __morus640_update: internal ABI + * input: + *   STATE[0-4] - input state + *   MSG        - message block + * output: + *   STATE[0-4] - output state + * changed: + *   T0 + */ +__morus640_update: +	morus640_round STATE0, STATE1, STATE2, STATE3, STATE4,  5, MASK1 +	pxor MSG, STATE1 +	morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2 +	pxor MSG, STATE2 +	morus640_round STATE2, STATE3, STATE4, STATE0, STATE1,  7, MASK3 +	pxor MSG, STATE3 +	morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2 +	pxor MSG, STATE4 +	morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1 +	ret +ENDPROC(__morus640_update) + + +/* + * __morus640_update_zero: internal ABI + * input: + *   STATE[0-4] - input state + * output: + *   STATE[0-4] - output state + * changed: + *   T0 + */ +__morus640_update_zero: +	morus640_round STATE0, STATE1, STATE2, STATE3, STATE4,  5, MASK1 +	morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2 +	morus640_round STATE2, STATE3, STATE4, STATE0, STATE1,  7, MASK3 +	morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2 +	morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1 +	ret +ENDPROC(__morus640_update_zero) + +/* + * __load_partial: internal ABI + * input: + *   %rsi - src + *   %rcx - bytes + * output: + *   MSG  - message block + * changed: + *   T0 + *   %r8 + *   %r9 + */ +__load_partial: +	xor %r9, %r9 +	pxor MSG, MSG + +	mov %rcx, %r8 +	and $0x1, %r8 +	jz .Lld_partial_1 + +	mov %rcx, %r8 +	and $0x1E, %r8 +	add %rsi, %r8 +	mov (%r8), %r9b + +.Lld_partial_1: +	mov %rcx, %r8 +	and $0x2, %r8 +	jz .Lld_partial_2 + +	mov %rcx, %r8 +	and $0x1C, %r8 +	add %rsi, %r8 +	shl $16, %r9 +	mov (%r8), %r9w + +.Lld_partial_2: +	mov %rcx, %r8 +	and $0x4, %r8 +	jz .Lld_partial_4 + +	mov %rcx, %r8 +	and $0x18, %r8 +	add %rsi, %r8 +	shl $32, %r9 +	mov (%r8), %r8d +	xor %r8, %r9 + +.Lld_partial_4: +	movq %r9, MSG + +	mov %rcx, %r8 +	and $0x8, %r8 +	jz .Lld_partial_8 + +	mov %rcx, %r8 +	and $0x10, %r8 +	add %rsi, %r8 +	pslldq $8, MSG +	movq (%r8), T0 +	pxor T0, MSG + +.Lld_partial_8: +	ret +ENDPROC(__load_partial) + +/* + * 
__store_partial: internal ABI + * input: + *   %rdx - dst + *   %rcx - bytes + * output: + *   T0   - message block + * changed: + *   %r8 + *   %r9 + *   %r10 + */ +__store_partial: +	mov %rcx, %r8 +	mov %rdx, %r9 + +	movq T0, %r10 + +	cmp $8, %r8 +	jl .Lst_partial_8 + +	mov %r10, (%r9) +	psrldq $8, T0 +	movq T0, %r10 + +	sub $8, %r8 +	add $8, %r9 + +.Lst_partial_8: +	cmp $4, %r8 +	jl .Lst_partial_4 + +	mov %r10d, (%r9) +	shr $32, %r10 + +	sub $4, %r8 +	add $4, %r9 + +.Lst_partial_4: +	cmp $2, %r8 +	jl .Lst_partial_2 + +	mov %r10w, (%r9) +	shr $16, %r10 + +	sub $2, %r8 +	add $2, %r9 + +.Lst_partial_2: +	cmp $1, %r8 +	jl .Lst_partial_1 + +	mov %r10b, (%r9) + +.Lst_partial_1: +	ret +ENDPROC(__store_partial) + +/* + * void crypto_morus640_sse2_init(void *state, const void *key, const void *iv); + */ +ENTRY(crypto_morus640_sse2_init) +	FRAME_BEGIN + +	/* load IV: */ +	movdqu (%rdx), STATE0 +	/* load key: */ +	movdqu (%rsi), KEY +	movdqa KEY, STATE1 +	/* load all ones: */ +	pcmpeqd STATE2, STATE2 +	/* load the constants: */ +	movdqa .Lmorus640_const_0, STATE3 +	movdqa .Lmorus640_const_1, STATE4 + +	/* update 16 times with zero: */ +	call __morus640_update_zero +	call __morus640_update_zero +	call __morus640_update_zero +	call __morus640_update_zero +	call __morus640_update_zero +	call __morus640_update_zero +	call __morus640_update_zero +	call __morus640_update_zero +	call __morus640_update_zero +	call __morus640_update_zero +	call __morus640_update_zero +	call __morus640_update_zero +	call __morus640_update_zero +	call __morus640_update_zero +	call __morus640_update_zero +	call __morus640_update_zero + +	/* xor-in the key again after updates: */ +	pxor KEY, STATE1 + +	/* store the state: */ +	movdqu STATE0, (0 * 16)(%rdi) +	movdqu STATE1, (1 * 16)(%rdi) +	movdqu STATE2, (2 * 16)(%rdi) +	movdqu STATE3, (3 * 16)(%rdi) +	movdqu STATE4, (4 * 16)(%rdi) + +	FRAME_END +	ret +ENDPROC(crypto_morus640_sse2_init) + +/* + * void crypto_morus640_sse2_ad(void *state, const void *data, + *                              unsigned int length); + */ +ENTRY(crypto_morus640_sse2_ad) +	FRAME_BEGIN + +	cmp $16, %rdx +	jb .Lad_out + +	/* load the state: */ +	movdqu (0 * 16)(%rdi), STATE0 +	movdqu (1 * 16)(%rdi), STATE1 +	movdqu (2 * 16)(%rdi), STATE2 +	movdqu (3 * 16)(%rdi), STATE3 +	movdqu (4 * 16)(%rdi), STATE4 + +	mov %rsi, %r8 +	and $0xF, %r8 +	jnz .Lad_u_loop + +.align 4 +.Lad_a_loop: +	movdqa (%rsi), MSG +	call __morus640_update +	sub $16, %rdx +	add $16, %rsi +	cmp $16, %rdx +	jge .Lad_a_loop + +	jmp .Lad_cont +.align 4 +.Lad_u_loop: +	movdqu (%rsi), MSG +	call __morus640_update +	sub $16, %rdx +	add $16, %rsi +	cmp $16, %rdx +	jge .Lad_u_loop + +.Lad_cont: +	/* store the state: */ +	movdqu STATE0, (0 * 16)(%rdi) +	movdqu STATE1, (1 * 16)(%rdi) +	movdqu STATE2, (2 * 16)(%rdi) +	movdqu STATE3, (3 * 16)(%rdi) +	movdqu STATE4, (4 * 16)(%rdi) + +.Lad_out: +	FRAME_END +	ret +ENDPROC(crypto_morus640_sse2_ad) + +/* + * void crypto_morus640_sse2_enc(void *state, const void *src, void *dst, + *                               unsigned int length); + */ +ENTRY(crypto_morus640_sse2_enc) +	FRAME_BEGIN + +	cmp $16, %rcx +	jb .Lenc_out + +	/* load the state: */ +	movdqu (0 * 16)(%rdi), STATE0 +	movdqu (1 * 16)(%rdi), STATE1 +	movdqu (2 * 16)(%rdi), STATE2 +	movdqu (3 * 16)(%rdi), STATE3 +	movdqu (4 * 16)(%rdi), STATE4 + +	mov %rsi, %r8 +	or  %rdx, %r8 +	and $0xF, %r8 +	jnz .Lenc_u_loop + +.align 4 +.Lenc_a_loop: +	movdqa (%rsi), MSG +	movdqa MSG, T0 +	pxor STATE0, T0 +	pshufd $MASK3, STATE1, T1 +	pxor T1, T0 +	movdqa STATE2, 
T1 +	pand STATE3, T1 +	pxor T1, T0 +	movdqa T0, (%rdx) + +	call __morus640_update +	sub $16, %rcx +	add $16, %rsi +	add $16, %rdx +	cmp $16, %rcx +	jge .Lenc_a_loop + +	jmp .Lenc_cont +.align 4 +.Lenc_u_loop: +	movdqu (%rsi), MSG +	movdqa MSG, T0 +	pxor STATE0, T0 +	pshufd $MASK3, STATE1, T1 +	pxor T1, T0 +	movdqa STATE2, T1 +	pand STATE3, T1 +	pxor T1, T0 +	movdqu T0, (%rdx) + +	call __morus640_update +	sub $16, %rcx +	add $16, %rsi +	add $16, %rdx +	cmp $16, %rcx +	jge .Lenc_u_loop + +.Lenc_cont: +	/* store the state: */ +	movdqu STATE0, (0 * 16)(%rdi) +	movdqu STATE1, (1 * 16)(%rdi) +	movdqu STATE2, (2 * 16)(%rdi) +	movdqu STATE3, (3 * 16)(%rdi) +	movdqu STATE4, (4 * 16)(%rdi) + +.Lenc_out: +	FRAME_END +	ret +ENDPROC(crypto_morus640_sse2_enc) + +/* + * void crypto_morus640_sse2_enc_tail(void *state, const void *src, void *dst, + *                                    unsigned int length); + */ +ENTRY(crypto_morus640_sse2_enc_tail) +	FRAME_BEGIN + +	/* load the state: */ +	movdqu (0 * 16)(%rdi), STATE0 +	movdqu (1 * 16)(%rdi), STATE1 +	movdqu (2 * 16)(%rdi), STATE2 +	movdqu (3 * 16)(%rdi), STATE3 +	movdqu (4 * 16)(%rdi), STATE4 + +	/* encrypt message: */ +	call __load_partial + +	movdqa MSG, T0 +	pxor STATE0, T0 +	pshufd $MASK3, STATE1, T1 +	pxor T1, T0 +	movdqa STATE2, T1 +	pand STATE3, T1 +	pxor T1, T0 + +	call __store_partial + +	call __morus640_update + +	/* store the state: */ +	movdqu STATE0, (0 * 16)(%rdi) +	movdqu STATE1, (1 * 16)(%rdi) +	movdqu STATE2, (2 * 16)(%rdi) +	movdqu STATE3, (3 * 16)(%rdi) +	movdqu STATE4, (4 * 16)(%rdi) + +	FRAME_END +ENDPROC(crypto_morus640_sse2_enc_tail) + +/* + * void crypto_morus640_sse2_dec(void *state, const void *src, void *dst, + *                               unsigned int length); + */ +ENTRY(crypto_morus640_sse2_dec) +	FRAME_BEGIN + +	cmp $16, %rcx +	jb .Ldec_out + +	/* load the state: */ +	movdqu (0 * 16)(%rdi), STATE0 +	movdqu (1 * 16)(%rdi), STATE1 +	movdqu (2 * 16)(%rdi), STATE2 +	movdqu (3 * 16)(%rdi), STATE3 +	movdqu (4 * 16)(%rdi), STATE4 + +	mov %rsi, %r8 +	or  %rdx, %r8 +	and $0xF, %r8 +	jnz .Ldec_u_loop + +.align 4 +.Ldec_a_loop: +	movdqa (%rsi), MSG +	pxor STATE0, MSG +	pshufd $MASK3, STATE1, T0 +	pxor T0, MSG +	movdqa STATE2, T0 +	pand STATE3, T0 +	pxor T0, MSG +	movdqa MSG, (%rdx) + +	call __morus640_update +	sub $16, %rcx +	add $16, %rsi +	add $16, %rdx +	cmp $16, %rcx +	jge .Ldec_a_loop + +	jmp .Ldec_cont +.align 4 +.Ldec_u_loop: +	movdqu (%rsi), MSG +	pxor STATE0, MSG +	pshufd $MASK3, STATE1, T0 +	pxor T0, MSG +	movdqa STATE2, T0 +	pand STATE3, T0 +	pxor T0, MSG +	movdqu MSG, (%rdx) + +	call __morus640_update +	sub $16, %rcx +	add $16, %rsi +	add $16, %rdx +	cmp $16, %rcx +	jge .Ldec_u_loop + +.Ldec_cont: +	/* store the state: */ +	movdqu STATE0, (0 * 16)(%rdi) +	movdqu STATE1, (1 * 16)(%rdi) +	movdqu STATE2, (2 * 16)(%rdi) +	movdqu STATE3, (3 * 16)(%rdi) +	movdqu STATE4, (4 * 16)(%rdi) + +.Ldec_out: +	FRAME_END +	ret +ENDPROC(crypto_morus640_sse2_dec) + +/* + * void crypto_morus640_sse2_dec_tail(void *state, const void *src, void *dst, + *                                    unsigned int length); + */ +ENTRY(crypto_morus640_sse2_dec_tail) +	FRAME_BEGIN + +	/* load the state: */ +	movdqu (0 * 16)(%rdi), STATE0 +	movdqu (1 * 16)(%rdi), STATE1 +	movdqu (2 * 16)(%rdi), STATE2 +	movdqu (3 * 16)(%rdi), STATE3 +	movdqu (4 * 16)(%rdi), STATE4 + +	/* decrypt message: */ +	call __load_partial + +	pxor STATE0, MSG +	pshufd $MASK3, STATE1, T0 +	pxor T0, MSG +	movdqa STATE2, T0 +	pand STATE3, T0 +	pxor T0, MSG +	movdqa MSG, T0 + +	call 
__store_partial + +	/* mask with byte count: */ +	movq %rcx, T0 +	punpcklbw T0, T0 +	punpcklbw T0, T0 +	punpcklbw T0, T0 +	punpcklbw T0, T0 +	movdqa .Lmorus640_counter, T1 +	pcmpgtb T1, T0 +	pand T0, MSG + +	call __morus640_update + +	/* store the state: */ +	movdqu STATE0, (0 * 16)(%rdi) +	movdqu STATE1, (1 * 16)(%rdi) +	movdqu STATE2, (2 * 16)(%rdi) +	movdqu STATE3, (3 * 16)(%rdi) +	movdqu STATE4, (4 * 16)(%rdi) + +	FRAME_END +	ret +ENDPROC(crypto_morus640_sse2_dec_tail) + +/* + * void crypto_morus640_sse2_final(void *state, void *tag_xor, + *	                           u64 assoclen, u64 cryptlen); + */ +ENTRY(crypto_morus640_sse2_final) +	FRAME_BEGIN + +	/* load the state: */ +	movdqu (0 * 16)(%rdi), STATE0 +	movdqu (1 * 16)(%rdi), STATE1 +	movdqu (2 * 16)(%rdi), STATE2 +	movdqu (3 * 16)(%rdi), STATE3 +	movdqu (4 * 16)(%rdi), STATE4 + +	/* xor state[0] into state[4]: */ +	pxor STATE0, STATE4 + +	/* prepare length block: */ +	movq %rdx, MSG +	movq %rcx, T0 +	pslldq $8, T0 +	pxor T0, MSG +	psllq $3, MSG /* multiply by 8 (to get bit count) */ + +	/* update state: */ +	call __morus640_update +	call __morus640_update +	call __morus640_update +	call __morus640_update +	call __morus640_update +	call __morus640_update +	call __morus640_update +	call __morus640_update +	call __morus640_update +	call __morus640_update + +	/* xor tag: */ +	movdqu (%rsi), MSG + +	pxor STATE0, MSG +	pshufd $MASK3, STATE1, T0 +	pxor T0, MSG +	movdqa STATE2, T0 +	pand STATE3, T0 +	pxor T0, MSG + +	movdqu MSG, (%rsi) + +	FRAME_END +	ret +ENDPROC(crypto_morus640_sse2_final) diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c new file mode 100644 index 000000000000..26b47e2db8d2 --- /dev/null +++ b/arch/x86/crypto/morus640-sse2-glue.c @@ -0,0 +1,68 @@ +/* + * The MORUS-640 Authenticated-Encryption Algorithm + *   Glue for SSE2 implementation + * + * Copyright (c) 2016-2018 Ondrej Mosnacek <[email protected]> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
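Both the aligned and unaligned loops in crypto_morus640_sse2_enc/dec above compute the same 128-bit keystream word and only differ in whether the plaintext or the ciphertext is XORed with it; the pshufd with MASK3 is a rotation of STATE1 by one dword. A word-level C sketch of that keystream, using hypothetical names that are not in the patch:

#include <linux/types.h>

/* s[0..4] are the five xmm state words, each as four 32-bit lanes. */
static void morus640_keystream_model(u32 z[4], const u32 s[5][4])
{
	int i;

	for (i = 0; i < 4; i++)	/* MASK3: lane i of the rotated S1 is lane (i+1)%4 */
		z[i] = s[0][i] ^ s[1][(i + 1) % 4] ^ (s[2][i] & s[3][i]);
}

Encryption stores P ^ z and passes the untouched plaintext to __morus640_update; decryption stores C ^ z and updates the state with that just-recovered plaintext, which is why the decrypt loops overwrite MSG in place while the encrypt loops work on a copy in T0.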
+ */ + +#include <crypto/internal/aead.h> +#include <crypto/morus640_glue.h> +#include <linux/module.h> +#include <asm/fpu/api.h> +#include <asm/cpu_device_id.h> + +asmlinkage void crypto_morus640_sse2_init(void *state, const void *key, +					  const void *iv); +asmlinkage void crypto_morus640_sse2_ad(void *state, const void *data, +					unsigned int length); + +asmlinkage void crypto_morus640_sse2_enc(void *state, const void *src, +					 void *dst, unsigned int length); +asmlinkage void crypto_morus640_sse2_dec(void *state, const void *src, +					 void *dst, unsigned int length); + +asmlinkage void crypto_morus640_sse2_enc_tail(void *state, const void *src, +					      void *dst, unsigned int length); +asmlinkage void crypto_morus640_sse2_dec_tail(void *state, const void *src, +					      void *dst, unsigned int length); + +asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor, +					   u64 assoclen, u64 cryptlen); + +MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400); + +static const struct x86_cpu_id sse2_cpu_id[] = { +    X86_FEATURE_MATCH(X86_FEATURE_XMM2), +    {} +}; +MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id); + +static int __init crypto_morus640_sse2_module_init(void) +{ +	if (!x86_match_cpu(sse2_cpu_id)) +		return -ENODEV; + +	return crypto_register_aeads(crypto_morus640_sse2_algs, +				     ARRAY_SIZE(crypto_morus640_sse2_algs)); +} + +static void __exit crypto_morus640_sse2_module_exit(void) +{ +	crypto_unregister_aeads(crypto_morus640_sse2_algs, +				ARRAY_SIZE(crypto_morus640_sse2_algs)); +} + +module_init(crypto_morus640_sse2_module_init); +module_exit(crypto_morus640_sse2_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek <[email protected]>"); +MODULE_DESCRIPTION("MORUS-640 AEAD algorithm -- SSE2 implementation"); +MODULE_ALIAS_CRYPTO("morus640"); +MODULE_ALIAS_CRYPTO("morus640-sse2"); diff --git a/arch/x86/crypto/morus640_glue.c b/arch/x86/crypto/morus640_glue.c new file mode 100644 index 000000000000..7b58fe4d9bd1 --- /dev/null +++ b/arch/x86/crypto/morus640_glue.c @@ -0,0 +1,298 @@ +/* + * The MORUS-640 Authenticated-Encryption Algorithm + *   Common x86 SIMD glue skeleton + * + * Copyright (c) 2016-2018 Ondrej Mosnacek <[email protected]> + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
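The dec_tail routine earlier in the assembly cannot feed the whole 16-byte register into the state update when fewer bytes remain: the byte count is broadcast to every lane by the repeated punpcklbw, compared against the 0..15 counter constant with pcmpgtb, and the resulting mask zeroes every lane at or past the count before __morus640_update runs. A byte-level model of that masking step, with an invented helper name, for illustration only:

#include <linux/types.h>

/* Only the first `count` decrypted bytes may enter the state update. */
static void morus640_mask_tail_model(u8 block[16], unsigned int count)
{
	unsigned int i;

	for (i = 0; i < 16; i++)
		if (!(count > i))	/* pcmpgtb keeps lane i iff count > i */
			block[i] = 0;
}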
+ */ + +#include <crypto/cryptd.h> +#include <crypto/internal/aead.h> +#include <crypto/internal/skcipher.h> +#include <crypto/morus640_glue.h> +#include <crypto/scatterwalk.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/scatterlist.h> +#include <asm/fpu/api.h> + +struct morus640_state { +	struct morus640_block s[MORUS_STATE_BLOCKS]; +}; + +struct morus640_ops { +	int (*skcipher_walk_init)(struct skcipher_walk *walk, +				  struct aead_request *req, bool atomic); + +	void (*crypt_blocks)(void *state, const void *src, void *dst, +			     unsigned int length); +	void (*crypt_tail)(void *state, const void *src, void *dst, +			   unsigned int length); +}; + +static void crypto_morus640_glue_process_ad( +		struct morus640_state *state, +		const struct morus640_glue_ops *ops, +		struct scatterlist *sg_src, unsigned int assoclen) +{ +	struct scatter_walk walk; +	struct morus640_block buf; +	unsigned int pos = 0; + +	scatterwalk_start(&walk, sg_src); +	while (assoclen != 0) { +		unsigned int size = scatterwalk_clamp(&walk, assoclen); +		unsigned int left = size; +		void *mapped = scatterwalk_map(&walk); +		const u8 *src = (const u8 *)mapped; + +		if (pos + size >= MORUS640_BLOCK_SIZE) { +			if (pos > 0) { +				unsigned int fill = MORUS640_BLOCK_SIZE - pos; +				memcpy(buf.bytes + pos, src, fill); +				ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE); +				pos = 0; +				left -= fill; +				src += fill; +			} + +			ops->ad(state, src, left); +			src += left & ~(MORUS640_BLOCK_SIZE - 1); +			left &= MORUS640_BLOCK_SIZE - 1; +		} + +		memcpy(buf.bytes + pos, src, left); + +		pos += left; +		assoclen -= size; +		scatterwalk_unmap(mapped); +		scatterwalk_advance(&walk, size); +		scatterwalk_done(&walk, 0, assoclen); +	} + +	if (pos > 0) { +		memset(buf.bytes + pos, 0, MORUS640_BLOCK_SIZE - pos); +		ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE); +	} +} + +static void crypto_morus640_glue_process_crypt(struct morus640_state *state, +					       struct morus640_ops ops, +					       struct aead_request *req) +{ +	struct skcipher_walk walk; +	u8 *cursor_src, *cursor_dst; +	unsigned int chunksize, base; + +	ops.skcipher_walk_init(&walk, req, false); + +	while (walk.nbytes) { +		cursor_src = walk.src.virt.addr; +		cursor_dst = walk.dst.virt.addr; +		chunksize = walk.nbytes; + +		ops.crypt_blocks(state, cursor_src, cursor_dst, chunksize); + +		base = chunksize & ~(MORUS640_BLOCK_SIZE - 1); +		cursor_src += base; +		cursor_dst += base; +		chunksize &= MORUS640_BLOCK_SIZE - 1; + +		if (chunksize > 0) +			ops.crypt_tail(state, cursor_src, cursor_dst, +				       chunksize); + +		skcipher_walk_done(&walk, 0); +	} +} + +int crypto_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key, +				unsigned int keylen) +{ +	struct morus640_ctx *ctx = crypto_aead_ctx(aead); + +	if (keylen != MORUS640_BLOCK_SIZE) { +		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); +		return -EINVAL; +	} + +	memcpy(ctx->key.bytes, key, MORUS640_BLOCK_SIZE); +	return 0; +} +EXPORT_SYMBOL_GPL(crypto_morus640_glue_setkey); + +int crypto_morus640_glue_setauthsize(struct crypto_aead *tfm, +				     unsigned int authsize) +{ +	return (authsize <= MORUS_MAX_AUTH_SIZE) ? 
0 : -EINVAL; +} +EXPORT_SYMBOL_GPL(crypto_morus640_glue_setauthsize); + +static void crypto_morus640_glue_crypt(struct aead_request *req, +				       struct morus640_ops ops, +				       unsigned int cryptlen, +				       struct morus640_block *tag_xor) +{ +	struct crypto_aead *tfm = crypto_aead_reqtfm(req); +	struct morus640_ctx *ctx = crypto_aead_ctx(tfm); +	struct morus640_state state; + +	kernel_fpu_begin(); + +	ctx->ops->init(&state, &ctx->key, req->iv); +	crypto_morus640_glue_process_ad(&state, ctx->ops, req->src, req->assoclen); +	crypto_morus640_glue_process_crypt(&state, ops, req); +	ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen); + +	kernel_fpu_end(); +} + +int crypto_morus640_glue_encrypt(struct aead_request *req) +{ +	struct crypto_aead *tfm = crypto_aead_reqtfm(req); +	struct morus640_ctx *ctx = crypto_aead_ctx(tfm); +	struct morus640_ops OPS = { +		.skcipher_walk_init = skcipher_walk_aead_encrypt, +		.crypt_blocks = ctx->ops->enc, +		.crypt_tail = ctx->ops->enc_tail, +	}; + +	struct morus640_block tag = {}; +	unsigned int authsize = crypto_aead_authsize(tfm); +	unsigned int cryptlen = req->cryptlen; + +	crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag); + +	scatterwalk_map_and_copy(tag.bytes, req->dst, +				 req->assoclen + cryptlen, authsize, 1); +	return 0; +} +EXPORT_SYMBOL_GPL(crypto_morus640_glue_encrypt); + +int crypto_morus640_glue_decrypt(struct aead_request *req) +{ +	static const u8 zeros[MORUS640_BLOCK_SIZE] = {}; + +	struct crypto_aead *tfm = crypto_aead_reqtfm(req); +	struct morus640_ctx *ctx = crypto_aead_ctx(tfm); +	struct morus640_ops OPS = { +		.skcipher_walk_init = skcipher_walk_aead_decrypt, +		.crypt_blocks = ctx->ops->dec, +		.crypt_tail = ctx->ops->dec_tail, +	}; + +	struct morus640_block tag; +	unsigned int authsize = crypto_aead_authsize(tfm); +	unsigned int cryptlen = req->cryptlen - authsize; + +	scatterwalk_map_and_copy(tag.bytes, req->src, +				 req->assoclen + cryptlen, authsize, 0); + +	crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag); + +	return crypto_memneq(tag.bytes, zeros, authsize) ? 
-EBADMSG : 0; +} +EXPORT_SYMBOL_GPL(crypto_morus640_glue_decrypt); + +void crypto_morus640_glue_init_ops(struct crypto_aead *aead, +				   const struct morus640_glue_ops *ops) +{ +	struct morus640_ctx *ctx = crypto_aead_ctx(aead); +	ctx->ops = ops; +} +EXPORT_SYMBOL_GPL(crypto_morus640_glue_init_ops); + +int cryptd_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key, +				unsigned int keylen) +{ +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); +} +EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setkey); + +int cryptd_morus640_glue_setauthsize(struct crypto_aead *aead, +				     unsigned int authsize) +{ +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); +} +EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setauthsize); + +int cryptd_morus640_glue_encrypt(struct aead_request *req) +{ +	struct crypto_aead *aead = crypto_aead_reqtfm(req); +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	aead = &cryptd_tfm->base; +	if (irq_fpu_usable() && (!in_atomic() || +				 !cryptd_aead_queued(cryptd_tfm))) +		aead = cryptd_aead_child(cryptd_tfm); + +	aead_request_set_tfm(req, aead); + +	return crypto_aead_encrypt(req); +} +EXPORT_SYMBOL_GPL(cryptd_morus640_glue_encrypt); + +int cryptd_morus640_glue_decrypt(struct aead_request *req) +{ +	struct crypto_aead *aead = crypto_aead_reqtfm(req); +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	struct cryptd_aead *cryptd_tfm = *ctx; + +	aead = &cryptd_tfm->base; +	if (irq_fpu_usable() && (!in_atomic() || +				 !cryptd_aead_queued(cryptd_tfm))) +		aead = cryptd_aead_child(cryptd_tfm); + +	aead_request_set_tfm(req, aead); + +	return crypto_aead_decrypt(req); +} +EXPORT_SYMBOL_GPL(cryptd_morus640_glue_decrypt); + +int cryptd_morus640_glue_init_tfm(struct crypto_aead *aead) +{ +	struct cryptd_aead *cryptd_tfm; +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); +	const char *name = crypto_aead_alg(aead)->base.cra_driver_name; +	char internal_name[CRYPTO_MAX_ALG_NAME]; + +	if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name) +			>= CRYPTO_MAX_ALG_NAME) +		return -ENAMETOOLONG; + +	cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL, +				       CRYPTO_ALG_INTERNAL); +	if (IS_ERR(cryptd_tfm)) +		return PTR_ERR(cryptd_tfm); + +	*ctx = cryptd_tfm; +	crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); +	return 0; +} +EXPORT_SYMBOL_GPL(cryptd_morus640_glue_init_tfm); + +void cryptd_morus640_glue_exit_tfm(struct crypto_aead *aead) +{ +	struct cryptd_aead **ctx = crypto_aead_ctx(aead); + +	cryptd_free_aead(*ctx); +} +EXPORT_SYMBOL_GPL(cryptd_morus640_glue_exit_tfm); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ondrej Mosnacek <[email protected]>"); +MODULE_DESCRIPTION("MORUS-640 AEAD mode -- glue for x86 optimizations"); diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S deleted file mode 100644 index 6014b7b9e52a..000000000000 --- a/arch/x86/crypto/salsa20-i586-asm_32.S +++ /dev/null @@ -1,938 +0,0 @@ -# Derived from: -#	salsa20_pm.s version 20051229 -#	D. J. Bernstein -#	Public domain. 
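The i586 and x86_64 Salsa20 files removed below are hand-scheduled versions of the Salsa20 core: per 64-byte block the register comments walk through 20 rounds (the main loop counts down from 20 by 4), i.e. ten of the classic double rounds such as x4 ^= rotl(x0 + x12, 7). For reference, a plain C sketch of that double round; the helper names are made up and this is not the code being deleted:

#include <linux/bitops.h>
#include <linux/types.h>

static void salsa20_qr(u32 *a, u32 *b, u32 *c, u32 *d)
{
	*b ^= rol32(*a + *d, 7);
	*c ^= rol32(*b + *a, 9);
	*d ^= rol32(*c + *b, 13);
	*a ^= rol32(*d + *c, 18);
}

static void salsa20_doubleround(u32 x[16])
{
	/* column round */
	salsa20_qr(&x[0],  &x[4],  &x[8],  &x[12]);
	salsa20_qr(&x[5],  &x[9],  &x[13], &x[1]);
	salsa20_qr(&x[10], &x[14], &x[2],  &x[6]);
	salsa20_qr(&x[15], &x[3],  &x[7],  &x[11]);
	/* row round */
	salsa20_qr(&x[0],  &x[1],  &x[2],  &x[3]);
	salsa20_qr(&x[5],  &x[6],  &x[7],  &x[4]);
	salsa20_qr(&x[10], &x[11], &x[8],  &x[9]);
	salsa20_qr(&x[15], &x[12], &x[13], &x[14]);
}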
- -#include <linux/linkage.h> - -.text - -# enter salsa20_encrypt_bytes -ENTRY(salsa20_encrypt_bytes) -	mov	%esp,%eax -	and	$31,%eax -	add	$256,%eax -	sub	%eax,%esp -	# eax_stack = eax -	movl	%eax,80(%esp) -	# ebx_stack = ebx -	movl	%ebx,84(%esp) -	# esi_stack = esi -	movl	%esi,88(%esp) -	# edi_stack = edi -	movl	%edi,92(%esp) -	# ebp_stack = ebp -	movl	%ebp,96(%esp) -	# x = arg1 -	movl	4(%esp,%eax),%edx -	# m = arg2 -	movl	8(%esp,%eax),%esi -	# out = arg3 -	movl	12(%esp,%eax),%edi -	# bytes = arg4 -	movl	16(%esp,%eax),%ebx -	# bytes -= 0 -	sub	$0,%ebx -	# goto done if unsigned<= -	jbe	._done -._start: -	# in0 = *(uint32 *) (x + 0) -	movl	0(%edx),%eax -	# in1 = *(uint32 *) (x + 4) -	movl	4(%edx),%ecx -	# in2 = *(uint32 *) (x + 8) -	movl	8(%edx),%ebp -	# j0 = in0 -	movl	%eax,164(%esp) -	# in3 = *(uint32 *) (x + 12) -	movl	12(%edx),%eax -	# j1 = in1 -	movl	%ecx,168(%esp) -	# in4 = *(uint32 *) (x + 16) -	movl	16(%edx),%ecx -	# j2 = in2 -	movl	%ebp,172(%esp) -	# in5 = *(uint32 *) (x + 20) -	movl	20(%edx),%ebp -	# j3 = in3 -	movl	%eax,176(%esp) -	# in6 = *(uint32 *) (x + 24) -	movl	24(%edx),%eax -	# j4 = in4 -	movl	%ecx,180(%esp) -	# in7 = *(uint32 *) (x + 28) -	movl	28(%edx),%ecx -	# j5 = in5 -	movl	%ebp,184(%esp) -	# in8 = *(uint32 *) (x + 32) -	movl	32(%edx),%ebp -	# j6 = in6 -	movl	%eax,188(%esp) -	# in9 = *(uint32 *) (x + 36) -	movl	36(%edx),%eax -	# j7 = in7 -	movl	%ecx,192(%esp) -	# in10 = *(uint32 *) (x + 40) -	movl	40(%edx),%ecx -	# j8 = in8 -	movl	%ebp,196(%esp) -	# in11 = *(uint32 *) (x + 44) -	movl	44(%edx),%ebp -	# j9 = in9 -	movl	%eax,200(%esp) -	# in12 = *(uint32 *) (x + 48) -	movl	48(%edx),%eax -	# j10 = in10 -	movl	%ecx,204(%esp) -	# in13 = *(uint32 *) (x + 52) -	movl	52(%edx),%ecx -	# j11 = in11 -	movl	%ebp,208(%esp) -	# in14 = *(uint32 *) (x + 56) -	movl	56(%edx),%ebp -	# j12 = in12 -	movl	%eax,212(%esp) -	# in15 = *(uint32 *) (x + 60) -	movl	60(%edx),%eax -	# j13 = in13 -	movl	%ecx,216(%esp) -	# j14 = in14 -	movl	%ebp,220(%esp) -	# j15 = in15 -	movl	%eax,224(%esp) -	# x_backup = x -	movl	%edx,64(%esp) -._bytesatleast1: -	#   bytes - 64 -	cmp	$64,%ebx -	#   goto nocopy if unsigned>= -	jae	._nocopy -	#     ctarget = out -	movl	%edi,228(%esp) -	#     out = &tmp -	leal	0(%esp),%edi -	#     i = bytes -	mov	%ebx,%ecx -	#     while (i) { *out++ = *m++; --i } -	rep	movsb -	#     out = &tmp -	leal	0(%esp),%edi -	#     m = &tmp -	leal	0(%esp),%esi -._nocopy: -	#   out_backup = out -	movl	%edi,72(%esp) -	#   m_backup = m -	movl	%esi,68(%esp) -	#   bytes_backup = bytes -	movl	%ebx,76(%esp) -	#   in0 = j0 -	movl	164(%esp),%eax -	#   in1 = j1 -	movl	168(%esp),%ecx -	#   in2 = j2 -	movl	172(%esp),%edx -	#   in3 = j3 -	movl	176(%esp),%ebx -	#   x0 = in0 -	movl	%eax,100(%esp) -	#   x1 = in1 -	movl	%ecx,104(%esp) -	#   x2 = in2 -	movl	%edx,108(%esp) -	#   x3 = in3 -	movl	%ebx,112(%esp) -	#   in4 = j4 -	movl	180(%esp),%eax -	#   in5 = j5 -	movl	184(%esp),%ecx -	#   in6 = j6 -	movl	188(%esp),%edx -	#   in7 = j7 -	movl	192(%esp),%ebx -	#   x4 = in4 -	movl	%eax,116(%esp) -	#   x5 = in5 -	movl	%ecx,120(%esp) -	#   x6 = in6 -	movl	%edx,124(%esp) -	#   x7 = in7 -	movl	%ebx,128(%esp) -	#   in8 = j8 -	movl	196(%esp),%eax -	#   in9 = j9 -	movl	200(%esp),%ecx -	#   in10 = j10 -	movl	204(%esp),%edx -	#   in11 = j11 -	movl	208(%esp),%ebx -	#   x8 = in8 -	movl	%eax,132(%esp) -	#   x9 = in9 -	movl	%ecx,136(%esp) -	#   x10 = in10 -	movl	%edx,140(%esp) -	#   x11 = in11 -	movl	%ebx,144(%esp) -	#   in12 = j12 -	movl	212(%esp),%eax -	#   in13 = j13 -	movl	216(%esp),%ecx -	#   in14 = j14 -	movl	220(%esp),%edx -	# 
  in15 = j15 -	movl	224(%esp),%ebx -	#   x12 = in12 -	movl	%eax,148(%esp) -	#   x13 = in13 -	movl	%ecx,152(%esp) -	#   x14 = in14 -	movl	%edx,156(%esp) -	#   x15 = in15 -	movl	%ebx,160(%esp) -	#   i = 20 -	mov	$20,%ebp -	# p = x0 -	movl	100(%esp),%eax -	# s = x5 -	movl	120(%esp),%ecx -	# t = x10 -	movl	140(%esp),%edx -	# w = x15 -	movl	160(%esp),%ebx -._mainloop: -	# x0 = p -	movl	%eax,100(%esp) -	# 				x10 = t -	movl	%edx,140(%esp) -	# p += x12 -	addl	148(%esp),%eax -	# 		x5 = s -	movl	%ecx,120(%esp) -	# 				t += x6 -	addl	124(%esp),%edx -	# 						x15 = w -	movl	%ebx,160(%esp) -	# 		r = x1 -	movl	104(%esp),%esi -	# 		r += s -	add	%ecx,%esi -	# 						v = x11 -	movl	144(%esp),%edi -	# 						v += w -	add	%ebx,%edi -	# p <<<= 7 -	rol	$7,%eax -	# p ^= x4 -	xorl	116(%esp),%eax -	# 				t <<<= 7 -	rol	$7,%edx -	# 				t ^= x14 -	xorl	156(%esp),%edx -	# 		r <<<= 7 -	rol	$7,%esi -	# 		r ^= x9 -	xorl	136(%esp),%esi -	# 						v <<<= 7 -	rol	$7,%edi -	# 						v ^= x3 -	xorl	112(%esp),%edi -	# x4 = p -	movl	%eax,116(%esp) -	# 				x14 = t -	movl	%edx,156(%esp) -	# p += x0 -	addl	100(%esp),%eax -	# 		x9 = r -	movl	%esi,136(%esp) -	# 				t += x10 -	addl	140(%esp),%edx -	# 						x3 = v -	movl	%edi,112(%esp) -	# p <<<= 9 -	rol	$9,%eax -	# p ^= x8 -	xorl	132(%esp),%eax -	# 				t <<<= 9 -	rol	$9,%edx -	# 				t ^= x2 -	xorl	108(%esp),%edx -	# 		s += r -	add	%esi,%ecx -	# 		s <<<= 9 -	rol	$9,%ecx -	# 		s ^= x13 -	xorl	152(%esp),%ecx -	# 						w += v -	add	%edi,%ebx -	# 						w <<<= 9 -	rol	$9,%ebx -	# 						w ^= x7 -	xorl	128(%esp),%ebx -	# x8 = p -	movl	%eax,132(%esp) -	# 				x2 = t -	movl	%edx,108(%esp) -	# p += x4 -	addl	116(%esp),%eax -	# 		x13 = s -	movl	%ecx,152(%esp) -	# 				t += x14 -	addl	156(%esp),%edx -	# 						x7 = w -	movl	%ebx,128(%esp) -	# p <<<= 13 -	rol	$13,%eax -	# p ^= x12 -	xorl	148(%esp),%eax -	# 				t <<<= 13 -	rol	$13,%edx -	# 				t ^= x6 -	xorl	124(%esp),%edx -	# 		r += s -	add	%ecx,%esi -	# 		r <<<= 13 -	rol	$13,%esi -	# 		r ^= x1 -	xorl	104(%esp),%esi -	# 						v += w -	add	%ebx,%edi -	# 						v <<<= 13 -	rol	$13,%edi -	# 						v ^= x11 -	xorl	144(%esp),%edi -	# x12 = p -	movl	%eax,148(%esp) -	# 				x6 = t -	movl	%edx,124(%esp) -	# p += x8 -	addl	132(%esp),%eax -	# 		x1 = r -	movl	%esi,104(%esp) -	# 				t += x2 -	addl	108(%esp),%edx -	# 						x11 = v -	movl	%edi,144(%esp) -	# p <<<= 18 -	rol	$18,%eax -	# p ^= x0 -	xorl	100(%esp),%eax -	# 				t <<<= 18 -	rol	$18,%edx -	# 				t ^= x10 -	xorl	140(%esp),%edx -	# 		s += r -	add	%esi,%ecx -	# 		s <<<= 18 -	rol	$18,%ecx -	# 		s ^= x5 -	xorl	120(%esp),%ecx -	# 						w += v -	add	%edi,%ebx -	# 						w <<<= 18 -	rol	$18,%ebx -	# 						w ^= x15 -	xorl	160(%esp),%ebx -	# x0 = p -	movl	%eax,100(%esp) -	# 				x10 = t -	movl	%edx,140(%esp) -	# p += x3 -	addl	112(%esp),%eax -	# p <<<= 7 -	rol	$7,%eax -	# 		x5 = s -	movl	%ecx,120(%esp) -	# 				t += x9 -	addl	136(%esp),%edx -	# 						x15 = w -	movl	%ebx,160(%esp) -	# 		r = x4 -	movl	116(%esp),%esi -	# 		r += s -	add	%ecx,%esi -	# 						v = x14 -	movl	156(%esp),%edi -	# 						v += w -	add	%ebx,%edi -	# p ^= x1 -	xorl	104(%esp),%eax -	# 				t <<<= 7 -	rol	$7,%edx -	# 				t ^= x11 -	xorl	144(%esp),%edx -	# 		r <<<= 7 -	rol	$7,%esi -	# 		r ^= x6 -	xorl	124(%esp),%esi -	# 						v <<<= 7 -	rol	$7,%edi -	# 						v ^= x12 -	xorl	148(%esp),%edi -	# x1 = p -	movl	%eax,104(%esp) -	# 				x11 = t -	movl	%edx,144(%esp) -	# p += x0 -	addl	100(%esp),%eax -	# 		x6 = r -	movl	%esi,124(%esp) -	# 				t += x10 -	addl	140(%esp),%edx -	# 						x12 = v -	movl	%edi,148(%esp) -	# p <<<= 9 -	rol	$9,%eax -	# p ^= x2 -	xorl	
108(%esp),%eax -	# 				t <<<= 9 -	rol	$9,%edx -	# 				t ^= x8 -	xorl	132(%esp),%edx -	# 		s += r -	add	%esi,%ecx -	# 		s <<<= 9 -	rol	$9,%ecx -	# 		s ^= x7 -	xorl	128(%esp),%ecx -	# 						w += v -	add	%edi,%ebx -	# 						w <<<= 9 -	rol	$9,%ebx -	# 						w ^= x13 -	xorl	152(%esp),%ebx -	# x2 = p -	movl	%eax,108(%esp) -	# 				x8 = t -	movl	%edx,132(%esp) -	# p += x1 -	addl	104(%esp),%eax -	# 		x7 = s -	movl	%ecx,128(%esp) -	# 				t += x11 -	addl	144(%esp),%edx -	# 						x13 = w -	movl	%ebx,152(%esp) -	# p <<<= 13 -	rol	$13,%eax -	# p ^= x3 -	xorl	112(%esp),%eax -	# 				t <<<= 13 -	rol	$13,%edx -	# 				t ^= x9 -	xorl	136(%esp),%edx -	# 		r += s -	add	%ecx,%esi -	# 		r <<<= 13 -	rol	$13,%esi -	# 		r ^= x4 -	xorl	116(%esp),%esi -	# 						v += w -	add	%ebx,%edi -	# 						v <<<= 13 -	rol	$13,%edi -	# 						v ^= x14 -	xorl	156(%esp),%edi -	# x3 = p -	movl	%eax,112(%esp) -	# 				x9 = t -	movl	%edx,136(%esp) -	# p += x2 -	addl	108(%esp),%eax -	# 		x4 = r -	movl	%esi,116(%esp) -	# 				t += x8 -	addl	132(%esp),%edx -	# 						x14 = v -	movl	%edi,156(%esp) -	# p <<<= 18 -	rol	$18,%eax -	# p ^= x0 -	xorl	100(%esp),%eax -	# 				t <<<= 18 -	rol	$18,%edx -	# 				t ^= x10 -	xorl	140(%esp),%edx -	# 		s += r -	add	%esi,%ecx -	# 		s <<<= 18 -	rol	$18,%ecx -	# 		s ^= x5 -	xorl	120(%esp),%ecx -	# 						w += v -	add	%edi,%ebx -	# 						w <<<= 18 -	rol	$18,%ebx -	# 						w ^= x15 -	xorl	160(%esp),%ebx -	# x0 = p -	movl	%eax,100(%esp) -	# 				x10 = t -	movl	%edx,140(%esp) -	# p += x12 -	addl	148(%esp),%eax -	# 		x5 = s -	movl	%ecx,120(%esp) -	# 				t += x6 -	addl	124(%esp),%edx -	# 						x15 = w -	movl	%ebx,160(%esp) -	# 		r = x1 -	movl	104(%esp),%esi -	# 		r += s -	add	%ecx,%esi -	# 						v = x11 -	movl	144(%esp),%edi -	# 						v += w -	add	%ebx,%edi -	# p <<<= 7 -	rol	$7,%eax -	# p ^= x4 -	xorl	116(%esp),%eax -	# 				t <<<= 7 -	rol	$7,%edx -	# 				t ^= x14 -	xorl	156(%esp),%edx -	# 		r <<<= 7 -	rol	$7,%esi -	# 		r ^= x9 -	xorl	136(%esp),%esi -	# 						v <<<= 7 -	rol	$7,%edi -	# 						v ^= x3 -	xorl	112(%esp),%edi -	# x4 = p -	movl	%eax,116(%esp) -	# 				x14 = t -	movl	%edx,156(%esp) -	# p += x0 -	addl	100(%esp),%eax -	# 		x9 = r -	movl	%esi,136(%esp) -	# 				t += x10 -	addl	140(%esp),%edx -	# 						x3 = v -	movl	%edi,112(%esp) -	# p <<<= 9 -	rol	$9,%eax -	# p ^= x8 -	xorl	132(%esp),%eax -	# 				t <<<= 9 -	rol	$9,%edx -	# 				t ^= x2 -	xorl	108(%esp),%edx -	# 		s += r -	add	%esi,%ecx -	# 		s <<<= 9 -	rol	$9,%ecx -	# 		s ^= x13 -	xorl	152(%esp),%ecx -	# 						w += v -	add	%edi,%ebx -	# 						w <<<= 9 -	rol	$9,%ebx -	# 						w ^= x7 -	xorl	128(%esp),%ebx -	# x8 = p -	movl	%eax,132(%esp) -	# 				x2 = t -	movl	%edx,108(%esp) -	# p += x4 -	addl	116(%esp),%eax -	# 		x13 = s -	movl	%ecx,152(%esp) -	# 				t += x14 -	addl	156(%esp),%edx -	# 						x7 = w -	movl	%ebx,128(%esp) -	# p <<<= 13 -	rol	$13,%eax -	# p ^= x12 -	xorl	148(%esp),%eax -	# 				t <<<= 13 -	rol	$13,%edx -	# 				t ^= x6 -	xorl	124(%esp),%edx -	# 		r += s -	add	%ecx,%esi -	# 		r <<<= 13 -	rol	$13,%esi -	# 		r ^= x1 -	xorl	104(%esp),%esi -	# 						v += w -	add	%ebx,%edi -	# 						v <<<= 13 -	rol	$13,%edi -	# 						v ^= x11 -	xorl	144(%esp),%edi -	# x12 = p -	movl	%eax,148(%esp) -	# 				x6 = t -	movl	%edx,124(%esp) -	# p += x8 -	addl	132(%esp),%eax -	# 		x1 = r -	movl	%esi,104(%esp) -	# 				t += x2 -	addl	108(%esp),%edx -	# 						x11 = v -	movl	%edi,144(%esp) -	# p <<<= 18 -	rol	$18,%eax -	# p ^= x0 -	xorl	100(%esp),%eax -	# 				t <<<= 18 -	rol	$18,%edx -	# 				t ^= x10 -	xorl	140(%esp),%edx -	# 		s += r -	add	%esi,%ecx -	# 		s <<<= 18 -	rol	$18,%ecx -	# 		s ^= 
x5 -	xorl	120(%esp),%ecx -	# 						w += v -	add	%edi,%ebx -	# 						w <<<= 18 -	rol	$18,%ebx -	# 						w ^= x15 -	xorl	160(%esp),%ebx -	# x0 = p -	movl	%eax,100(%esp) -	# 				x10 = t -	movl	%edx,140(%esp) -	# p += x3 -	addl	112(%esp),%eax -	# p <<<= 7 -	rol	$7,%eax -	# 		x5 = s -	movl	%ecx,120(%esp) -	# 				t += x9 -	addl	136(%esp),%edx -	# 						x15 = w -	movl	%ebx,160(%esp) -	# 		r = x4 -	movl	116(%esp),%esi -	# 		r += s -	add	%ecx,%esi -	# 						v = x14 -	movl	156(%esp),%edi -	# 						v += w -	add	%ebx,%edi -	# p ^= x1 -	xorl	104(%esp),%eax -	# 				t <<<= 7 -	rol	$7,%edx -	# 				t ^= x11 -	xorl	144(%esp),%edx -	# 		r <<<= 7 -	rol	$7,%esi -	# 		r ^= x6 -	xorl	124(%esp),%esi -	# 						v <<<= 7 -	rol	$7,%edi -	# 						v ^= x12 -	xorl	148(%esp),%edi -	# x1 = p -	movl	%eax,104(%esp) -	# 				x11 = t -	movl	%edx,144(%esp) -	# p += x0 -	addl	100(%esp),%eax -	# 		x6 = r -	movl	%esi,124(%esp) -	# 				t += x10 -	addl	140(%esp),%edx -	# 						x12 = v -	movl	%edi,148(%esp) -	# p <<<= 9 -	rol	$9,%eax -	# p ^= x2 -	xorl	108(%esp),%eax -	# 				t <<<= 9 -	rol	$9,%edx -	# 				t ^= x8 -	xorl	132(%esp),%edx -	# 		s += r -	add	%esi,%ecx -	# 		s <<<= 9 -	rol	$9,%ecx -	# 		s ^= x7 -	xorl	128(%esp),%ecx -	# 						w += v -	add	%edi,%ebx -	# 						w <<<= 9 -	rol	$9,%ebx -	# 						w ^= x13 -	xorl	152(%esp),%ebx -	# x2 = p -	movl	%eax,108(%esp) -	# 				x8 = t -	movl	%edx,132(%esp) -	# p += x1 -	addl	104(%esp),%eax -	# 		x7 = s -	movl	%ecx,128(%esp) -	# 				t += x11 -	addl	144(%esp),%edx -	# 						x13 = w -	movl	%ebx,152(%esp) -	# p <<<= 13 -	rol	$13,%eax -	# p ^= x3 -	xorl	112(%esp),%eax -	# 				t <<<= 13 -	rol	$13,%edx -	# 				t ^= x9 -	xorl	136(%esp),%edx -	# 		r += s -	add	%ecx,%esi -	# 		r <<<= 13 -	rol	$13,%esi -	# 		r ^= x4 -	xorl	116(%esp),%esi -	# 						v += w -	add	%ebx,%edi -	# 						v <<<= 13 -	rol	$13,%edi -	# 						v ^= x14 -	xorl	156(%esp),%edi -	# x3 = p -	movl	%eax,112(%esp) -	# 				x9 = t -	movl	%edx,136(%esp) -	# p += x2 -	addl	108(%esp),%eax -	# 		x4 = r -	movl	%esi,116(%esp) -	# 				t += x8 -	addl	132(%esp),%edx -	# 						x14 = v -	movl	%edi,156(%esp) -	# p <<<= 18 -	rol	$18,%eax -	# p ^= x0 -	xorl	100(%esp),%eax -	# 				t <<<= 18 -	rol	$18,%edx -	# 				t ^= x10 -	xorl	140(%esp),%edx -	# 		s += r -	add	%esi,%ecx -	# 		s <<<= 18 -	rol	$18,%ecx -	# 		s ^= x5 -	xorl	120(%esp),%ecx -	# 						w += v -	add	%edi,%ebx -	# 						w <<<= 18 -	rol	$18,%ebx -	# 						w ^= x15 -	xorl	160(%esp),%ebx -	# i -= 4 -	sub	$4,%ebp -	# goto mainloop if unsigned > -	ja	._mainloop -	# x0 = p -	movl	%eax,100(%esp) -	# x5 = s -	movl	%ecx,120(%esp) -	# x10 = t -	movl	%edx,140(%esp) -	# x15 = w -	movl	%ebx,160(%esp) -	#   out = out_backup -	movl	72(%esp),%edi -	#   m = m_backup -	movl	68(%esp),%esi -	#   in0 = x0 -	movl	100(%esp),%eax -	#   in1 = x1 -	movl	104(%esp),%ecx -	#   in0 += j0 -	addl	164(%esp),%eax -	#   in1 += j1 -	addl	168(%esp),%ecx -	#   in0 ^= *(uint32 *) (m + 0) -	xorl	0(%esi),%eax -	#   in1 ^= *(uint32 *) (m + 4) -	xorl	4(%esi),%ecx -	#   *(uint32 *) (out + 0) = in0 -	movl	%eax,0(%edi) -	#   *(uint32 *) (out + 4) = in1 -	movl	%ecx,4(%edi) -	#   in2 = x2 -	movl	108(%esp),%eax -	#   in3 = x3 -	movl	112(%esp),%ecx -	#   in2 += j2 -	addl	172(%esp),%eax -	#   in3 += j3 -	addl	176(%esp),%ecx -	#   in2 ^= *(uint32 *) (m + 8) -	xorl	8(%esi),%eax -	#   in3 ^= *(uint32 *) (m + 12) -	xorl	12(%esi),%ecx -	#   *(uint32 *) (out + 8) = in2 -	movl	%eax,8(%edi) -	#   *(uint32 *) (out + 12) = in3 -	movl	%ecx,12(%edi) -	#   in4 = x4 -	movl	116(%esp),%eax -	#   in5 = x5 -	movl	120(%esp),%ecx -	#   in4 += j4 -	addl	
180(%esp),%eax -	#   in5 += j5 -	addl	184(%esp),%ecx -	#   in4 ^= *(uint32 *) (m + 16) -	xorl	16(%esi),%eax -	#   in5 ^= *(uint32 *) (m + 20) -	xorl	20(%esi),%ecx -	#   *(uint32 *) (out + 16) = in4 -	movl	%eax,16(%edi) -	#   *(uint32 *) (out + 20) = in5 -	movl	%ecx,20(%edi) -	#   in6 = x6 -	movl	124(%esp),%eax -	#   in7 = x7 -	movl	128(%esp),%ecx -	#   in6 += j6 -	addl	188(%esp),%eax -	#   in7 += j7 -	addl	192(%esp),%ecx -	#   in6 ^= *(uint32 *) (m + 24) -	xorl	24(%esi),%eax -	#   in7 ^= *(uint32 *) (m + 28) -	xorl	28(%esi),%ecx -	#   *(uint32 *) (out + 24) = in6 -	movl	%eax,24(%edi) -	#   *(uint32 *) (out + 28) = in7 -	movl	%ecx,28(%edi) -	#   in8 = x8 -	movl	132(%esp),%eax -	#   in9 = x9 -	movl	136(%esp),%ecx -	#   in8 += j8 -	addl	196(%esp),%eax -	#   in9 += j9 -	addl	200(%esp),%ecx -	#   in8 ^= *(uint32 *) (m + 32) -	xorl	32(%esi),%eax -	#   in9 ^= *(uint32 *) (m + 36) -	xorl	36(%esi),%ecx -	#   *(uint32 *) (out + 32) = in8 -	movl	%eax,32(%edi) -	#   *(uint32 *) (out + 36) = in9 -	movl	%ecx,36(%edi) -	#   in10 = x10 -	movl	140(%esp),%eax -	#   in11 = x11 -	movl	144(%esp),%ecx -	#   in10 += j10 -	addl	204(%esp),%eax -	#   in11 += j11 -	addl	208(%esp),%ecx -	#   in10 ^= *(uint32 *) (m + 40) -	xorl	40(%esi),%eax -	#   in11 ^= *(uint32 *) (m + 44) -	xorl	44(%esi),%ecx -	#   *(uint32 *) (out + 40) = in10 -	movl	%eax,40(%edi) -	#   *(uint32 *) (out + 44) = in11 -	movl	%ecx,44(%edi) -	#   in12 = x12 -	movl	148(%esp),%eax -	#   in13 = x13 -	movl	152(%esp),%ecx -	#   in12 += j12 -	addl	212(%esp),%eax -	#   in13 += j13 -	addl	216(%esp),%ecx -	#   in12 ^= *(uint32 *) (m + 48) -	xorl	48(%esi),%eax -	#   in13 ^= *(uint32 *) (m + 52) -	xorl	52(%esi),%ecx -	#   *(uint32 *) (out + 48) = in12 -	movl	%eax,48(%edi) -	#   *(uint32 *) (out + 52) = in13 -	movl	%ecx,52(%edi) -	#   in14 = x14 -	movl	156(%esp),%eax -	#   in15 = x15 -	movl	160(%esp),%ecx -	#   in14 += j14 -	addl	220(%esp),%eax -	#   in15 += j15 -	addl	224(%esp),%ecx -	#   in14 ^= *(uint32 *) (m + 56) -	xorl	56(%esi),%eax -	#   in15 ^= *(uint32 *) (m + 60) -	xorl	60(%esi),%ecx -	#   *(uint32 *) (out + 56) = in14 -	movl	%eax,56(%edi) -	#   *(uint32 *) (out + 60) = in15 -	movl	%ecx,60(%edi) -	#   bytes = bytes_backup -	movl	76(%esp),%ebx -	#   in8 = j8 -	movl	196(%esp),%eax -	#   in9 = j9 -	movl	200(%esp),%ecx -	#   in8 += 1 -	add	$1,%eax -	#   in9 += 0 + carry -	adc	$0,%ecx -	#   j8 = in8 -	movl	%eax,196(%esp) -	#   j9 = in9 -	movl	%ecx,200(%esp) -	#   bytes - 64 -	cmp	$64,%ebx -	#   goto bytesatleast65 if unsigned> -	ja	._bytesatleast65 -	#     goto bytesatleast64 if unsigned>= -	jae	._bytesatleast64 -	#       m = out -	mov	%edi,%esi -	#       out = ctarget -	movl	228(%esp),%edi -	#       i = bytes -	mov	%ebx,%ecx -	#       while (i) { *out++ = *m++; --i } -	rep	movsb -._bytesatleast64: -	#     x = x_backup -	movl	64(%esp),%eax -	#     in8 = j8 -	movl	196(%esp),%ecx -	#     in9 = j9 -	movl	200(%esp),%edx -	#     *(uint32 *) (x + 32) = in8 -	movl	%ecx,32(%eax) -	#     *(uint32 *) (x + 36) = in9 -	movl	%edx,36(%eax) -._done: -	#     eax = eax_stack -	movl	80(%esp),%eax -	#     ebx = ebx_stack -	movl	84(%esp),%ebx -	#     esi = esi_stack -	movl	88(%esp),%esi -	#     edi = edi_stack -	movl	92(%esp),%edi -	#     ebp = ebp_stack -	movl	96(%esp),%ebp -	#     leave -	add	%eax,%esp -	ret -._bytesatleast65: -	#   bytes -= 64 -	sub	$64,%ebx -	#   out += 64 -	add	$64,%edi -	#   m += 64 -	add	$64,%esi -	# goto bytesatleast1 -	jmp	._bytesatleast1 -ENDPROC(salsa20_encrypt_bytes) diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S 
b/arch/x86/crypto/salsa20-x86_64-asm_64.S deleted file mode 100644 index 03a4918f41ee..000000000000 --- a/arch/x86/crypto/salsa20-x86_64-asm_64.S +++ /dev/null @@ -1,805 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include <linux/linkage.h> - -# enter salsa20_encrypt_bytes -ENTRY(salsa20_encrypt_bytes) -	mov	%rsp,%r11 -	and	$31,%r11 -	add	$256,%r11 -	sub	%r11,%rsp -	# x = arg1 -	mov	%rdi,%r8 -	# m = arg2 -	mov	%rsi,%rsi -	# out = arg3 -	mov	%rdx,%rdi -	# bytes = arg4 -	mov	%rcx,%rdx -	#               unsigned>? bytes - 0 -	cmp	$0,%rdx -	# comment:fp stack unchanged by jump -	# goto done if !unsigned> -	jbe	._done -	# comment:fp stack unchanged by fallthrough -# start: -._start: -	# r11_stack = r11 -	movq	%r11,0(%rsp) -	# r12_stack = r12 -	movq	%r12,8(%rsp) -	# r13_stack = r13 -	movq	%r13,16(%rsp) -	# r14_stack = r14 -	movq	%r14,24(%rsp) -	# r15_stack = r15 -	movq	%r15,32(%rsp) -	# rbx_stack = rbx -	movq	%rbx,40(%rsp) -	# rbp_stack = rbp -	movq	%rbp,48(%rsp) -	# in0 = *(uint64 *) (x + 0) -	movq	0(%r8),%rcx -	# in2 = *(uint64 *) (x + 8) -	movq	8(%r8),%r9 -	# in4 = *(uint64 *) (x + 16) -	movq	16(%r8),%rax -	# in6 = *(uint64 *) (x + 24) -	movq	24(%r8),%r10 -	# in8 = *(uint64 *) (x + 32) -	movq	32(%r8),%r11 -	# in10 = *(uint64 *) (x + 40) -	movq	40(%r8),%r12 -	# in12 = *(uint64 *) (x + 48) -	movq	48(%r8),%r13 -	# in14 = *(uint64 *) (x + 56) -	movq	56(%r8),%r14 -	# j0 = in0 -	movq	%rcx,56(%rsp) -	# j2 = in2 -	movq	%r9,64(%rsp) -	# j4 = in4 -	movq	%rax,72(%rsp) -	# j6 = in6 -	movq	%r10,80(%rsp) -	# j8 = in8 -	movq	%r11,88(%rsp) -	# j10 = in10 -	movq	%r12,96(%rsp) -	# j12 = in12 -	movq	%r13,104(%rsp) -	# j14 = in14 -	movq	%r14,112(%rsp) -	# x_backup = x -	movq	%r8,120(%rsp) -# bytesatleast1: -._bytesatleast1: -	#                   unsigned<? bytes - 64 -	cmp	$64,%rdx -	# comment:fp stack unchanged by jump -	#   goto nocopy if !unsigned< -	jae	._nocopy -	#     ctarget = out -	movq	%rdi,128(%rsp) -	#     out = &tmp -	leaq	192(%rsp),%rdi -	#     i = bytes -	mov	%rdx,%rcx -	#     while (i) { *out++ = *m++; --i } -	rep	movsb -	#     out = &tmp -	leaq	192(%rsp),%rdi -	#     m = &tmp -	leaq	192(%rsp),%rsi -	# comment:fp stack unchanged by fallthrough -#   nocopy: -._nocopy: -	#   out_backup = out -	movq	%rdi,136(%rsp) -	#   m_backup = m -	movq	%rsi,144(%rsp) -	#   bytes_backup = bytes -	movq	%rdx,152(%rsp) -	#   x1 = j0 -	movq	56(%rsp),%rdi -	#   x0 = x1 -	mov	%rdi,%rdx -	#   (uint64) x1 >>= 32 -	shr	$32,%rdi -	#   		x3 = j2 -	movq	64(%rsp),%rsi -	#   		x2 = x3 -	mov	%rsi,%rcx -	#   		(uint64) x3 >>= 32 -	shr	$32,%rsi -	#   x5 = j4 -	movq	72(%rsp),%r8 -	#   x4 = x5 -	mov	%r8,%r9 -	#   (uint64) x5 >>= 32 -	shr	$32,%r8 -	#   x5_stack = x5 -	movq	%r8,160(%rsp) -	#   		x7 = j6 -	movq	80(%rsp),%r8 -	#   		x6 = x7 -	mov	%r8,%rax -	#   		(uint64) x7 >>= 32 -	shr	$32,%r8 -	#   x9 = j8 -	movq	88(%rsp),%r10 -	#   x8 = x9 -	mov	%r10,%r11 -	#   (uint64) x9 >>= 32 -	shr	$32,%r10 -	#   		x11 = j10 -	movq	96(%rsp),%r12 -	#   		x10 = x11 -	mov	%r12,%r13 -	#   		x10_stack = x10 -	movq	%r13,168(%rsp) -	#   		(uint64) x11 >>= 32 -	shr	$32,%r12 -	#   x13 = j12 -	movq	104(%rsp),%r13 -	#   x12 = x13 -	mov	%r13,%r14 -	#   (uint64) x13 >>= 32 -	shr	$32,%r13 -	#   		x15 = j14 -	movq	112(%rsp),%r15 -	#   		x14 = x15 -	mov	%r15,%rbx -	#   		(uint64) x15 >>= 32 -	shr	$32,%r15 -	#   		x15_stack = x15 -	movq	%r15,176(%rsp) -	#   i = 20 -	mov	$20,%r15 -#   mainloop: -._mainloop: -	#   i_backup = i -	movq	%r15,184(%rsp) -	# 		x5 = x5_stack -	movq	160(%rsp),%r15 -	# a = x12 + x0 -	lea	(%r14,%rdx),%rbp -	# (uint32) a <<<= 7 -	rol	
$7,%ebp -	# x4 ^= a -	xor	%rbp,%r9 -	# 		b = x1 + x5 -	lea	(%rdi,%r15),%rbp -	# 		(uint32) b <<<= 7 -	rol	$7,%ebp -	# 		x9 ^= b -	xor	%rbp,%r10 -	# a = x0 + x4 -	lea	(%rdx,%r9),%rbp -	# (uint32) a <<<= 9 -	rol	$9,%ebp -	# x8 ^= a -	xor	%rbp,%r11 -	# 		b = x5 + x9 -	lea	(%r15,%r10),%rbp -	# 		(uint32) b <<<= 9 -	rol	$9,%ebp -	# 		x13 ^= b -	xor	%rbp,%r13 -	# a = x4 + x8 -	lea	(%r9,%r11),%rbp -	# (uint32) a <<<= 13 -	rol	$13,%ebp -	# x12 ^= a -	xor	%rbp,%r14 -	# 		b = x9 + x13 -	lea	(%r10,%r13),%rbp -	# 		(uint32) b <<<= 13 -	rol	$13,%ebp -	# 		x1 ^= b -	xor	%rbp,%rdi -	# a = x8 + x12 -	lea	(%r11,%r14),%rbp -	# (uint32) a <<<= 18 -	rol	$18,%ebp -	# x0 ^= a -	xor	%rbp,%rdx -	# 		b = x13 + x1 -	lea	(%r13,%rdi),%rbp -	# 		(uint32) b <<<= 18 -	rol	$18,%ebp -	# 		x5 ^= b -	xor	%rbp,%r15 -	# 				x10 = x10_stack -	movq	168(%rsp),%rbp -	# 		x5_stack = x5 -	movq	%r15,160(%rsp) -	# 				c = x6 + x10 -	lea	(%rax,%rbp),%r15 -	# 				(uint32) c <<<= 7 -	rol	$7,%r15d -	# 				x14 ^= c -	xor	%r15,%rbx -	# 				c = x10 + x14 -	lea	(%rbp,%rbx),%r15 -	# 				(uint32) c <<<= 9 -	rol	$9,%r15d -	# 				x2 ^= c -	xor	%r15,%rcx -	# 				c = x14 + x2 -	lea	(%rbx,%rcx),%r15 -	# 				(uint32) c <<<= 13 -	rol	$13,%r15d -	# 				x6 ^= c -	xor	%r15,%rax -	# 				c = x2 + x6 -	lea	(%rcx,%rax),%r15 -	# 				(uint32) c <<<= 18 -	rol	$18,%r15d -	# 				x10 ^= c -	xor	%r15,%rbp -	# 						x15 = x15_stack -	movq	176(%rsp),%r15 -	# 				x10_stack = x10 -	movq	%rbp,168(%rsp) -	# 						d = x11 + x15 -	lea	(%r12,%r15),%rbp -	# 						(uint32) d <<<= 7 -	rol	$7,%ebp -	# 						x3 ^= d -	xor	%rbp,%rsi -	# 						d = x15 + x3 -	lea	(%r15,%rsi),%rbp -	# 						(uint32) d <<<= 9 -	rol	$9,%ebp -	# 						x7 ^= d -	xor	%rbp,%r8 -	# 						d = x3 + x7 -	lea	(%rsi,%r8),%rbp -	# 						(uint32) d <<<= 13 -	rol	$13,%ebp -	# 						x11 ^= d -	xor	%rbp,%r12 -	# 						d = x7 + x11 -	lea	(%r8,%r12),%rbp -	# 						(uint32) d <<<= 18 -	rol	$18,%ebp -	# 						x15 ^= d -	xor	%rbp,%r15 -	# 						x15_stack = x15 -	movq	%r15,176(%rsp) -	# 		x5 = x5_stack -	movq	160(%rsp),%r15 -	# a = x3 + x0 -	lea	(%rsi,%rdx),%rbp -	# (uint32) a <<<= 7 -	rol	$7,%ebp -	# x1 ^= a -	xor	%rbp,%rdi -	# 		b = x4 + x5 -	lea	(%r9,%r15),%rbp -	# 		(uint32) b <<<= 7 -	rol	$7,%ebp -	# 		x6 ^= b -	xor	%rbp,%rax -	# a = x0 + x1 -	lea	(%rdx,%rdi),%rbp -	# (uint32) a <<<= 9 -	rol	$9,%ebp -	# x2 ^= a -	xor	%rbp,%rcx -	# 		b = x5 + x6 -	lea	(%r15,%rax),%rbp -	# 		(uint32) b <<<= 9 -	rol	$9,%ebp -	# 		x7 ^= b -	xor	%rbp,%r8 -	# a = x1 + x2 -	lea	(%rdi,%rcx),%rbp -	# (uint32) a <<<= 13 -	rol	$13,%ebp -	# x3 ^= a -	xor	%rbp,%rsi -	# 		b = x6 + x7 -	lea	(%rax,%r8),%rbp -	# 		(uint32) b <<<= 13 -	rol	$13,%ebp -	# 		x4 ^= b -	xor	%rbp,%r9 -	# a = x2 + x3 -	lea	(%rcx,%rsi),%rbp -	# (uint32) a <<<= 18 -	rol	$18,%ebp -	# x0 ^= a -	xor	%rbp,%rdx -	# 		b = x7 + x4 -	lea	(%r8,%r9),%rbp -	# 		(uint32) b <<<= 18 -	rol	$18,%ebp -	# 		x5 ^= b -	xor	%rbp,%r15 -	# 				x10 = x10_stack -	movq	168(%rsp),%rbp -	# 		x5_stack = x5 -	movq	%r15,160(%rsp) -	# 				c = x9 + x10 -	lea	(%r10,%rbp),%r15 -	# 				(uint32) c <<<= 7 -	rol	$7,%r15d -	# 				x11 ^= c -	xor	%r15,%r12 -	# 				c = x10 + x11 -	lea	(%rbp,%r12),%r15 -	# 				(uint32) c <<<= 9 -	rol	$9,%r15d -	# 				x8 ^= c -	xor	%r15,%r11 -	# 				c = x11 + x8 -	lea	(%r12,%r11),%r15 -	# 				(uint32) c <<<= 13 -	rol	$13,%r15d -	# 				x9 ^= c -	xor	%r15,%r10 -	# 				c = x8 + x9 -	lea	(%r11,%r10),%r15 -	# 				(uint32) c <<<= 18 -	rol	$18,%r15d -	# 				x10 ^= c -	xor	%r15,%rbp -	# 						x15 = x15_stack -	movq	176(%rsp),%r15 -	# 				x10_stack = x10 -	movq	%rbp,168(%rsp) -	# 						d = x14 + 
x15 -	lea	(%rbx,%r15),%rbp -	# 						(uint32) d <<<= 7 -	rol	$7,%ebp -	# 						x12 ^= d -	xor	%rbp,%r14 -	# 						d = x15 + x12 -	lea	(%r15,%r14),%rbp -	# 						(uint32) d <<<= 9 -	rol	$9,%ebp -	# 						x13 ^= d -	xor	%rbp,%r13 -	# 						d = x12 + x13 -	lea	(%r14,%r13),%rbp -	# 						(uint32) d <<<= 13 -	rol	$13,%ebp -	# 						x14 ^= d -	xor	%rbp,%rbx -	# 						d = x13 + x14 -	lea	(%r13,%rbx),%rbp -	# 						(uint32) d <<<= 18 -	rol	$18,%ebp -	# 						x15 ^= d -	xor	%rbp,%r15 -	# 						x15_stack = x15 -	movq	%r15,176(%rsp) -	# 		x5 = x5_stack -	movq	160(%rsp),%r15 -	# a = x12 + x0 -	lea	(%r14,%rdx),%rbp -	# (uint32) a <<<= 7 -	rol	$7,%ebp -	# x4 ^= a -	xor	%rbp,%r9 -	# 		b = x1 + x5 -	lea	(%rdi,%r15),%rbp -	# 		(uint32) b <<<= 7 -	rol	$7,%ebp -	# 		x9 ^= b -	xor	%rbp,%r10 -	# a = x0 + x4 -	lea	(%rdx,%r9),%rbp -	# (uint32) a <<<= 9 -	rol	$9,%ebp -	# x8 ^= a -	xor	%rbp,%r11 -	# 		b = x5 + x9 -	lea	(%r15,%r10),%rbp -	# 		(uint32) b <<<= 9 -	rol	$9,%ebp -	# 		x13 ^= b -	xor	%rbp,%r13 -	# a = x4 + x8 -	lea	(%r9,%r11),%rbp -	# (uint32) a <<<= 13 -	rol	$13,%ebp -	# x12 ^= a -	xor	%rbp,%r14 -	# 		b = x9 + x13 -	lea	(%r10,%r13),%rbp -	# 		(uint32) b <<<= 13 -	rol	$13,%ebp -	# 		x1 ^= b -	xor	%rbp,%rdi -	# a = x8 + x12 -	lea	(%r11,%r14),%rbp -	# (uint32) a <<<= 18 -	rol	$18,%ebp -	# x0 ^= a -	xor	%rbp,%rdx -	# 		b = x13 + x1 -	lea	(%r13,%rdi),%rbp -	# 		(uint32) b <<<= 18 -	rol	$18,%ebp -	# 		x5 ^= b -	xor	%rbp,%r15 -	# 				x10 = x10_stack -	movq	168(%rsp),%rbp -	# 		x5_stack = x5 -	movq	%r15,160(%rsp) -	# 				c = x6 + x10 -	lea	(%rax,%rbp),%r15 -	# 				(uint32) c <<<= 7 -	rol	$7,%r15d -	# 				x14 ^= c -	xor	%r15,%rbx -	# 				c = x10 + x14 -	lea	(%rbp,%rbx),%r15 -	# 				(uint32) c <<<= 9 -	rol	$9,%r15d -	# 				x2 ^= c -	xor	%r15,%rcx -	# 				c = x14 + x2 -	lea	(%rbx,%rcx),%r15 -	# 				(uint32) c <<<= 13 -	rol	$13,%r15d -	# 				x6 ^= c -	xor	%r15,%rax -	# 				c = x2 + x6 -	lea	(%rcx,%rax),%r15 -	# 				(uint32) c <<<= 18 -	rol	$18,%r15d -	# 				x10 ^= c -	xor	%r15,%rbp -	# 						x15 = x15_stack -	movq	176(%rsp),%r15 -	# 				x10_stack = x10 -	movq	%rbp,168(%rsp) -	# 						d = x11 + x15 -	lea	(%r12,%r15),%rbp -	# 						(uint32) d <<<= 7 -	rol	$7,%ebp -	# 						x3 ^= d -	xor	%rbp,%rsi -	# 						d = x15 + x3 -	lea	(%r15,%rsi),%rbp -	# 						(uint32) d <<<= 9 -	rol	$9,%ebp -	# 						x7 ^= d -	xor	%rbp,%r8 -	# 						d = x3 + x7 -	lea	(%rsi,%r8),%rbp -	# 						(uint32) d <<<= 13 -	rol	$13,%ebp -	# 						x11 ^= d -	xor	%rbp,%r12 -	# 						d = x7 + x11 -	lea	(%r8,%r12),%rbp -	# 						(uint32) d <<<= 18 -	rol	$18,%ebp -	# 						x15 ^= d -	xor	%rbp,%r15 -	# 						x15_stack = x15 -	movq	%r15,176(%rsp) -	# 		x5 = x5_stack -	movq	160(%rsp),%r15 -	# a = x3 + x0 -	lea	(%rsi,%rdx),%rbp -	# (uint32) a <<<= 7 -	rol	$7,%ebp -	# x1 ^= a -	xor	%rbp,%rdi -	# 		b = x4 + x5 -	lea	(%r9,%r15),%rbp -	# 		(uint32) b <<<= 7 -	rol	$7,%ebp -	# 		x6 ^= b -	xor	%rbp,%rax -	# a = x0 + x1 -	lea	(%rdx,%rdi),%rbp -	# (uint32) a <<<= 9 -	rol	$9,%ebp -	# x2 ^= a -	xor	%rbp,%rcx -	# 		b = x5 + x6 -	lea	(%r15,%rax),%rbp -	# 		(uint32) b <<<= 9 -	rol	$9,%ebp -	# 		x7 ^= b -	xor	%rbp,%r8 -	# a = x1 + x2 -	lea	(%rdi,%rcx),%rbp -	# (uint32) a <<<= 13 -	rol	$13,%ebp -	# x3 ^= a -	xor	%rbp,%rsi -	# 		b = x6 + x7 -	lea	(%rax,%r8),%rbp -	# 		(uint32) b <<<= 13 -	rol	$13,%ebp -	# 		x4 ^= b -	xor	%rbp,%r9 -	# a = x2 + x3 -	lea	(%rcx,%rsi),%rbp -	# (uint32) a <<<= 18 -	rol	$18,%ebp -	# x0 ^= a -	xor	%rbp,%rdx -	# 		b = x7 + x4 -	lea	(%r8,%r9),%rbp -	# 		(uint32) b <<<= 18 -	rol	$18,%ebp -	# 		x5 ^= b -	xor	%rbp,%r15 -	# 				x10 = x10_stack -	movq	
168(%rsp),%rbp -	# 		x5_stack = x5 -	movq	%r15,160(%rsp) -	# 				c = x9 + x10 -	lea	(%r10,%rbp),%r15 -	# 				(uint32) c <<<= 7 -	rol	$7,%r15d -	# 				x11 ^= c -	xor	%r15,%r12 -	# 				c = x10 + x11 -	lea	(%rbp,%r12),%r15 -	# 				(uint32) c <<<= 9 -	rol	$9,%r15d -	# 				x8 ^= c -	xor	%r15,%r11 -	# 				c = x11 + x8 -	lea	(%r12,%r11),%r15 -	# 				(uint32) c <<<= 13 -	rol	$13,%r15d -	# 				x9 ^= c -	xor	%r15,%r10 -	# 				c = x8 + x9 -	lea	(%r11,%r10),%r15 -	# 				(uint32) c <<<= 18 -	rol	$18,%r15d -	# 				x10 ^= c -	xor	%r15,%rbp -	# 						x15 = x15_stack -	movq	176(%rsp),%r15 -	# 				x10_stack = x10 -	movq	%rbp,168(%rsp) -	# 						d = x14 + x15 -	lea	(%rbx,%r15),%rbp -	# 						(uint32) d <<<= 7 -	rol	$7,%ebp -	# 						x12 ^= d -	xor	%rbp,%r14 -	# 						d = x15 + x12 -	lea	(%r15,%r14),%rbp -	# 						(uint32) d <<<= 9 -	rol	$9,%ebp -	# 						x13 ^= d -	xor	%rbp,%r13 -	# 						d = x12 + x13 -	lea	(%r14,%r13),%rbp -	# 						(uint32) d <<<= 13 -	rol	$13,%ebp -	# 						x14 ^= d -	xor	%rbp,%rbx -	# 						d = x13 + x14 -	lea	(%r13,%rbx),%rbp -	# 						(uint32) d <<<= 18 -	rol	$18,%ebp -	# 						x15 ^= d -	xor	%rbp,%r15 -	# 						x15_stack = x15 -	movq	%r15,176(%rsp) -	#   i = i_backup -	movq	184(%rsp),%r15 -	#                  unsigned>? i -= 4 -	sub	$4,%r15 -	# comment:fp stack unchanged by jump -	# goto mainloop if unsigned> -	ja	._mainloop -	#   (uint32) x2 += j2 -	addl	64(%rsp),%ecx -	#   x3 <<= 32 -	shl	$32,%rsi -	#   x3 += j2 -	addq	64(%rsp),%rsi -	#   (uint64) x3 >>= 32 -	shr	$32,%rsi -	#   x3 <<= 32 -	shl	$32,%rsi -	#   x2 += x3 -	add	%rsi,%rcx -	#   (uint32) x6 += j6 -	addl	80(%rsp),%eax -	#   x7 <<= 32 -	shl	$32,%r8 -	#   x7 += j6 -	addq	80(%rsp),%r8 -	#   (uint64) x7 >>= 32 -	shr	$32,%r8 -	#   x7 <<= 32 -	shl	$32,%r8 -	#   x6 += x7 -	add	%r8,%rax -	#   (uint32) x8 += j8 -	addl	88(%rsp),%r11d -	#   x9 <<= 32 -	shl	$32,%r10 -	#   x9 += j8 -	addq	88(%rsp),%r10 -	#   (uint64) x9 >>= 32 -	shr	$32,%r10 -	#   x9 <<= 32 -	shl	$32,%r10 -	#   x8 += x9 -	add	%r10,%r11 -	#   (uint32) x12 += j12 -	addl	104(%rsp),%r14d -	#   x13 <<= 32 -	shl	$32,%r13 -	#   x13 += j12 -	addq	104(%rsp),%r13 -	#   (uint64) x13 >>= 32 -	shr	$32,%r13 -	#   x13 <<= 32 -	shl	$32,%r13 -	#   x12 += x13 -	add	%r13,%r14 -	#   (uint32) x0 += j0 -	addl	56(%rsp),%edx -	#   x1 <<= 32 -	shl	$32,%rdi -	#   x1 += j0 -	addq	56(%rsp),%rdi -	#   (uint64) x1 >>= 32 -	shr	$32,%rdi -	#   x1 <<= 32 -	shl	$32,%rdi -	#   x0 += x1 -	add	%rdi,%rdx -	#   x5 = x5_stack -	movq	160(%rsp),%rdi -	#   (uint32) x4 += j4 -	addl	72(%rsp),%r9d -	#   x5 <<= 32 -	shl	$32,%rdi -	#   x5 += j4 -	addq	72(%rsp),%rdi -	#   (uint64) x5 >>= 32 -	shr	$32,%rdi -	#   x5 <<= 32 -	shl	$32,%rdi -	#   x4 += x5 -	add	%rdi,%r9 -	#   x10 = x10_stack -	movq	168(%rsp),%r8 -	#   (uint32) x10 += j10 -	addl	96(%rsp),%r8d -	#   x11 <<= 32 -	shl	$32,%r12 -	#   x11 += j10 -	addq	96(%rsp),%r12 -	#   (uint64) x11 >>= 32 -	shr	$32,%r12 -	#   x11 <<= 32 -	shl	$32,%r12 -	#   x10 += x11 -	add	%r12,%r8 -	#   x15 = x15_stack -	movq	176(%rsp),%rdi -	#   (uint32) x14 += j14 -	addl	112(%rsp),%ebx -	#   x15 <<= 32 -	shl	$32,%rdi -	#   x15 += j14 -	addq	112(%rsp),%rdi -	#   (uint64) x15 >>= 32 -	shr	$32,%rdi -	#   x15 <<= 32 -	shl	$32,%rdi -	#   x14 += x15 -	add	%rdi,%rbx -	#   out = out_backup -	movq	136(%rsp),%rdi -	#   m = m_backup -	movq	144(%rsp),%rsi -	#   x0 ^= *(uint64 *) (m + 0) -	xorq	0(%rsi),%rdx -	#   *(uint64 *) (out + 0) = x0 -	movq	%rdx,0(%rdi) -	#   x2 ^= *(uint64 *) (m + 8) -	xorq	8(%rsi),%rcx -	#   *(uint64 *) (out + 8) = x2 -	movq	%rcx,8(%rdi) -	#   x4 ^= *(uint64 *) (m + 16) -	
xorq	16(%rsi),%r9 -	#   *(uint64 *) (out + 16) = x4 -	movq	%r9,16(%rdi) -	#   x6 ^= *(uint64 *) (m + 24) -	xorq	24(%rsi),%rax -	#   *(uint64 *) (out + 24) = x6 -	movq	%rax,24(%rdi) -	#   x8 ^= *(uint64 *) (m + 32) -	xorq	32(%rsi),%r11 -	#   *(uint64 *) (out + 32) = x8 -	movq	%r11,32(%rdi) -	#   x10 ^= *(uint64 *) (m + 40) -	xorq	40(%rsi),%r8 -	#   *(uint64 *) (out + 40) = x10 -	movq	%r8,40(%rdi) -	#   x12 ^= *(uint64 *) (m + 48) -	xorq	48(%rsi),%r14 -	#   *(uint64 *) (out + 48) = x12 -	movq	%r14,48(%rdi) -	#   x14 ^= *(uint64 *) (m + 56) -	xorq	56(%rsi),%rbx -	#   *(uint64 *) (out + 56) = x14 -	movq	%rbx,56(%rdi) -	#   bytes = bytes_backup -	movq	152(%rsp),%rdx -	#   in8 = j8 -	movq	88(%rsp),%rcx -	#   in8 += 1 -	add	$1,%rcx -	#   j8 = in8 -	movq	%rcx,88(%rsp) -	#                          unsigned>? unsigned<? bytes - 64 -	cmp	$64,%rdx -	# comment:fp stack unchanged by jump -	#   goto bytesatleast65 if unsigned> -	ja	._bytesatleast65 -	# comment:fp stack unchanged by jump -	#     goto bytesatleast64 if !unsigned< -	jae	._bytesatleast64 -	#       m = out -	mov	%rdi,%rsi -	#       out = ctarget -	movq	128(%rsp),%rdi -	#       i = bytes -	mov	%rdx,%rcx -	#       while (i) { *out++ = *m++; --i } -	rep	movsb -	# comment:fp stack unchanged by fallthrough -#     bytesatleast64: -._bytesatleast64: -	#     x = x_backup -	movq	120(%rsp),%rdi -	#     in8 = j8 -	movq	88(%rsp),%rsi -	#     *(uint64 *) (x + 32) = in8 -	movq	%rsi,32(%rdi) -	#     r11 = r11_stack -	movq	0(%rsp),%r11 -	#     r12 = r12_stack -	movq	8(%rsp),%r12 -	#     r13 = r13_stack -	movq	16(%rsp),%r13 -	#     r14 = r14_stack -	movq	24(%rsp),%r14 -	#     r15 = r15_stack -	movq	32(%rsp),%r15 -	#     rbx = rbx_stack -	movq	40(%rsp),%rbx -	#     rbp = rbp_stack -	movq	48(%rsp),%rbp -	# comment:fp stack unchanged by fallthrough -#     done: -._done: -	#     leave -	add	%r11,%rsp -	mov	%rdi,%rax -	mov	%rsi,%rdx -	ret -#   bytesatleast65: -._bytesatleast65: -	#   bytes -= 64 -	sub	$64,%rdx -	#   out += 64 -	add	$64,%rdi -	#   m += 64 -	add	$64,%rsi -	# comment:fp stack unchanged by jump -	# goto bytesatleast1 -	jmp	._bytesatleast1 -ENDPROC(salsa20_encrypt_bytes) diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c deleted file mode 100644 index b07d7d959806..000000000000 --- a/arch/x86/crypto/salsa20_glue.c +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Glue code for optimized assembly version of  Salsa20. - * - * Copyright (c) 2007 Tan Swee Heng <[email protected]> - * - * The assembly codes are public domain assembly codes written by Daniel. J. - * Bernstein <[email protected]>. The codes are modified to include indentation - * and to remove extraneous comments and functions that are not needed. - * - i586 version, renamed as salsa20-i586-asm_32.S - *   available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s> - * - x86-64 version, renamed as salsa20-x86_64-asm_64.S - *   available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s> - * - * Also modified to set up the initial state using the generic C code rather - * than in assembly. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. 
- *
- */
-
-#include <asm/unaligned.h>
-#include <crypto/internal/skcipher.h>
-#include <crypto/salsa20.h>
-#include <linux/module.h>
-
-asmlinkage void salsa20_encrypt_bytes(u32 state[16], const u8 *src, u8 *dst,
-				      u32 bytes);
-
-static int salsa20_asm_crypt(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	const struct salsa20_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	u32 state[16];
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, true);
-
-	crypto_salsa20_init(state, ctx, walk.iv);
-
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
-
-		if (nbytes < walk.total)
-			nbytes = round_down(nbytes, walk.stride);
-
-		salsa20_encrypt_bytes(state, walk.src.virt.addr,
-				      walk.dst.virt.addr, nbytes);
-		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
-	}
-
-	return err;
-}
-
-static struct skcipher_alg alg = {
-	.base.cra_name		= "salsa20",
-	.base.cra_driver_name	= "salsa20-asm",
-	.base.cra_priority	= 200,
-	.base.cra_blocksize	= 1,
-	.base.cra_ctxsize	= sizeof(struct salsa20_ctx),
-	.base.cra_module	= THIS_MODULE,
-
-	.min_keysize		= SALSA20_MIN_KEY_SIZE,
-	.max_keysize		= SALSA20_MAX_KEY_SIZE,
-	.ivsize			= SALSA20_IV_SIZE,
-	.chunksize		= SALSA20_BLOCK_SIZE,
-	.setkey			= crypto_salsa20_setkey,
-	.encrypt		= salsa20_asm_crypt,
-	.decrypt		= salsa20_asm_crypt,
-};
-
-static int __init init(void)
-{
-	return crypto_register_skcipher(&alg);
-}
-
-static void __exit fini(void)
-{
-	crypto_unregister_skcipher(&alg);
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)");
-MODULE_ALIAS_CRYPTO("salsa20");
-MODULE_ALIAS_CRYPTO("salsa20-asm");
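
The qhasm-style comments in the deleted assembly above spell out each Salsa20 operation that the unrolled register code performs ("a = x12 + x0", "(uint32) a <<<= 7", "x4 ^= a", and so on), with the round counter starting at 20 and dropping by 4 per pass of ._mainloop. For readers who would rather not trace the register allocation, here is a minimal portable C sketch of the same column-round/row-round structure and of the feed-forward that the "(uint32) x0 += j0" block after the main loop implements; ROTL32, QR and salsa20_block are names chosen for this sketch, not identifiers from the kernel sources.

#include <stdint.h>
#include <string.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* One Salsa20 quarter-round, as spelled out in the qhasm comments above:
 * b ^= (a + d) <<< 7; c ^= (b + a) <<< 9; d ^= (c + b) <<< 13; a ^= (d + c) <<< 18
 */
#define QR(a, b, c, d)				\
	do {					\
		(b) ^= ROTL32((a) + (d), 7);	\
		(c) ^= ROTL32((b) + (a), 9);	\
		(d) ^= ROTL32((c) + (b), 13);	\
		(a) ^= ROTL32((d) + (c), 18);	\
	} while (0)

/* Produce one 64-byte keystream block from the 16-word state.  The deleted
 * assembly keeps this loop fully unrolled in registers (two double-rounds per
 * ._mainloop pass); this portable version only shows the data flow.
 */
static void salsa20_block(uint32_t out[16], const uint32_t in[16])
{
	uint32_t x[16];
	int i;

	memcpy(x, in, sizeof(x));

	for (i = 0; i < 20; i += 2) {		/* 20 rounds = 10 double-rounds */
		/* column round */
		QR(x[0],  x[4],  x[8],  x[12]);
		QR(x[5],  x[9],  x[13], x[1]);
		QR(x[10], x[14], x[2],  x[6]);
		QR(x[15], x[3],  x[7],  x[11]);
		/* row round */
		QR(x[0],  x[1],  x[2],  x[3]);
		QR(x[5],  x[6],  x[7],  x[4]);
		QR(x[10], x[11], x[8],  x[9]);
		QR(x[15], x[12], x[13], x[14]);
	}

	for (i = 0; i < 16; i++)		/* feed-forward: add the input state back in */
		out[i] = x[i] + in[i];
}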
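
Around that core, both deleted files run the same outer loop: produce a 64-byte keystream block, XOR it into the message, advance the 64-bit block counter held in state words 8 and 9 (add/adc on j8/j9 in the i586 version, a single 64-bit add on j8 in the x86-64 version), and bounce a short final block through a stack buffer with rep movsb. A sketch of that loop, reusing salsa20_block from the sketch above and assuming a little-endian host like the x86 code it mirrors (salsa20_crypt_bytes and tmp are illustrative names):

/* Outer loop of salsa20_encrypt_bytes(), sketched in C.  Assumes a
 * little-endian host, so keystream words can be XORed in byte-by-byte
 * through a char pointer.  Not kernel code.
 */
static void salsa20_crypt_bytes(uint32_t state[16], const uint8_t *src,
				uint8_t *dst, uint32_t bytes)
{
	uint32_t block[16];
	uint8_t tmp[64];
	unsigned int i;

	while (bytes > 0) {
		unsigned int n = bytes < 64 ? bytes : 64;
		const uint8_t *in = src;
		uint8_t *out = dst;

		if (n < 64) {			/* short final block: bounce buffer */
			memset(tmp, 0, sizeof(tmp));
			memcpy(tmp, src, n);
			in = tmp;
			out = tmp;
		}

		salsa20_block(block, state);	/* 64 bytes of keystream */
		for (i = 0; i < 64; i++)
			out[i] = in[i] ^ ((const uint8_t *)block)[i];

		if (n < 64)			/* copy back only the bytes requested */
			memcpy(dst, tmp, n);

		/* 64-bit little-endian block counter lives in words 8 and 9 */
		if (++state[8] == 0)
			state[9]++;

		src += 64;
		dst += 64;
		bytes -= n;
	}
}

Note that the assembly writes the updated counter back into the caller's state before returning ("*(uint64 *) (x + 32) = in8" in the 64-bit version), which is what lets salsa20_asm_crypt in the glue code above call salsa20_encrypt_bytes once per skcipher_walk step and still emit one continuous keystream.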
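
Finally, since the glue file registers an ordinary skcipher ("salsa20" / "salsa20-asm"), a hypothetical in-kernel caller would drive it through the generic symmetric-cipher API rather than calling salsa20_encrypt_bytes directly. A sketch under that assumption; salsa20_encrypt_buf is an illustrative helper, not something in the tree, the buffer must live in linearly-mapped (non-stack) memory, and error handling is kept minimal:

#include <crypto/skcipher.h>
#include <linux/scatterlist.h>
#include <linux/err.h>
#include <linux/string.h>

static int salsa20_encrypt_buf(u8 *buf, unsigned int len,
			       const u8 *key, unsigned int keylen,
			       const u8 iv[8])
{
	struct crypto_skcipher *tfm;
	struct skcipher_request *req;
	struct scatterlist sg;
	u8 ivbuf[8];
	int err;

	/* CRYPTO_ALG_ASYNC in the mask requests a synchronous implementation */
	tfm = crypto_alloc_skcipher("salsa20", 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_skcipher_setkey(tfm, key, keylen);
	if (err)
		goto out_free_tfm;

	req = skcipher_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	sg_init_one(&sg, buf, len);
	memcpy(ivbuf, iv, sizeof(ivbuf));	/* the walk code may touch the IV */
	skcipher_request_set_callback(req, 0, NULL, NULL);
	skcipher_request_set_crypt(req, &sg, &sg, len, ivbuf);

	err = crypto_skcipher_encrypt(req);	/* in place: dst == src */

	skcipher_request_free(req);
out_free_tfm:
	crypto_free_skcipher(tfm);
	return err;
}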