/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// AES-NI optimized AES-GCM for x86_64
//
// Copyright 2024 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>
//
//------------------------------------------------------------------------------
//
// This file is dual-licensed, meaning that you can use it under your choice of
// either of the following two licenses:
//
// Licensed under the Apache License 2.0 (the "License").  You may obtain a copy
// of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// or
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//------------------------------------------------------------------------------
//
// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
// support the original set of AES instructions, i.e. AES-NI.  Two
// implementations are provided, one that uses AVX and one that doesn't.  They
// are very similar, being generated by the same macros.  The only difference is
// that the AVX implementation takes advantage of VEX-coded instructions in some
// places to avoid some 'movdqu' and 'movdqa' instructions.  The AVX
// implementation does *not* use 256-bit vectors, as AES is not supported on
// 256-bit vectors until the VAES feature (which this file doesn't target).
//
// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
// for the *_aesni functions or AVX for the *_aesni_avx ones.  (But it seems
// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
//
// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is
// more thoroughly commented.  This file has the following notable changes:
//
//    - The vector length is fixed at 128-bit, i.e. xmm registers.  This means
//      there is only one AES block (and GHASH block) per register.
//
//    - Without AVX512 / AVX10, only 16 SIMD registers are available instead of
//      32.  We work around this by being much more careful about using
//      registers, relying heavily on loads to load values as they are needed.
//
//    - Masking is not available either.  We work around this by implementing
//      partial block loads and stores using overlapping scalar loads and stores
//      combined with shifts and SSE4.1 insertion and extraction instructions.
//
//    - The main loop is organized differently due to the different design
//      constraints.  First, with just one AES block per SIMD register, on some
//      CPUs 4 registers don't saturate the 'aesenc' throughput.  We therefore
//      do an 8-register wide loop.  Considering that and the fact that we have
//      just 16 SIMD registers to work with, it's not feasible to cache AES
//      round keys and GHASH key powers in registers across loop iterations.
//      That's not ideal, but also not actually that bad, since loads can run in
//      parallel with other instructions.  Significantly, this also makes it
//      possible to roll up the inner loops, relying on hardware loop unrolling
//      instead of software loop unrolling, greatly reducing code size.
//
//    - We implement the GHASH multiplications in the main loop using Karatsuba
//      multiplication instead of schoolbook multiplication.  This saves one
//      pclmulqdq instruction per block, at the cost of one 64-bit load, one
//      pshufd, and 0.25 pxors per block.  (This is without the three-argument
//      XOR support that would be provided by AVX512 / AVX10, which would be
//      more beneficial to schoolbook than Karatsuba.)
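//
//      Concretely, with each 128-bit operand split into 64-bit halves,
//      schoolbook multiplication computes
//
//          a*b = (a_H*b_H)*x^128 + (a_L*b_H + a_H*b_L)*x^64 + (a_L*b_L)
//
//      using 4 pclmulqdq per block, whereas Karatsuba rewrites the middle term
//      in terms of the two outer products (addition being XOR here):
//
//          a_L*b_H + a_H*b_L = (a_L + a_H)*(b_L + b_H) + a_L*b_L + a_H*b_H
//
//      which needs only 3 pclmulqdq per block, since a_L*b_L and a_H*b_H are
//      computed anyway.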
//
//      As a rough approximation, we can assume that Karatsuba multiplication is
//      faster than schoolbook multiplication in this context if one pshufd and
//      0.25 pxors are cheaper than a pclmulqdq.  (We assume that the 64-bit
//      load is "free" due to running in parallel with arithmetic instructions.)
//      This is true on AMD CPUs, including all that support pclmulqdq up to at
//      least Zen 3.  It's also true on older Intel CPUs: Westmere through
//      Haswell on the Core side, and Silvermont through Goldmont Plus on the
//      low-power side.  On some of these CPUs, pclmulqdq is quite slow, and the
//      benefit of Karatsuba should be substantial.  On newer Intel CPUs,
//      schoolbook multiplication should be faster, but only marginally.
//
//      Not all these CPUs were available to be tested.  However, benchmarks on
//      available CPUs suggest that this approximation is plausible.  Switching
//      to Karatsuba showed negligible change (< 1%) on Intel Broadwell,
//      Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
//      Considering that and the fact that Karatsuba should be even more
//      beneficial on older Intel CPUs, it seems like the right choice here.
//
//      An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
//      saved by using a multiplication-less reduction method.  We don't do that
//      because it would require a large number of shift and xor instructions,
//      making it less worthwhile and likely harmful on newer CPUs.
//
//      It does make sense to sometimes use a different reduction optimization
//      that saves a pclmulqdq, though: precompute the hash key times x^64, and
//      multiply the low half of the data block by the hash key with the extra
//      factor of x^64.  This eliminates one step of the reduction.  However,
//      this is incompatible with Karatsuba multiplication.  Therefore, for
//      multi-block processing we use Karatsuba multiplication with a regular
//      reduction.  For single-block processing, we use the x^64 optimization.

#include <linux/linkage.h>

.section .rodata
.p2align 4
.Lbswap_mask:
	.octa   0x000102030405060708090a0b0c0d0e0f
.Lgfpoly:
	.quad	0xc200000000000000
.Lone:
	.quad	1
.Lgfpoly_and_internal_carrybit:
	.octa	0xc2000000000000010000000000000001
	// Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
	// 'len' 0xff bytes and the rest zeroes.
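	// For example, len == 3 yields a mask whose 3 lowest-addressed bytes
	// are 0xff and whose remaining 13 bytes are zero.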
.Lzeropad_mask:
	.octa	0xffffffffffffffffffffffffffffffff
	.octa	0

// Offsets in struct aes_gcm_key_aesni
#define OFFSETOF_AESKEYLEN	480
#define OFFSETOF_H_POWERS	496
#define OFFSETOF_H_POWERS_XORED	624
#define OFFSETOF_H_TIMES_X64	688
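
// Layout implied by the above offsets: the hash key powers H^8 through H^1 are
// stored 16 bytes apart starting at OFFSETOF_H_POWERS (in that, descending,
// order), their XOR'd-together 64-bit halves are stored 8 bytes apart starting
// at OFFSETOF_H_POWERS_XORED, and H^1 * x^64 follows at OFFSETOF_H_TIMES_X64
// (496 + 8*16 == 624, and 624 + 8*8 == 688).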

.text

// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq.  The fallback
// assumes that all operands are distinct and that any mem operand is aligned.
.macro	_vpclmulqdq	imm, src1, src2, dst
.if USE_AVX
	vpclmulqdq	\imm, \src1, \src2, \dst
.else
	movdqa		\src2, \dst
	pclmulqdq	\imm, \src1, \dst
.endif
.endm

// Do a vpshufb, or fall back to a movdqa and a pshufb.  The fallback assumes
// that all operands are distinct and that any mem operand is aligned.
.macro	_vpshufb	src1, src2, dst
.if USE_AVX
	vpshufb		\src1, \src2, \dst
.else
	movdqa		\src2, \dst
	pshufb		\src1, \dst
.endif
.endm

// Do a vpand, or fall back to a movdqu and a pand.  The fallback assumes that
// all operands are distinct.
.macro	_vpand		src1, src2, dst
.if USE_AVX
	vpand		\src1, \src2, \dst
.else
	movdqu		\src1, \dst
	pand		\src2, \dst
.endif
.endm

// XOR the unaligned memory operand \mem into the xmm register \reg.  \tmp must
// be a temporary xmm register.
.macro	_xor_mem_to_reg	mem, reg, tmp
.if USE_AVX
	vpxor		\mem, \reg, \reg
.else
	movdqu		\mem, \tmp
	pxor		\tmp, \reg
.endif
.endm

// Test the unaligned memory operand \mem against the xmm register \reg.  \tmp
// must be a temporary xmm register.
.macro	_test_mem	mem, reg, tmp
.if USE_AVX
	vptest		\mem, \reg
.else
	movdqu		\mem, \tmp
	ptest		\tmp, \reg
.endif
.endm

// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
// and zeroize any remaining bytes.  Clobbers %rax, %rcx, and \tmp{64,32}.
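//
// For example, with LEN == 13 the 9-15 byte path below loads bytes 0-7 with the
// movq, loads bytes 5-12 with the 8-byte load at '\src + LEN - 8', shifts that
// value right by 24 bits to discard the 3 bytes that overlap the first load,
// and inserts the remaining bytes 8-12 (zero-extended) into the high half of
// \dst.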
.macro	_load_partial_block	src, dst, tmp64, tmp32
	sub		$8, %ecx		// LEN - 8
	jle		.Lle8\@

	// Load 9 <= LEN <= 15 bytes.
	movq		(\src), \dst		// Load first 8 bytes
	mov		(\src, %rcx), %rax	// Load last 8 bytes
	neg		%ecx
	shl		$3, %ecx
	shr		%cl, %rax		// Discard overlapping bytes
	pinsrq		$1, %rax, \dst
	jmp		.Ldone\@

.Lle8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Load 4 <= LEN <= 8 bytes.
	mov		(\src), %eax		// Load first 4 bytes
	mov		(\src, %rcx), \tmp32	// Load last 4 bytes
	jmp		.Lcombine\@

.Llt4\@:
	// Load 1 <= LEN <= 3 bytes.
	add		$2, %ecx		// LEN - 2
	movzbl		(\src), %eax		// Load first byte
	jl		.Lmovq\@
	movzwl		(\src, %rcx), \tmp32	// Load last 2 bytes
.Lcombine\@:
	shl		$3, %ecx
	shl		%cl, \tmp64
	or		\tmp64, %rax		// Combine the two parts
.Lmovq\@:
	movq		%rax, \dst
.Ldone\@:
.endm

// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
// Clobbers %rax, %rcx, and %rsi.
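//
// For example, with LEN == 13 the 8-15 byte path below extracts bytes 8-15 of
// \src into %rax, rotates %rax right by 40 bits so that bytes 8-12 land in its
// top 5 bytes, stores those 8 bytes at '\dst + 5' (which temporarily also
// writes bytes 13-15 of \src to \dst[5-7]), and then overwrites \dst[0-7] with
// the correct first 8 bytes via the movq.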
.macro	_store_partial_block	src, dst
	sub		$8, %ecx		// LEN - 8
	jl		.Llt8\@

	// Store 8 <= LEN <= 15 bytes.
	pextrq		$1, \src, %rax
	mov		%ecx, %esi
	shl		$3, %ecx
	ror		%cl, %rax
	mov		%rax, (\dst, %rsi)	// Store last LEN - 8 bytes
	movq		\src, (\dst)		// Store first 8 bytes
	jmp		.Ldone\@

.Llt8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Store 4 <= LEN <= 7 bytes.
	pextrd		$1, \src, %eax
	mov		%ecx, %esi
	shl		$3, %ecx
	ror		%cl, %eax
	mov		%eax, (\dst, %rsi)	// Store last LEN - 4 bytes
	movd		\src, (\dst)		// Store first 4 bytes
	jmp		.Ldone\@

.Llt4\@:
	// Store 1 <= LEN <= 3 bytes.
	pextrb		$0, \src, 0(\dst)
	cmp		$-2, %ecx		// LEN - 4 == -2, i.e. LEN == 2?
	jl		.Ldone\@
	pextrb		$1, \src, 1(\dst)
	je		.Ldone\@
	pextrb		$2, \src, 2(\dst)
.Ldone\@:
.endm

// Do one step of GHASH-multiplying \a by \b and storing the reduced product in
// \b.  To complete all steps, this must be invoked with \i=0 through \i=9.
// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the
// .Lgfpoly constant, and \t0-\t1 must be temporary registers.
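//
// Only one reduction step is needed here (versus the two folds in
// _ghash_reduce) because of the trick described in the file comment: the b_L
// half is multiplied by the precomputed \a_times_x64 rather than by \a itself,
// so its products land directly in the MI and HI positions instead of in a
// separate LO term that would first have to be folded into MI.  See
// aes-gcm-avx10-x86_64.S for a fuller explanation of the reduction itself.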
.macro	_ghash_mul_step	i, a, a_times_x64, b, gfpoly, t0, t1

	// MI = (a_L * b_H) + ((a*x^64)_L * b_L)
.if \i == 0
	_vpclmulqdq	$0x01, \a, \b, \t0
.elseif \i == 1
	_vpclmulqdq	$0x00, \a_times_x64, \b, \t1
.elseif \i == 2
	pxor		\t1, \t0

	// HI = (a_H * b_H) + ((a*x^64)_H * b_L)
.elseif \i == 3
	_vpclmulqdq	$0x11, \a, \b, \t1
.elseif \i == 4
	pclmulqdq	$0x10, \a_times_x64, \b
.elseif \i == 5
	pxor		\t1, \b
.elseif \i == 6

	// Fold MI into HI.
	pshufd		$0x4e, \t0, \t1		// Swap halves of MI
.elseif \i == 7
	pclmulqdq	$0x00, \gfpoly, \t0	// MI_L*(x^63 + x^62 + x^57)
.elseif \i == 8
	pxor		\t1, \b
.elseif \i == 9
	pxor		\t0, \b
.endif
.endm

// GHASH-multiply \a by \b and store the reduced product in \b.
// See _ghash_mul_step for details.
.macro	_ghash_mul	a, a_times_x64, b, gfpoly, t0, t1
.irp i, 0,1,2,3,4,5,6,7,8,9
	_ghash_mul_step	\i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
.endr
.endm

// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
// This does Karatsuba multiplication and must be paired with _ghash_reduce.  On
// the first call, \lo, \mi, and \hi must be zero.  \a_xored must contain the
// two halves of \a XOR'd together, i.e. a_L + a_H.  \b is clobbered.
.macro	_ghash_mul_noreduce	a, a_xored, b, lo, mi, hi, t0

	// LO += a_L * b_L
	_vpclmulqdq	$0x00, \a, \b, \t0
	pxor		\t0, \lo

	// b_L + b_H
	pshufd		$0x4e, \b, \t0
	pxor		\b, \t0

	// HI += a_H * b_H
	pclmulqdq	$0x11, \a, \b
	pxor		\b, \hi

	// MI += (a_L + a_H) * (b_L + b_H)
	pclmulqdq	$0x00, \a_xored, \t0
	pxor		\t0, \mi
.endm

// Reduce the product from \lo, \mi, and \hi, and store the result in \dst.
// This assumes that _ghash_mul_noreduce was used.
.macro	_ghash_reduce	lo, mi, hi, dst, t0

	movq		.Lgfpoly(%rip), \t0

	// MI += LO + HI (needed because we used Karatsuba multiplication)
	pxor		\lo, \mi
	pxor		\hi, \mi

	// Fold LO into MI.
	pshufd		$0x4e, \lo, \dst
	pclmulqdq	$0x00, \t0, \lo
	pxor		\dst, \mi
	pxor		\lo, \mi

	// Fold MI into HI.
	pshufd		$0x4e, \mi, \dst
	pclmulqdq	$0x00, \t0, \mi
	pxor		\hi, \dst
	pxor		\mi, \dst
.endm

// Do the first step of the GHASH update of a set of 8 ciphertext blocks.
//
// The whole GHASH update does:
//
//	GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 +
//				blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
//
// This macro just does the first step: it does the unreduced multiplication
// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm
// registers LO, MI, and GHASH_ACC a.k.a. HI.  It also zero-initializes the
// inner block counter in %rax, which is a value that counts up by 8 for each
// block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
//
// To reduce the number of pclmulqdq instructions required, both this macro and
// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook
// multiplication.  See the file comment for more details about this choice.
//
// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
// encrypting, or SRC if decrypting.  They also expect the precomputed hash key
// powers H^i and their XOR'd-together halves to be available in the struct
// pointed to by KEY.  Both macros clobber TMP[0-2].
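//
// For example, when %rax == 8 the continuation macro uses the key power H^7 at
// OFFSETOF_H_POWERS + 16, its XOR'd-together halves at
// OFFSETOF_H_POWERS_XORED + 8, and ciphertext block 1 at offset 16 from DST (or
// SRC), via the addressing modes (KEY,%rax,2), (KEY,%rax), and (DST,%rax,2).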
.macro	_ghash_update_begin_8x	enc

	// Initialize the inner block counter.
	xor		%eax, %eax

	// Load the highest hash key power, H^8.
	movdqa		OFFSETOF_H_POWERS(KEY), TMP0

	// Load the first ciphertext block and byte-reflect it.
.if \enc
	movdqu		(DST), TMP1
.else
	movdqu		(SRC), TMP1
.endif
	pshufb		BSWAP_MASK, TMP1

	// Add the GHASH accumulator to the ciphertext block to get the block
	// 'b' that needs to be multiplied with the hash key power 'a'.
	pxor		TMP1, GHASH_ACC

	// b_L + b_H
	pshufd		$0x4e, GHASH_ACC, MI
	pxor		GHASH_ACC, MI

	// LO = a_L * b_L
	_vpclmulqdq	$0x00, TMP0, GHASH_ACC, LO

	// HI = a_H * b_H
	pclmulqdq	$0x11, TMP0, GHASH_ACC

	// MI = (a_L + a_H) * (b_L + b_H)
	pclmulqdq	$0x00, OFFSETOF_H_POWERS_XORED(KEY), MI
.endm

// Continue the GHASH update of 8 ciphertext blocks as described above by doing
// an unreduced multiplication of the next ciphertext block by the next lowest
// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI.
.macro	_ghash_update_continue_8x enc
	add		$8, %eax

	// Load the next lowest key power.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), TMP0

	// Load the next ciphertext block and byte-reflect it.
.if \enc
	movdqu		(DST,%rax,2), TMP1
.else
	movdqu		(SRC,%rax,2), TMP1
.endif
	pshufb		BSWAP_MASK, TMP1

	// LO += a_L * b_L
	_vpclmulqdq	$0x00, TMP0, TMP1, TMP2
	pxor		TMP2, LO

	// b_L + b_H
	pshufd		$0x4e, TMP1, TMP2
	pxor		TMP1, TMP2

	// HI += a_H * b_H
	pclmulqdq	$0x11, TMP0, TMP1
	pxor		TMP1, GHASH_ACC

	// MI += (a_L + a_H) * (b_L + b_H)
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1
	pclmulqdq	$0x00, TMP1, TMP2
	pxor		TMP2, MI
.endm

// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC.  This is similar to
// _ghash_reduce, but it's hardcoded to use the registers of the main loop and
// it uses the same register for HI and the destination.  It's also divided into
// two steps.  TMP1 must be preserved across steps.
//
// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of
// shuffling LO, XOR'ing LO into MI, and shuffling MI.  However, this would
// increase the critical path length, and it seems to slightly hurt performance.
.macro	_ghash_update_end_8x_step	i
.if \i == 0
	movq		.Lgfpoly(%rip), TMP1
	pxor		LO, MI
	pxor		GHASH_ACC, MI
	pshufd		$0x4e, LO, TMP2
	pclmulqdq	$0x00, TMP1, LO
	pxor		TMP2, MI
	pxor		LO, MI
.elseif \i == 1
	pshufd		$0x4e, MI, TMP2
	pclmulqdq	$0x00, TMP1, MI
	pxor		TMP2, GHASH_ACC
	pxor		MI, GHASH_ACC
.endif
.endm

// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key);
//
// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH
// related fields in the key struct.
.macro	_aes_gcm_precompute

	// Function arguments
	.set	KEY,		%rdi

	// Additional local variables.
	// %xmm0-%xmm1 and %rax are used as temporaries.
	.set	RNDKEYLAST_PTR,	%rsi
	.set	H_CUR,		%xmm2
	.set	H_POW1,		%xmm3	// H^1
	.set	H_POW1_X64,	%xmm4	// H^1 * x^64
	.set	GFPOLY,		%xmm5

	// Encrypt an all-zeroes block to get the raw hash subkey.
	movl		OFFSETOF_AESKEYLEN(KEY), %eax
	lea		6*16(KEY,%rax,4), RNDKEYLAST_PTR
	movdqa		(KEY), H_POW1  // Zero-th round key XOR all-zeroes block
	lea		16(KEY), %rax
1:
	aesenc		(%rax), H_POW1
	add		$16, %rax
	cmp		%rax, RNDKEYLAST_PTR
	jne		1b
	aesenclast	(RNDKEYLAST_PTR), H_POW1

	// Preprocess the raw hash subkey as needed to operate on GHASH's
	// bit-reflected values directly: reflect its bytes, then multiply it by
	// x^-1 (using the backwards interpretation of polynomial coefficients
	// from the GCM spec) or equivalently x^1 (using the alternative,
	// natural interpretation of polynomial coefficients).
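	//
	// Mechanically, the multiplication by x works as follows: the paddq
	// shifts each 64-bit half left by one bit, while the pshufd/psrad pair
	// broadcasts bit 63 and bit 127 of the original value into dword-wide
	// masks.  The pand then turns those masks into the two fix-ups that the
	// paddq cannot do by itself: carrying bit 63 into bit 64, and, if bit
	// 127 was set, folding the overflow back in by XOR'ing with the
	// reduction polynomial constant.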
	pshufb		.Lbswap_mask(%rip), H_POW1
	movdqa		H_POW1, %xmm0
	pshufd		$0xd3, %xmm0, %xmm0
	psrad		$31, %xmm0
	paddq		H_POW1, H_POW1
	pand		.Lgfpoly_and_internal_carrybit(%rip), %xmm0
	pxor		%xmm0, H_POW1

	// Store H^1.
	movdqa		H_POW1, OFFSETOF_H_POWERS+7*16(KEY)

	// Compute and store H^1 * x^64.
	movq		.Lgfpoly(%rip), GFPOLY
	pshufd		$0x4e, H_POW1, %xmm0
	_vpclmulqdq	$0x00, H_POW1, GFPOLY, H_POW1_X64
	pxor		%xmm0, H_POW1_X64
	movdqa		H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY)

	// Compute and store the halves of H^1 XOR'd together.
	pxor		H_POW1, %xmm0
	movq		%xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)

	// Compute and store the remaining key powers H^2 through H^8.
	movdqa		H_POW1, H_CUR
	mov		$6*8, %eax
.Lprecompute_next\@:
	// Compute H^i = H^{i-1} * H^1.
	_ghash_mul	H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1
	// Store H^i.
	movdqa		H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2)
	// Compute and store the halves of H^i XOR'd together.
	pshufd		$0x4e, H_CUR, %xmm0
	pxor		H_CUR, %xmm0
	movq		%xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax)
	sub		$8, %eax
	jge		.Lprecompute_next\@

	RET
.endm

// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
//				 u8 ghash_acc[16], const u8 *aad, int aadlen);
//
// This function processes the AAD (Additional Authenticated Data) in GCM.
// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
// data given by |aad| and |aadlen|.  On the first call, |ghash_acc| must be all
// zeroes.  |aadlen| must be a multiple of 16, except on the last call where it
// can be any length.  The caller must do any buffering needed to ensure this.
.macro	_aes_gcm_aad_update

	// Function arguments
	.set	KEY,		%rdi
	.set	GHASH_ACC_PTR,	%rsi
	.set	AAD,		%rdx
	.set	AADLEN,		%ecx
	// Note: _load_partial_block relies on AADLEN being in %ecx.

	// Additional local variables.
	// %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
	.set	BSWAP_MASK,	%xmm2
	.set	GHASH_ACC,	%xmm3
	.set	H_POW1,		%xmm4	// H^1
	.set	H_POW1_X64,	%xmm5	// H^1 * x^64
	.set	GFPOLY,		%xmm6

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movdqu		(GHASH_ACC_PTR), GHASH_ACC
	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
	movq		.Lgfpoly(%rip), GFPOLY

	// Process the AAD one full block at a time.
	sub		$16, AADLEN
	jl		.Laad_loop_1x_done\@
.Laad_loop_1x\@:
	movdqu		(AAD), %xmm0
	pshufb		BSWAP_MASK, %xmm0
	pxor		%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
	add		$16, AAD
	sub		$16, AADLEN
	jge		.Laad_loop_1x\@
.Laad_loop_1x_done\@:
	// Check whether there is a partial block at the end.
	add		$16, AADLEN
	jz		.Laad_done\@

	// Process a partial block of length 1 <= AADLEN <= 15.
	// _load_partial_block assumes that %ecx contains AADLEN.
	_load_partial_block	AAD, %xmm0, %r10, %r10d
	pshufb		BSWAP_MASK, %xmm0
	pxor		%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1

.Laad_done\@:
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
	RET
.endm

// Increment LE_CTR eight times to generate eight little-endian counter blocks,
// swap each to big-endian, and store them in AESDATA[0-7].  Also XOR them with
// the zero-th AES round key.  Clobbers TMP0 and TMP1.
.macro	_ctr_begin_8x
	movq		.Lone(%rip), TMP0
	movdqa		(KEY), TMP1		// zero-th round key
.irp i, 0,1,2,3,4,5,6,7
	_vpshufb	BSWAP_MASK, LE_CTR, AESDATA\i
	pxor		TMP1, AESDATA\i
	paddd		TMP0, LE_CTR
.endr
.endm

// Do a non-last round of AES on AESDATA[0-7] using \round_key.
.macro	_aesenc_8x	round_key
.irp i, 0,1,2,3,4,5,6,7
	aesenc		\round_key, AESDATA\i
.endr
.endm

// Do the last round of AES on AESDATA[0-7] using \round_key.
.macro	_aesenclast_8x	round_key
.irp i, 0,1,2,3,4,5,6,7
	aesenclast	\round_key, AESDATA\i
.endr
.endm

// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
// store the result to DST.  Clobbers TMP0.
.macro	_xor_data_8x
.irp i, 0,1,2,3,4,5,6,7
	_xor_mem_to_reg	\i*16(SRC), AESDATA\i, tmp=TMP0
.endr
.irp i, 0,1,2,3,4,5,6,7
	movdqu		AESDATA\i, \i*16(DST)
.endr
.endm

// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key,
//					  const u32 le_ctr[4], u8 ghash_acc[16],
//					  const u8 *src, u8 *dst, int datalen);
//
// This macro generates a GCM encryption or decryption update function with the
// above prototype (with \enc selecting which one).
//
// This function computes the next portion of the CTR keystream, XOR's it with
// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
// data to |dst|.  It also updates the GHASH accumulator |ghash_acc| using the
// next |datalen| ciphertext bytes.
//
// |datalen| must be a multiple of 16, except on the last call where it can be
// any length.  The caller must do any buffering needed to ensure this.  Both
// in-place and out-of-place en/decryption are supported.
//
// |le_ctr| must give the current counter in little-endian format.  For a new
// message, the low word of the counter must be 2.  This function loads the
// counter from |le_ctr| and increments the loaded counter as needed, but it
// does *not* store the updated counter back to |le_ctr|.  The caller must
// update |le_ctr| if any more data segments follow.  Internally, only the low
// 32-bit word of the counter is incremented, following the GCM standard.
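//
// (The low word starts at 2 because counter value 1 is reserved for encrypting
// the authentication tag; see _aes_gcm_final, which sets the low word to 1.)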
.macro	_aes_gcm_update	enc

	// Function arguments
	.set	KEY,		%rdi
	.set	LE_CTR_PTR,	%rsi	// Note: overlaps with usage as temp reg
	.set	GHASH_ACC_PTR,	%rdx
	.set	SRC,		%rcx
	.set	DST,		%r8
	.set	DATALEN,	%r9d
	.set	DATALEN64,	%r9	// Zero-extend DATALEN before using!
	// Note: the code setting up for _load_partial_block assumes that SRC is
	// in %rcx (and that DATALEN is *not* in %rcx).

	// Additional local variables

	// %rax and %rsi are used as temporary registers.  Note: %rsi overlaps
	// with LE_CTR_PTR, which is used only at the beginning.

	.set	AESKEYLEN,	%r10d	// AES key length in bytes
	.set	AESKEYLEN64,	%r10
	.set	RNDKEYLAST_PTR,	%r11	// Pointer to last AES round key

	// Put the most frequently used values in %xmm0-%xmm7 to reduce code
	// size.  (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
	.set	TMP0,		%xmm0
	.set	TMP1,		%xmm1
	.set	TMP2,		%xmm2
	.set	LO,		%xmm3	// Low part of unreduced product
	.set	MI,		%xmm4	// Middle part of unreduced product
	.set	GHASH_ACC,	%xmm5	// GHASH accumulator; in main loop also
					// the high part of unreduced product
	.set	BSWAP_MASK,	%xmm6	// Shuffle mask for reflecting bytes
	.set	LE_CTR,		%xmm7	// Little-endian counter value
	.set	AESDATA0,	%xmm8
	.set	AESDATA1,	%xmm9
	.set	AESDATA2,	%xmm10
	.set	AESDATA3,	%xmm11
	.set	AESDATA4,	%xmm12
	.set	AESDATA5,	%xmm13
	.set	AESDATA6,	%xmm14
	.set	AESDATA7,	%xmm15

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movdqu		(GHASH_ACC_PTR), GHASH_ACC
	movdqu		(LE_CTR_PTR), LE_CTR

	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
	lea		6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR

	// If there are at least 8*16 bytes of data, then continue into the main
	// loop, which processes 8*16 bytes of data per iteration.
	//
	// The main loop interleaves AES and GHASH to improve performance on
	// CPUs that can execute these instructions in parallel.  When
	// decrypting, the GHASH input (the ciphertext) is immediately
	// available.  When encrypting, we instead encrypt a set of 8 blocks
	// first and then GHASH those blocks while encrypting the next set of 8,
	// repeat that as needed, and finally GHASH the last set of 8 blocks.
	//
	// Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
	// as this makes the immediate fit in a signed byte, saving 3 bytes.
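	// For example, 'add $-8*16, DATALEN' below encodes its immediate as a
	// sign-extended 8-bit value, whereas 'sub $8*16, DATALEN' would need a
	// 32-bit immediate, since +128 is just outside the signed-byte range.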
	add		$-8*16, DATALEN
	jl		.Lcrypt_loop_8x_done\@
.if \enc
	// Encrypt the first 8 plaintext blocks.
	_ctr_begin_8x
	lea		16(KEY), %rsi
	.p2align 4
1:
	movdqa		(%rsi), TMP0
	_aesenc_8x	TMP0
	add		$16, %rsi
	cmp		%rsi, RNDKEYLAST_PTR
	jne		1b
	movdqa		(%rsi), TMP0
	_aesenclast_8x	TMP0
	_xor_data_8x
	// Don't increment DST until the ciphertext blocks have been hashed.
	sub		$-8*16, SRC
	add		$-8*16, DATALEN
	jl		.Lghash_last_ciphertext_8x\@
.endif

	.p2align 4
.Lcrypt_loop_8x\@:

	// Generate the next set of 8 counter blocks and start encrypting them.
	_ctr_begin_8x
	lea		16(KEY), %rsi

	// Do a round of AES, and start the GHASH update of 8 ciphertext blocks
	// by doing the unreduced multiplication for the first ciphertext block.
	movdqa		(%rsi), TMP0
	add		$16, %rsi
	_aesenc_8x	TMP0
	_ghash_update_begin_8x \enc

	// Do 7 more rounds of AES, and continue the GHASH update by doing the
	// unreduced multiplication for the remaining ciphertext blocks.
	.p2align 4
1:
	movdqa		(%rsi), TMP0
	add		$16, %rsi
	_aesenc_8x	TMP0
	_ghash_update_continue_8x \enc
	cmp		$7*8, %eax
	jne		1b

	// Do the remaining AES rounds.
	.p2align 4
1:
	movdqa		(%rsi), TMP0
	add		$16, %rsi
	_aesenc_8x	TMP0
	cmp		%rsi, RNDKEYLAST_PTR
	jne		1b

	// Do the GHASH reduction and the last round of AES.
	movdqa		(RNDKEYLAST_PTR), TMP0
	_ghash_update_end_8x_step	0
	_aesenclast_8x	TMP0
	_ghash_update_end_8x_step	1

	// XOR the data with the AES-CTR keystream blocks.
.if \enc
	sub		$-8*16, DST
.endif
	_xor_data_8x
	sub		$-8*16, SRC
.if !\enc
	sub		$-8*16, DST
.endif
	add		$-8*16, DATALEN
	jge		.Lcrypt_loop_8x\@

.if \enc
.Lghash_last_ciphertext_8x\@:
	// Update GHASH with the last set of 8 ciphertext blocks.
	_ghash_update_begin_8x		\enc
	.p2align 4
1:
	_ghash_update_continue_8x	\enc
	cmp		$7*8, %eax
	jne		1b
	_ghash_update_end_8x_step	0
	_ghash_update_end_8x_step	1
	sub		$-8*16, DST
.endif

.Lcrypt_loop_8x_done\@:

	sub		$-8*16, DATALEN
	jz		.Ldone\@

	// Handle the remainder of length 1 <= DATALEN < 8*16 bytes.  We keep
	// things simple and keep the code size down by just going one block at
	// a time, again taking advantage of hardware loop unrolling.  Since
	// there are enough key powers available for all remaining data, we do
	// the GHASH multiplications unreduced, and only reduce at the very end.

	.set	HI,		TMP2
	.set	H_POW,		AESDATA0
	.set	H_POW_XORED,	AESDATA1
	.set	ONE,		AESDATA2

	movq		.Lone(%rip), ONE

	// Start collecting the unreduced GHASH intermediate value LO, MI, HI.
	pxor		LO, LO
	pxor		MI, MI
	pxor		HI, HI

	// Set up a block counter %rax to contain 8*(8-n), where n is the number
	// of blocks that remain, counting any partial block.  This will be used
	// to access the key powers H^n through H^1.
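	// For example, DATALEN == 53 means n == 4 blocks remain (3 full plus 1
	// partial); the computation below yields %rax == 32, so the first
	// multiplication uses H^4 at OFFSETOF_H_POWERS + 64.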
	mov		DATALEN, %eax
	neg		%eax
	and		$~15, %eax
	sar		$1, %eax
	add		$64, %eax

	sub		$16, DATALEN
	jl		.Lcrypt_loop_1x_done\@

	// Process the data one full block at a time.
.Lcrypt_loop_1x\@:

	// Encrypt the next counter block.
	_vpshufb	BSWAP_MASK, LE_CTR, TMP0
	paddd		ONE, LE_CTR
	pxor		(KEY), TMP0
	lea		-6*16(RNDKEYLAST_PTR), %rsi	// Reduce code size
	cmp		$24, AESKEYLEN
	jl		128f	// AES-128?
	je		192f	// AES-192?
	// AES-256
	aesenc		-7*16(%rsi), TMP0
	aesenc		-6*16(%rsi), TMP0
192:
	aesenc		-5*16(%rsi), TMP0
	aesenc		-4*16(%rsi), TMP0
128:
.irp i, -3,-2,-1,0,1,2,3,4,5
	aesenc		\i*16(%rsi), TMP0
.endr
	aesenclast	(RNDKEYLAST_PTR), TMP0

	// Load the next key power H^i.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

	// XOR the keystream block that was just generated in TMP0 with the next
	// source data block and store the resulting en/decrypted data to DST.
.if \enc
	_xor_mem_to_reg	(SRC), TMP0, tmp=TMP1
	movdqu		TMP0, (DST)
.else
	movdqu		(SRC), TMP1
	pxor		TMP1, TMP0
	movdqu		TMP0, (DST)
.endif

	// Update GHASH with the ciphertext block.
.if \enc
	pshufb		BSWAP_MASK, TMP0
	pxor		TMP0, GHASH_ACC
.else
	pshufb		BSWAP_MASK, TMP1
	pxor		TMP1, GHASH_ACC
.endif
	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
	pxor		GHASH_ACC, GHASH_ACC

	add		$8, %eax
	add		$16, SRC
	add		$16, DST
	sub		$16, DATALEN
	jge		.Lcrypt_loop_1x\@
.Lcrypt_loop_1x_done\@:
	// Check whether there is a partial block at the end.
	add		$16, DATALEN
	jz		.Lghash_reduce\@

	// Process a partial block of length 1 <= DATALEN <= 15.

	// Encrypt a counter block for the last time.
	pshufb		BSWAP_MASK, LE_CTR
	pxor		(KEY), LE_CTR
	lea		16(KEY), %rsi
1:
	aesenc		(%rsi), LE_CTR
	add		$16, %rsi
	cmp		%rsi, RNDKEYLAST_PTR
	jne		1b
	aesenclast	(RNDKEYLAST_PTR), LE_CTR

	// Load the lowest key power, H^1.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

	// Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC.  SRC is
	// in %rcx, but _load_partial_block needs DATALEN in %rcx instead.
	// RNDKEYLAST_PTR is no longer needed, so reuse it for SRC.
	mov		SRC, RNDKEYLAST_PTR
	mov		DATALEN, %ecx
	_load_partial_block	RNDKEYLAST_PTR, TMP0, %rsi, %esi

	// XOR the keystream block that was just generated in LE_CTR with the
	// source data block and store the resulting en/decrypted data to DST.
	pxor		TMP0, LE_CTR
	mov		DATALEN, %ecx
	_store_partial_block	LE_CTR, DST

	// If encrypting, zero-pad the final ciphertext block for GHASH.  (If
	// decrypting, this was already done by _load_partial_block.)
.if \enc
	lea		.Lzeropad_mask+16(%rip), %rax
	sub		DATALEN64, %rax
	_vpand		(%rax), LE_CTR, TMP0
.endif

	// Update GHASH with the final ciphertext block.
	pshufb		BSWAP_MASK, TMP0
	pxor		TMP0, GHASH_ACC
	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0

.Lghash_reduce\@:
	// Finally, do the GHASH reduction.
	_ghash_reduce	LO, MI, HI, GHASH_ACC, TMP0

.Ldone\@:
	// Store the updated GHASH accumulator back to memory.
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)

	RET
.endm

// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key,
//				   const u32 le_ctr[4], u8 ghash_acc[16],
//				   u64 total_aadlen, u64 total_datalen);
// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key,
//				   const u32 le_ctr[4], const u8 ghash_acc[16],
//				   u64 total_aadlen, u64 total_datalen,
//				   const u8 tag[16], int taglen);
//
// This macro generates one of the above two functions (with \enc selecting
// which one).  Both functions finish computing the GCM authentication tag by
// updating GHASH with the lengths block and encrypting the GHASH accumulator.
// |total_aadlen| and |total_datalen| must be the total length of the additional
// authenticated data and the en/decrypted data in bytes, respectively.
//
// The encryption function then stores the full-length (16-byte) computed
// authentication tag to |ghash_acc|.  The decryption function instead loads the
// expected authentication tag (the one that was transmitted) from the 16-byte
// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
// computed tag in constant time, and returns true if and only if they match.
.macro	_aes_gcm_final	enc

	// Function arguments
	.set	KEY,		%rdi
	.set	LE_CTR_PTR,	%rsi
	.set	GHASH_ACC_PTR,	%rdx
	.set	TOTAL_AADLEN,	%rcx
	.set	TOTAL_DATALEN,	%r8
	.set	TAG,		%r9
	.set	TAGLEN,		%r10d	// Originally at 8(%rsp)
	.set	TAGLEN64,	%r10
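	// TAGLEN is the 7th argument; under the x86_64 SysV calling convention
	// it is passed on the stack, so it is read from 8(%rsp) (just above the
	// return address) into %r10d where needed below.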

	// Additional local variables.
	// %rax and %xmm0-%xmm2 are used as temporary registers.
	.set	AESKEYLEN,	%r11d
	.set	AESKEYLEN64,	%r11
	.set	BSWAP_MASK,	%xmm3
	.set	GHASH_ACC,	%xmm4
	.set	H_POW1,		%xmm5	// H^1
	.set	H_POW1_X64,	%xmm6	// H^1 * x^64
	.set	GFPOLY,		%xmm7

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN

	// Set up a counter block with 1 in the low 32-bit word.  This is the
	// counter that produces the ciphertext needed to encrypt the auth tag.
	movdqu		(LE_CTR_PTR), %xmm0
	mov		$1, %eax
	pinsrd		$0, %eax, %xmm0

	// Build the lengths block and XOR it into the GHASH accumulator.
	movq		TOTAL_DATALEN, GHASH_ACC
	pinsrq		$1, TOTAL_AADLEN, GHASH_ACC
	psllq		$3, GHASH_ACC	// Bytes to bits
	_xor_mem_to_reg	(GHASH_ACC_PTR), GHASH_ACC, %xmm1

	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
	movq		.Lgfpoly(%rip), GFPOLY

	// Make %rax point to the 6th from last AES round key.  (Using signed
	// byte offsets -7*16 through 6*16 decreases code size.)
	lea		(KEY,AESKEYLEN64,4), %rax

	// AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
	// Interleave the AES and GHASH instructions to improve performance.
	pshufb		BSWAP_MASK, %xmm0
	pxor		(KEY), %xmm0
	cmp		$24, AESKEYLEN
	jl		128f	// AES-128?
	je		192f	// AES-192?
	// AES-256
	aesenc		-7*16(%rax), %xmm0
	aesenc		-6*16(%rax), %xmm0
192:
	aesenc		-5*16(%rax), %xmm0
	aesenc		-4*16(%rax), %xmm0
128:
.irp i, 0,1,2,3,4,5,6,7,8
	aesenc		(\i-3)*16(%rax), %xmm0
	_ghash_mul_step	\i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
.endr
	aesenclast	6*16(%rax), %xmm0
	_ghash_mul_step	9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2

	// Undo the byte reflection of the GHASH accumulator.
	pshufb		BSWAP_MASK, GHASH_ACC

	// Encrypt the GHASH accumulator.
	pxor		%xmm0, GHASH_ACC

.if \enc
	// Return the computed auth tag.
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
.else
	.set		ZEROPAD_MASK_PTR, TOTAL_AADLEN // Reusing TOTAL_AADLEN!

	// Verify the auth tag in constant time by XOR'ing the transmitted and
	// computed auth tags together and using the ptest instruction to check
	// whether the first TAGLEN bytes of the result are zero.
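	// (ptest sets ZF if and only if the AND of its two operands is all
	// zeroes; the mask loaded from the zeropad table selects exactly the
	// first TAGLEN bytes, and the sete then converts ZF into the boolean
	// return value in %eax.)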
	_xor_mem_to_reg	(TAG), GHASH_ACC, tmp=%xmm0
	movl		8(%rsp), TAGLEN
	lea		.Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR
	sub		TAGLEN64, ZEROPAD_MASK_PTR
	xor		%eax, %eax
	_test_mem	(ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0
	sete		%al
.endif
	RET
.endm

.set	USE_AVX, 0
SYM_FUNC_START(aes_gcm_precompute_aesni)
	_aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni)
SYM_FUNC_START(aes_gcm_aad_update_aesni)
	_aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni)
SYM_FUNC_START(aes_gcm_enc_update_aesni)
	_aes_gcm_update	1
SYM_FUNC_END(aes_gcm_enc_update_aesni)
SYM_FUNC_START(aes_gcm_dec_update_aesni)
	_aes_gcm_update	0
SYM_FUNC_END(aes_gcm_dec_update_aesni)
SYM_FUNC_START(aes_gcm_enc_final_aesni)
	_aes_gcm_final	1
SYM_FUNC_END(aes_gcm_enc_final_aesni)
SYM_FUNC_START(aes_gcm_dec_final_aesni)
	_aes_gcm_final	0
SYM_FUNC_END(aes_gcm_dec_final_aesni)

.set	USE_AVX, 1
SYM_FUNC_START(aes_gcm_precompute_aesni_avx)
	_aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni_avx)
SYM_FUNC_START(aes_gcm_aad_update_aesni_avx)
	_aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_update_aesni_avx)
	_aes_gcm_update	1
SYM_FUNC_END(aes_gcm_enc_update_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_update_aesni_avx)
	_aes_gcm_update	0
SYM_FUNC_END(aes_gcm_dec_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_final_aesni_avx)
	_aes_gcm_final	1
SYM_FUNC_END(aes_gcm_enc_final_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_final_aesni_avx)
	_aes_gcm_final	0
SYM_FUNC_END(aes_gcm_dec_final_aesni_avx)