diff options
Diffstat (limited to 'arch/x86/lib')
-rw-r--r-- | arch/x86/lib/cache-smp.c | 1 | ||||
-rw-r--r-- | arch/x86/lib/csum-partial_64.c | 105 | ||||
-rw-r--r-- | arch/x86/lib/delay.c | 2 | ||||
-rw-r--r-- | arch/x86/lib/misc.c | 2 |
4 files changed, 38 insertions, 72 deletions
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c index 7c48ff4ae8d1..7af743bd3b13 100644 --- a/arch/x86/lib/cache-smp.c +++ b/arch/x86/lib/cache-smp.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include <asm/paravirt.h> #include <linux/smp.h> #include <linux/export.h> diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c index cea25ca8b8cf..c9dae65ac01b 100644 --- a/arch/x86/lib/csum-partial_64.c +++ b/arch/x86/lib/csum-partial_64.c @@ -11,26 +11,23 @@ #include <asm/checksum.h> #include <asm/word-at-a-time.h> -static inline unsigned short from32to16(unsigned a) +static inline __wsum csum_finalize_sum(u64 temp64) { - unsigned short b = a >> 16; - asm("addw %w2,%w0\n\t" - "adcw $0,%w0\n" - : "=r" (b) - : "0" (b), "r" (a)); - return b; + return (__force __wsum)((temp64 + ror64(temp64, 32)) >> 32); } -static inline __wsum csum_tail(u64 temp64, int odd) +static inline unsigned long update_csum_40b(unsigned long sum, const unsigned long m[5]) { - unsigned int result; - - result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff); - if (unlikely(odd)) { - result = from32to16(result); - result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); - } - return (__force __wsum)result; + asm("addq %1,%0\n\t" + "adcq %2,%0\n\t" + "adcq %3,%0\n\t" + "adcq %4,%0\n\t" + "adcq %5,%0\n\t" + "adcq $0,%0" + :"+r" (sum) + :"m" (m[0]), "m" (m[1]), "m" (m[2]), + "m" (m[3]), "m" (m[4])); + return sum; } /* @@ -47,64 +44,32 @@ static inline __wsum csum_tail(u64 temp64, int odd) __wsum csum_partial(const void *buff, int len, __wsum sum) { u64 temp64 = (__force u64)sum; - unsigned odd; - odd = 1 & (unsigned long) buff; - if (unlikely(odd)) { - if (unlikely(len == 0)) - return sum; - temp64 = ror32((__force u32)sum, 8); - temp64 += (*(unsigned char *)buff << 8); - len--; - buff++; + /* Do two 40-byte chunks in parallel to get better ILP */ + if (likely(len >= 80)) { + u64 temp64_2 = 0; + do { + temp64 = update_csum_40b(temp64, buff); + temp64_2 = update_csum_40b(temp64_2, buff + 40); + buff += 80; + len -= 80; + } while (len >= 80); + + asm("addq %1,%0\n\t" + "adcq $0,%0" + :"+r" (temp64): "r" (temp64_2)); } /* - * len == 40 is the hot case due to IPv6 headers, but annotating it likely() - * has noticeable negative affect on codegen for all other cases with - * minimal performance benefit here. + * len == 40 is the hot case due to IPv6 headers, so return + * early for that exact case without checking the tail bytes. */ - if (len == 40) { - asm("addq 0*8(%[src]),%[res]\n\t" - "adcq 1*8(%[src]),%[res]\n\t" - "adcq 2*8(%[src]),%[res]\n\t" - "adcq 3*8(%[src]),%[res]\n\t" - "adcq 4*8(%[src]),%[res]\n\t" - "adcq $0,%[res]" - : [res] "+r"(temp64) - : [src] "r"(buff), "m"(*(const char(*)[40])buff)); - return csum_tail(temp64, odd); - } - if (unlikely(len >= 64)) { - /* - * Extra accumulators for better ILP in the loop. - */ - u64 tmp_accum, tmp_carries; - - asm("xorl %k[tmp_accum],%k[tmp_accum]\n\t" - "xorl %k[tmp_carries],%k[tmp_carries]\n\t" - "subl $64, %[len]\n\t" - "1:\n\t" - "addq 0*8(%[src]),%[res]\n\t" - "adcq 1*8(%[src]),%[res]\n\t" - "adcq 2*8(%[src]),%[res]\n\t" - "adcq 3*8(%[src]),%[res]\n\t" - "adcl $0,%k[tmp_carries]\n\t" - "addq 4*8(%[src]),%[tmp_accum]\n\t" - "adcq 5*8(%[src]),%[tmp_accum]\n\t" - "adcq 6*8(%[src]),%[tmp_accum]\n\t" - "adcq 7*8(%[src]),%[tmp_accum]\n\t" - "adcl $0,%k[tmp_carries]\n\t" - "addq $64, %[src]\n\t" - "subl $64, %[len]\n\t" - "jge 1b\n\t" - "addq %[tmp_accum],%[res]\n\t" - "adcq %[tmp_carries],%[res]\n\t" - "adcq $0,%[res]" - : [tmp_accum] "=&r"(tmp_accum), - [tmp_carries] "=&r"(tmp_carries), [res] "+r"(temp64), - [len] "+r"(len), [src] "+r"(buff) - : "m"(*(const char *)buff)); + if (len >= 40) { + temp64 = update_csum_40b(temp64, buff); + len -= 40; + if (!len) + return csum_finalize_sum(temp64); + buff += 40; } if (len & 32) { @@ -143,7 +108,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum) : [res] "+r"(temp64) : [trail] "r"(trail)); } - return csum_tail(temp64, odd); + return csum_finalize_sum(temp64); } EXPORT_SYMBOL(csum_partial); diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 0e65d00e2339..23f81ca3f06b 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -128,7 +128,7 @@ static void delay_halt_mwaitx(u64 unused, u64 cycles) delay = min_t(u64, MWAITX_MAX_WAIT_CYCLES, cycles); /* - * Use cpu_tss_rw as a cacheline-aligned, seldomly accessed per-cpu + * Use cpu_tss_rw as a cacheline-aligned, seldom accessed per-cpu * variable as the monitor target. */ __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); diff --git a/arch/x86/lib/misc.c b/arch/x86/lib/misc.c index 92cd8ecc3a2c..40b81c338ae5 100644 --- a/arch/x86/lib/misc.c +++ b/arch/x86/lib/misc.c @@ -8,7 +8,7 @@ */ int num_digits(int val) { - int m = 10; + long long m = 10; int d = 1; if (val < 0) { |