Diffstat (limited to 'arch/x86/lib/csum-partial_64.c')
-rw-r--r--   arch/x86/lib/csum-partial_64.c | 183
1 file changed, 91 insertions, 92 deletions
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index e7925d668b68..1f8a8f895173 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -9,6 +9,7 @@
 #include <linux/compiler.h>
 #include <linux/export.h>
 #include <asm/checksum.h>
+#include <asm/word-at-a-time.h>
 
 static inline unsigned short from32to16(unsigned a)
 {
@@ -21,120 +22,119 @@ static inline unsigned short from32to16(unsigned a)
 }
 
 /*
- * Do a 64-bit checksum on an arbitrary memory area.
+ * Do a checksum on an arbitrary memory area.
  * Returns a 32bit checksum.
  *
  * This isn't as time critical as it used to be because many NICs
  * do hardware checksumming these days.
- * 
- * Things tried and found to not make it faster:
- * Manual Prefetching
- * Unrolling to an 128 bytes inner loop.
- * Using interleaving with more registers to break the carry chains.
+ *
+ * Still, with CHECKSUM_COMPLETE this is called to compute
+ * checksums on IPv6 headers (40 bytes) and other small parts.
+ * it's best to have buff aligned on a 64-bit boundary
  */
-static unsigned do_csum(const unsigned char *buff, unsigned len)
+__wsum csum_partial(const void *buff, int len, __wsum sum)
 {
-	unsigned odd, count;
-	unsigned long result = 0;
+	u64 temp64 = (__force u64)sum;
+	unsigned odd, result;
 
-	if (unlikely(len == 0))
-		return result;
 	odd = 1 & (unsigned long) buff;
 	if (unlikely(odd)) {
-		result = *buff << 8;
+		if (unlikely(len == 0))
+			return sum;
+		temp64 = ror32((__force u32)sum, 8);
+		temp64 += (*(unsigned char *)buff << 8);
 		len--;
 		buff++;
 	}
-	count = len >> 1;		/* nr of 16-bit words.. */
-	if (count) {
-		if (2 & (unsigned long) buff) {
-			result += *(unsigned short *)buff;
-			count--;
-			len -= 2;
-			buff += 2;
-		}
-		count >>= 1;		/* nr of 32-bit words.. */
-		if (count) {
-			unsigned long zero;
-			unsigned count64;
-			if (4 & (unsigned long) buff) {
-				result += *(unsigned int *) buff;
-				count--;
-				len -= 4;
-				buff += 4;
-			}
-			count >>= 1;	/* nr of 64-bit words.. */
-			/* main loop using 64byte blocks */
-			zero = 0;
-			count64 = count >> 3;
-			while (count64) {
-				asm("addq 0*8(%[src]),%[res]\n\t"
-				    "adcq 1*8(%[src]),%[res]\n\t"
-				    "adcq 2*8(%[src]),%[res]\n\t"
-				    "adcq 3*8(%[src]),%[res]\n\t"
-				    "adcq 4*8(%[src]),%[res]\n\t"
-				    "adcq 5*8(%[src]),%[res]\n\t"
-				    "adcq 6*8(%[src]),%[res]\n\t"
-				    "adcq 7*8(%[src]),%[res]\n\t"
-				    "adcq %[zero],%[res]"
-				    : [res] "=r" (result)
-				    : [src] "r" (buff), [zero] "r" (zero),
-				    "[res]" (result));
-				buff += 64;
-				count64--;
-			}
+	while (unlikely(len >= 64)) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq 2*8(%[src]),%[res]\n\t"
+		    "adcq 3*8(%[src]),%[res]\n\t"
+		    "adcq 4*8(%[src]),%[res]\n\t"
+		    "adcq 5*8(%[src]),%[res]\n\t"
+		    "adcq 6*8(%[src]),%[res]\n\t"
+		    "adcq 7*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+		    : [res] "+r" (temp64)
+		    : [src] "r" (buff)
+		    : "memory");
+		buff += 64;
+		len -= 64;
+	}
+
+	if (len & 32) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq 2*8(%[src]),%[res]\n\t"
+		    "adcq 3*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+			: [res] "+r" (temp64)
+			: [src] "r" (buff)
+			: "memory");
+		buff += 32;
+	}
+	if (len & 16) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+			: [res] "+r" (temp64)
+			: [src] "r" (buff)
+			: "memory");
+		buff += 16;
+	}
+	if (len & 8) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+			: [res] "+r" (temp64)
+			: [src] "r" (buff)
+			: "memory");
+		buff += 8;
+	}
+	if (len & 7) {
+#ifdef CONFIG_DCACHE_WORD_ACCESS
+		unsigned int shift = (8 - (len & 7)) * 8;
+		unsigned long trail;
 
-			/* last up to 7 8byte blocks */
-			count %= 8;
-			while (count) {
-				asm("addq %1,%0\n\t"
-				    "adcq %2,%0\n"
-					    : "=r" (result)
-				    : "m" (*(unsigned long *)buff),
-				    "r" (zero),  "0" (result));
-				--count;
-				buff += 8;
-			}
-			result = add32_with_carry(result>>32,
-						  result&0xffffffff);
+		trail = (load_unaligned_zeropad(buff) << shift) >> shift;
 
-			if (len & 4) {
-				result += *(unsigned int *) buff;
-				buff += 4;
-			}
+		asm("addq %[trail],%[res]\n\t"
+		    "adcq $0,%[res]"
+			: [res] "+r" (temp64)
+			: [trail] "r" (trail));
+#else
+		if (len & 4) {
+			asm("addq %[val],%[res]\n\t"
+			    "adcq $0,%[res]"
+				: [res] "+r" (temp64)
+				: [val] "r" ((u64)*(u32 *)buff)
+				: "memory");
+			buff += 4;
 		}
 		if (len & 2) {
-			result += *(unsigned short *) buff;
+			asm("addq %[val],%[res]\n\t"
+			    "adcq $0,%[res]"
+				: [res] "+r" (temp64)
+				: [val] "r" ((u64)*(u16 *)buff)
+				: "memory");
 			buff += 2;
 		}
+		if (len & 1) {
+			asm("addq %[val],%[res]\n\t"
+			    "adcq $0,%[res]"
+				: [res] "+r" (temp64)
+				: [val] "r" ((u64)*(u8 *)buff)
+				: "memory");
+		}
+#endif
 	}
-	if (len & 1)
-		result += *buff;
-	result = add32_with_carry(result>>32, result & 0xffffffff);
-	if (unlikely(odd)) {
+	result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
+	if (unlikely(odd)) {
 		result = from32to16(result);
 		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
 	}
-	return result;
-}
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 64-bit boundary
- */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
-{
-	return (__force __wsum)add32_with_carry(do_csum(buff, len),
-						(__force u32)sum);
+	return (__force __wsum)result;
 }
 EXPORT_SYMBOL(csum_partial);
 
@@ -147,4 +147,3 @@ __sum16 ip_compute_csum(const void *buff, int len)
 	return csum_fold(csum_partial(buff,len,0));
 }
 EXPORT_SYMBOL(ip_compute_csum);
-
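
For readers who want to check the arithmetic outside the kernel, below is a minimal
user-space C sketch of the RFC 1071 one's-complement checksum that this routine
computes. It is an illustration only: the function name rfc1071_csum is made up for
the example and the loop adds one 16-bit word at a time, whereas the patched kernel
code accumulates eight bytes per adcq and folds the 64-bit total with
add32_with_carry() and csum_fold().

/*
 * Illustrative sketch only, not a kernel interface.  Computes the RFC 1071
 * Internet checksum one 16-bit word at a time and folds the carries at the end.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint16_t rfc1071_csum(const uint8_t *p, size_t len)
{
	uint64_t sum = 0;

	/* Add 16-bit big-endian words; pad a trailing odd byte with zero. */
	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)p[0] << 8;

	/* End-around carry: fold until the sum fits in 16 bits. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);

	return (uint16_t)~sum;	/* one's complement of the folded sum */
}

int main(void)
{
	/* RFC 1071 sample data: folded sum 0xddf2, checksum 0x220d. */
	static const uint8_t buf[] = {
		0x00, 0x01, 0xf2, 0x03, 0xf4, 0xf5, 0xf6, 0xf7
	};

	printf("checksum = 0x%04x\n", rfc1071_csum(buf, sizeof(buf)));
	return 0;
}

One's-complement addition is associative and commutative, so adding eight bytes at a
time and folding the carries once at the end (as the patch does) is equivalent to the
16-bit word loop above, up to the byte order in which the words are assembled.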