From 5ef814753eb810d900fbd77af7c87f6d04f0e551 Mon Sep 17 00:00:00 2001 From: "Angelo P. Castellani" Date: Thu, 22 Feb 2007 00:23:05 -0800 Subject: [TCP] YeAH-TCP: algorithm implementation YeAH-TCP is a sender-side high-speed enabled TCP congestion control algorithm, which uses a mixed loss/delay approach to compute the congestion window. Its design goals target high efficiency, internal, RTT and Reno fairness, resilience to link loss while keeping network elements load as low as possible. For further details look here: http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf Signed-off-by: Angelo P. Castellani Signed-off-by: David S. Miller --- net/ipv4/tcp_yeah.c | 288 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 288 insertions(+) create mode 100644 net/ipv4/tcp_yeah.c (limited to 'net/ipv4/tcp_yeah.c') diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c new file mode 100644 index 000000000000..815e020e98fe --- /dev/null +++ b/net/ipv4/tcp_yeah.c @@ -0,0 +1,288 @@ +/* + * + * YeAH TCP + * + * For further details look at: + * http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf + * + */ + +#include "tcp_yeah.h" + +/* Default values of the Vegas variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. 
+ */ +#define V_PARAM_SHIFT 1 + +#define TCP_YEAH_ALPHA 80 //lin number of packets queued at the bottleneck +#define TCP_YEAH_GAMMA 1 //lin fraction of queue to be removed per rtt +#define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss +#define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion +#define TCP_YEAH_PHY 8 //lin maximum delta from base +#define TCP_YEAH_RHO 16 //lin minumum number of consecutive rtt to consider competition on loss +#define TCP_YEAH_ZETA 50 //lin minimum number of state switchs to reset reno_count + +#define TCP_SCALABLE_AI_CNT 100U + +/* YeAH variables */ +struct yeah { + /* Vegas */ + u32 beg_snd_nxt; /* right edge during last RTT */ + u32 beg_snd_una; /* left edge during last RTT */ + u32 beg_snd_cwnd; /* saves the size of the cwnd */ + u8 doing_vegas_now;/* if true, do vegas for this RTT */ + u16 cntRTT; /* # of RTTs measured within last RTT */ + u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ + u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ + + /* YeAH */ + u32 lastQ; + u32 doing_reno_now; + + u32 reno_count; + u32 fast_count; + + u32 pkts_acked; +}; + +static void tcp_yeah_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct yeah *yeah = inet_csk_ca(sk); + + tcp_vegas_init(sk); + + yeah->doing_reno_now = 0; + yeah->lastQ = 0; + + yeah->reno_count = 2; + + /* Ensure the MD arithmetic works. This is somewhat pedantic, + * since I don't think we will see a cwnd this large. :) */ + tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); + +} + + +static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct yeah *yeah = inet_csk_ca(sk); + + if (icsk->icsk_ca_state == TCP_CA_Open) + yeah->pkts_acked = pkts_acked; +} + +/* 64bit divisor, dividend and result. 
dynamic precision */ +static inline u64 div64_64(u64 dividend, u64 divisor) +{ + u32 d = divisor; + + if (divisor > 0xffffffffULL) { + unsigned int shift = fls(divisor >> 32); + + d = divisor >> shift; + dividend >>= shift; + } + + /* avoid 64 bit division if possible */ + if (dividend >> 32) + do_div(dividend, d); + else + dividend = (u32) dividend / d; + + return dividend; +} + +static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct yeah *yeah = inet_csk_ca(sk); + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + tcp_slow_start(tp); + } else if (!yeah->doing_reno_now) { + /* Scalable */ + + tp->snd_cwnd_cnt+=yeah->pkts_acked; + if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } + + yeah->pkts_acked = 1; + + } else { + /* Reno */ + + if (tp->snd_cwnd_cnt < tp->snd_cwnd) + tp->snd_cwnd_cnt++; + + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } + } + + /* The key players are v_beg_snd_una and v_beg_snd_nxt. + * + * These are so named because they represent the approximate values + * of snd_una and snd_nxt at the beginning of the current RTT. More + * precisely, they represent the amount of data sent during the RTT. + * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, + * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding + * bytes of data have been ACKed during the course of the RTT, giving + * an "actual" rate of: + * + * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) + * + * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, + * because delayed ACKs can cover more than one segment, so they + * don't line up yeahly with the boundaries of RTTs. 
+ * + * Another unfortunate fact of life is that delayed ACKs delay the + * advance of the left edge of our send window, so that the number + * of bytes we send in an RTT is often less than our cwnd will allow. + * So we keep track of our cwnd separately, in v_beg_snd_cwnd. + */ + + if (after(ack, yeah->beg_snd_nxt)) { + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. + * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (yeah->cntRTT > 2) { + u32 rtt; + u32 queue, maxqueue; + + /* We have enough RTT samples, so, using the Vegas + * algorithm, we determine if we should increase or + * decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas + * calculations. This is the min RTT seen during the + * last RTT. Taking the min filters out the effects + * of delayed ACKs, at the cost of noticing congestion + * a bit later. 
+ */ + rtt = yeah->minRTT; + + queue = (u32)div64_64((u64)tp->snd_cwnd * (rtt - yeah->baseRTT), rtt); + + maxqueue = TCP_YEAH_ALPHA; + + if (queue > maxqueue || + rtt - yeah->baseRTT > (yeah->baseRTT / TCP_YEAH_PHY)) { + + if (queue > maxqueue && tp->snd_cwnd > yeah->reno_count) { + u32 reduction = min( queue / TCP_YEAH_GAMMA , + tp->snd_cwnd >> TCP_YEAH_EPSILON ); + + tp->snd_cwnd -= reduction; + + tp->snd_cwnd = max( tp->snd_cwnd, yeah->reno_count); + + tp->snd_ssthresh = tp->snd_cwnd; + } + + if (yeah->reno_count <= 2) + yeah->reno_count = max( tp->snd_cwnd>>1, 2U); + else + yeah->reno_count++; + + yeah->doing_reno_now = + min_t( u32, yeah->doing_reno_now + 1 , 0xffffff); + + } else { + yeah->fast_count++; + + if (yeah->fast_count > TCP_YEAH_ZETA) { + yeah->reno_count = 2; + yeah->fast_count = 0; + } + + yeah->doing_reno_now = 0; + } + + yeah->lastQ = queue; + + } + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + yeah->beg_snd_una = yeah->beg_snd_nxt; + yeah->beg_snd_nxt = tp->snd_nxt; + yeah->beg_snd_cwnd = tp->snd_cwnd; + + /* Wipe the slate clean for the next RTT. 
*/ + yeah->cntRTT = 0; + yeah->minRTT = 0x7fffffff; + } +} + +static u32 tcp_yeah_ssthresh(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); + struct yeah *yeah = inet_csk_ca(sk); + u32 reduction; + + if (yeah->doing_reno_now < TCP_YEAH_RHO) { + reduction = yeah->lastQ; + + reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) ); + + reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); + } else + reduction = max(tp->snd_cwnd>>1,2U); + + yeah->fast_count = 0; + yeah->reno_count = max(yeah->reno_count>>1, 2U); + + return tp->snd_cwnd - reduction; +} + +static struct tcp_congestion_ops tcp_yeah = { + .init = tcp_yeah_init, + .ssthresh = tcp_yeah_ssthresh, + .cong_avoid = tcp_yeah_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + .rtt_sample = tcp_vegas_rtt_calc, + .set_state = tcp_vegas_state, + .cwnd_event = tcp_vegas_cwnd_event, + .get_info = tcp_vegas_get_info, + .pkts_acked = tcp_yeah_pkts_acked, + + .owner = THIS_MODULE, + .name = "yeah", +}; + +static int __init tcp_yeah_register(void) +{ + BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_yeah); + return 0; +} + +static void __exit tcp_yeah_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_yeah); +} + +module_init(tcp_yeah_register); +module_exit(tcp_yeah_unregister); + +MODULE_AUTHOR("Angelo P. Castellani"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("YeAH TCP"); -- cgit From 3927f2e8f9afa3424bb51ca81f7abac01ffd0005 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 25 Mar 2007 19:54:23 -0700 Subject: [NET]: div64_64 consolidate (rev3) Here is the current version of the 64 bit divide common code. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- include/asm-arm/div64.h | 3 +++ include/asm-generic/div64.h | 7 +++++++ include/asm-i386/div64.h | 4 ++++ include/asm-m68k/div64.h | 3 +++ include/asm-mips/div64.h | 11 ++++++++++- include/asm-um/div64.h | 1 + include/asm-xtensa/div64.h | 6 ++++++ lib/Makefile | 5 +++-- lib/div64.c | 22 ++++++++++++++++++++++ net/ipv4/tcp_cubic.c | 23 ----------------------- net/ipv4/tcp_yeah.c | 21 --------------------- net/ipv4/tcp_yeah.h | 1 + net/netfilter/xt_connbytes.c | 16 ---------------- 13 files changed, 60 insertions(+), 63 deletions(-) (limited to 'net/ipv4/tcp_yeah.c') diff --git a/include/asm-arm/div64.h b/include/asm-arm/div64.h index 37e0a96e8789..0b5f881c3d85 100644 --- a/include/asm-arm/div64.h +++ b/include/asm-arm/div64.h @@ -2,6 +2,7 @@ #define __ASM_ARM_DIV64 #include +#include /* * The semantics of do_div() are: @@ -223,4 +224,6 @@ #endif +extern uint64_t div64_64(uint64_t dividend, uint64_t divisor); + #endif diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h index 8f4e3193342e..a4a49370793c 100644 --- a/include/asm-generic/div64.h +++ b/include/asm-generic/div64.h @@ -30,6 +30,11 @@ __rem; \ }) +static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor) +{ + return dividend / divisor; +} + #elif BITS_PER_LONG == 32 extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor); @@ -49,6 +54,8 @@ extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor); __rem; \ }) +extern uint64_t div64_64(uint64_t dividend, uint64_t divisor); + #else /* BITS_PER_LONG == ?? */ # error do_div() does not yet support the C64 diff --git a/include/asm-i386/div64.h b/include/asm-i386/div64.h index 75c67c785bb8..438e980068bd 100644 --- a/include/asm-i386/div64.h +++ b/include/asm-i386/div64.h @@ -1,6 +1,8 @@ #ifndef __I386_DIV64 #define __I386_DIV64 +#include + /* * do_div() is NOT a C function. 
It wants to return * two values (the quotient and the remainder), but @@ -45,4 +47,6 @@ div_ll_X_l_rem(long long divs, long div, long *rem) return dum2; } + +extern uint64_t div64_64(uint64_t dividend, uint64_t divisor); #endif diff --git a/include/asm-m68k/div64.h b/include/asm-m68k/div64.h index 9f65de1a2480..33caad1628d4 100644 --- a/include/asm-m68k/div64.h +++ b/include/asm-m68k/div64.h @@ -1,6 +1,8 @@ #ifndef _M68K_DIV64_H #define _M68K_DIV64_H +#include + /* n = n / base; return rem; */ #define do_div(n, base) ({ \ @@ -23,4 +25,5 @@ __rem; \ }) +extern uint64_t div64_64(uint64_t dividend, uint64_t divisor); #endif /* _M68K_DIV64_H */ diff --git a/include/asm-mips/div64.h b/include/asm-mips/div64.h index d107832de1b6..66189f5f6399 100644 --- a/include/asm-mips/div64.h +++ b/include/asm-mips/div64.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2000, 2004 Maciej W. Rozycki - * Copyright (C) 2003 Ralf Baechle + * Copyright (C) 2003, 07 Ralf Baechle (ralf@linux-mips.org) * * This file is subject to the terms and conditions of the GNU General Public * License. 
See the file "COPYING" in the main directory of this archive @@ -9,6 +9,8 @@ #ifndef _ASM_DIV64_H #define _ASM_DIV64_H +#include + #if (_MIPS_SZLONG == 32) #include @@ -78,6 +80,8 @@ __quot = __quot << 32 | __low; \ (n) = __quot; \ __mod; }) + +extern uint64_t div64_64(uint64_t dividend, uint64_t divisor); #endif /* (_MIPS_SZLONG == 32) */ #if (_MIPS_SZLONG == 64) @@ -101,6 +105,11 @@ (n) = __quot; \ __mod; }) +static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor) +{ + return dividend / divisor; +} + #endif /* (_MIPS_SZLONG == 64) */ #endif /* _ASM_DIV64_H */ diff --git a/include/asm-um/div64.h b/include/asm-um/div64.h index 1e17f7409cab..7b73b2cd5b34 100644 --- a/include/asm-um/div64.h +++ b/include/asm-um/div64.h @@ -3,4 +3,5 @@ #include "asm/arch/div64.h" +extern uint64_t div64_64(uint64_t dividend, uint64_t divisor); #endif diff --git a/include/asm-xtensa/div64.h b/include/asm-xtensa/div64.h index c4a105776383..20965e3af1dd 100644 --- a/include/asm-xtensa/div64.h +++ b/include/asm-xtensa/div64.h @@ -11,9 +11,15 @@ #ifndef _XTENSA_DIV64_H #define _XTENSA_DIV64_H +#include + #define do_div(n,base) ({ \ int __res = n % ((unsigned int) base); \ n /= (unsigned int) base; \ __res; }) +static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor) +{ + return dividend / divisor; +} #endif diff --git a/lib/Makefile b/lib/Makefile index 992a39ef9ffd..ae57f357fec0 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -4,7 +4,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o \ - idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \ + idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \ sha1.o irq_regs.o reciprocal_div.o lib-$(CONFIG_MMU) += ioremap.o @@ -12,7 +12,8 @@ lib-$(CONFIG_SMP) += cpumask.o lib-y += kobject.o kref.o kobject_uevent.o klist.o -obj-y += sort.o parser.o halfmd4.o debug_locks.o random32.o bust_spinlocks.o +obj-y += div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ + bust_spinlocks.o ifeq 
($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG diff --git a/lib/div64.c b/lib/div64.c index 365719f84832..c3d7655cdfb5 100644 --- a/lib/div64.c +++ b/lib/div64.c @@ -58,4 +58,26 @@ uint32_t __div64_32(uint64_t *n, uint32_t base) EXPORT_SYMBOL(__div64_32); +/* 64bit divisor, dividend and result. dynamic precision */ +uint64_t div64_64(uint64_t dividend, uint64_t divisor) +{ + uint32_t d = divisor; + + if (divisor > 0xffffffffULL) { + unsigned int shift = fls(divisor >> 32); + + d = divisor >> shift; + dividend >>= shift; + } + + /* avoid 64 bit division if possible */ + if (dividend >> 32) + do_div(dividend, d); + else + dividend = (uint32_t) dividend / d; + + return dividend; +} +EXPORT_SYMBOL(div64_64); + #endif /* BITS_PER_LONG == 32 */ diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 9a582fb4ef9f..6f08adbda54e 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -51,8 +51,6 @@ MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_ module_param(tcp_friendliness, int, 0644); MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness"); -#include - /* BIC TCP Parameters */ struct bictcp { u32 cnt; /* increase cwnd by 1 after ACKs */ @@ -93,27 +91,6 @@ static void bictcp_init(struct sock *sk) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } -/* 64bit divisor, dividend and result. 
dynamic precision */ -static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor) -{ - u_int32_t d = divisor; - - if (divisor > 0xffffffffULL) { - unsigned int shift = fls(divisor >> 32); - - d = divisor >> shift; - dividend >>= shift; - } - - /* avoid 64 bit division if possible */ - if (dividend >> 32) - do_div(dividend, d); - else - dividend = (uint32_t) dividend / d; - - return dividend; -} - /* * calculate the cubic root of x using Newton-Raphson */ diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 815e020e98fe..18355a2608e1 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -73,27 +73,6 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked) yeah->pkts_acked = pkts_acked; } -/* 64bit divisor, dividend and result. dynamic precision */ -static inline u64 div64_64(u64 dividend, u64 divisor) -{ - u32 d = divisor; - - if (divisor > 0xffffffffULL) { - unsigned int shift = fls(divisor >> 32); - - d = divisor >> shift; - dividend >>= shift; - } - - /* avoid 64 bit division if possible */ - if (dividend >> 32) - do_div(dividend, d); - else - dividend = (u32) dividend / d; - - return dividend; -} - static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 seq_rtt, u32 in_flight, int flag) { diff --git a/net/ipv4/tcp_yeah.h b/net/ipv4/tcp_yeah.h index b3255dba4e2d..a62d82038fd0 100644 --- a/net/ipv4/tcp_yeah.h +++ b/net/ipv4/tcp_yeah.h @@ -2,6 +2,7 @@ #include #include #include +#include #include diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index 5e32dfa2668b..302043bc41b2 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -24,22 +24,6 @@ MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection"); MODULE_ALIAS("ipt_connbytes"); -/* 64bit divisor, dividend and result. 
dynamic precision */ -static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor) -{ - u_int32_t d = divisor; - - if (divisor > 0xffffffffULL) { - unsigned int shift = fls(divisor >> 32); - - d = divisor >> shift; - dividend >>= shift; - } - - do_div(dividend, d); - return dividend; -} - static int match(const struct sk_buff *skb, const struct net_device *in, -- cgit From 43e683926f808cec9802466c27cee7499eda3d11 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 6 Mar 2007 20:21:20 -0800 Subject: [TCP] TCP Yeah: cleanup Eliminate need for full 64/64 divide to compute queue. Variable maxqueue was really a constant. Fix indentation. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_yeah.c | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) (limited to 'net/ipv4/tcp_yeah.c') diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 18355a2608e1..46dd1bee583a 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -74,7 +74,7 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked) } static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, - u32 seq_rtt, u32 in_flight, int flag) + u32 seq_rtt, u32 in_flight, int flag) { struct tcp_sock *tp = tcp_sk(sk); struct yeah *yeah = inet_csk_ca(sk); @@ -142,8 +142,8 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, */ if (yeah->cntRTT > 2) { - u32 rtt; - u32 queue, maxqueue; + u32 rtt, queue; + u64 bw; /* We have enough RTT samples, so, using the Vegas * algorithm, we determine if we should increase or @@ -158,32 +158,36 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, */ rtt = yeah->minRTT; - queue = (u32)div64_64((u64)tp->snd_cwnd * (rtt - yeah->baseRTT), rtt); - - maxqueue = TCP_YEAH_ALPHA; - - if (queue > maxqueue || - rtt - yeah->baseRTT > (yeah->baseRTT / TCP_YEAH_PHY)) { - - if (queue > maxqueue && tp->snd_cwnd > yeah->reno_count) { - u32 reduction = min( queue / TCP_YEAH_GAMMA , - 
tp->snd_cwnd >> TCP_YEAH_EPSILON ); + /* Compute excess number of packets above bandwidth + * Avoid doing full 64 bit divide. + */ + bw = tp->snd_cwnd; + bw *= rtt - yeah->baseRTT; + do_div(bw, rtt); + queue = bw; + + if (queue > TCP_YEAH_ALPHA || + rtt - yeah->baseRTT > (yeah->baseRTT / TCP_YEAH_PHY)) { + if (queue > TCP_YEAH_ALPHA + && tp->snd_cwnd > yeah->reno_count) { + u32 reduction = min(queue / TCP_YEAH_GAMMA , + tp->snd_cwnd >> TCP_YEAH_EPSILON); tp->snd_cwnd -= reduction; - tp->snd_cwnd = max( tp->snd_cwnd, yeah->reno_count); + tp->snd_cwnd = max(tp->snd_cwnd, + yeah->reno_count); tp->snd_ssthresh = tp->snd_cwnd; - } + } if (yeah->reno_count <= 2) - yeah->reno_count = max( tp->snd_cwnd>>1, 2U); + yeah->reno_count = max(tp->snd_cwnd>>1, 2U); else yeah->reno_count++; - yeah->doing_reno_now = - min_t( u32, yeah->doing_reno_now + 1 , 0xffffff); - + yeah->doing_reno_now = min(yeah->doing_reno_now + 1, + 0xffffffU); } else { yeah->fast_count++; -- cgit From 164891aadf1721fca4dce473bb0e0998181537c6 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 23 Apr 2007 22:26:16 -0700 Subject: [TCP]: Congestion control API update. Do some simple changes to make congestion control API faster/cleaner. * use ktime_t rather than timeval * merge rtt sampling into existing ack callback this means one indirect call versus two per ack. * use flags bits to store options/settings Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 5 +++++ include/net/tcp.h | 9 +++++---- net/ipv4/tcp_bic.c | 2 +- net/ipv4/tcp_cong.c | 14 +++++++------- net/ipv4/tcp_cubic.c | 2 +- net/ipv4/tcp_htcp.c | 2 +- net/ipv4/tcp_illinois.c | 16 +++++++--------- net/ipv4/tcp_input.c | 25 ++++++++----------------- net/ipv4/tcp_lp.c | 8 +++++--- net/ipv4/tcp_output.c | 2 +- net/ipv4/tcp_vegas.c | 10 +++++++--- net/ipv4/tcp_veno.c | 10 +++++++--- net/ipv4/tcp_westwood.c | 2 +- net/ipv4/tcp_yeah.c | 6 ++++-- net/ipv4/tcp_yeah.h | 7 +++++-- 15 files changed, 65 insertions(+), 55 deletions(-) (limited to 'net/ipv4/tcp_yeah.c') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 50f6f6a094cf..2694cb3ca763 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1569,6 +1569,11 @@ static inline void __net_timestamp(struct sk_buff *skb) skb->tstamp = ktime_get_real(); } +static inline ktime_t net_timedelta(ktime_t t) +{ + return ktime_sub(ktime_get_real(), t); +} + extern __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len); extern __sum16 __skb_checksum_complete(struct sk_buff *skb); diff --git a/include/net/tcp.h b/include/net/tcp.h index 43910fe3c448..a385797f160a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -629,9 +629,12 @@ enum tcp_ca_event { #define TCP_CA_MAX 128 #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) +#define TCP_CONG_NON_RESTRICTED 0x1 +#define TCP_CONG_RTT_STAMP 0x2 + struct tcp_congestion_ops { struct list_head list; - int non_restricted; + unsigned long flags; /* initialize private data (optional) */ void (*init)(struct sock *sk); @@ -645,8 +648,6 @@ struct tcp_congestion_ops { /* do new cwnd calculation (required) */ void (*cong_avoid)(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int good_ack); - /* round trip time sample per acked packet (optional) */ - void (*rtt_sample)(struct sock *sk, u32 usrtt); /* call before changing ca_state (optional) */ void (*set_state)(struct sock *sk, u8 new_state); /* call when 
cwnd event occurs (optional) */ @@ -654,7 +655,7 @@ struct tcp_congestion_ops { /* new value of cwnd after loss (optional) */ u32 (*undo_cwnd)(struct sock *sk); /* hook for packet ack accounting (optional) */ - void (*pkts_acked)(struct sock *sk, u32 num_acked); + void (*pkts_acked)(struct sock *sk, u32 num_acked, ktime_t last); /* get info for inet_diag (optional) */ void (*get_info)(struct sock *sk, u32 ext, struct sk_buff *skb); diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 5730333cd0ac..281c9f913257 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -206,7 +206,7 @@ static void bictcp_state(struct sock *sk, u8 new_state) /* Track delayed acknowledgment ratio using sliding window * ratio = (15*ratio + sample) / 16 */ -static void bictcp_acked(struct sock *sk, u32 cnt) +static void bictcp_acked(struct sock *sk, u32 cnt, ktime_t last) { const struct inet_connection_sock *icsk = inet_csk(sk); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index ccd88407e0cd..86b26539e54b 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -126,7 +126,7 @@ int tcp_set_default_congestion_control(const char *name) #endif if (ca) { - ca->non_restricted = 1; /* default is always allowed */ + ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */ list_move(&ca->list, &tcp_cong_list); ret = 0; } @@ -181,7 +181,7 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) *buf = '\0'; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { - if (!ca->non_restricted) + if (!(ca->flags & TCP_CONG_NON_RESTRICTED)) continue; offs += snprintf(buf + offs, maxlen - offs, "%s%s", @@ -212,16 +212,16 @@ int tcp_set_allowed_congestion_control(char *val) } } - /* pass 2 clear */ + /* pass 2 clear old values */ list_for_each_entry_rcu(ca, &tcp_cong_list, list) - ca->non_restricted = 0; + ca->flags &= ~TCP_CONG_NON_RESTRICTED; /* pass 3 mark as allowed */ while ((name = strsep(&val, " ")) && *name) { ca = tcp_ca_find(name); 
WARN_ON(!ca); if (ca) - ca->non_restricted = 1; + ca->flags |= TCP_CONG_NON_RESTRICTED; } out: spin_unlock(&tcp_cong_list_lock); @@ -256,7 +256,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) if (!ca) err = -ENOENT; - else if (!(ca->non_restricted || capable(CAP_NET_ADMIN))) + else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || capable(CAP_NET_ADMIN))) err = -EPERM; else if (!try_module_get(ca->owner)) @@ -371,8 +371,8 @@ u32 tcp_reno_min_cwnd(const struct sock *sk) EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); struct tcp_congestion_ops tcp_reno = { + .flags = TCP_CONG_NON_RESTRICTED, .name = "reno", - .non_restricted = 1, .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 296845be912b..14224487b16b 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -334,7 +334,7 @@ static void bictcp_state(struct sock *sk, u8 new_state) /* Track delayed acknowledgment ratio using sliding window * ratio = (15*ratio + sample) / 16 */ -static void bictcp_acked(struct sock *sk, u32 cnt) +static void bictcp_acked(struct sock *sk, u32 cnt, ktime_t last) { const struct inet_connection_sock *icsk = inet_csk(sk); diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 1020eb48d8d1..4ba4a7ae0a85 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -98,7 +98,7 @@ static inline void measure_rtt(struct sock *sk) } } -static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked) +static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, ktime_t last) { const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index ae6298600886..8e3165917f72 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -83,9 +83,14 @@ static void tcp_illinois_init(struct sock *sk) } /* Measure RTT for each ack. 
*/ -static void tcp_illinois_rtt_sample(struct sock *sk, u32 rtt) +static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, ktime_t last) { struct illinois *ca = inet_csk_ca(sk); + u32 rtt; + + ca->acked = pkts_acked; + + rtt = ktime_to_ns(net_timedelta(last)) / NSEC_PER_USEC; /* ignore bogus values, this prevents wraparound in alpha math */ if (rtt > RTT_MAX) @@ -103,13 +108,6 @@ static void tcp_illinois_rtt_sample(struct sock *sk, u32 rtt) ca->sum_rtt += rtt; } -/* Capture count of packets covered by ack, to adjust for delayed acks */ -static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked) -{ - struct illinois *ca = inet_csk_ca(sk); - ca->acked = pkts_acked; -} - /* Maximum queuing delay */ static inline u32 max_delay(const struct illinois *ca) { @@ -325,12 +323,12 @@ static void tcp_illinois_info(struct sock *sk, u32 ext, } static struct tcp_congestion_ops tcp_illinois = { + .flags = TCP_CONG_RTT_STAMP, .init = tcp_illinois_init, .ssthresh = tcp_illinois_ssthresh, .min_cwnd = tcp_reno_min_cwnd, .cong_avoid = tcp_illinois_cong_avoid, .set_state = tcp_illinois_state, - .rtt_sample = tcp_illinois_rtt_sample, .get_info = tcp_illinois_info, .pkts_acked = tcp_illinois_acked, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 633389390788..051f0f815f17 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2402,14 +2402,6 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, return acked; } -static u32 tcp_usrtt(struct timeval *tv) -{ - struct timeval now; - - do_gettimeofday(&now); - return (now.tv_sec - tv->tv_sec) * 1000000 + (now.tv_usec - tv->tv_usec); -} - /* Remove acknowledged frames from the retransmission queue. 
*/ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) { @@ -2420,9 +2412,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) int acked = 0; __s32 seq_rtt = -1; u32 pkts_acked = 0; - void (*rtt_sample)(struct sock *sk, u32 usrtt) - = icsk->icsk_ca_ops->rtt_sample; - struct timeval tv = { .tv_sec = 0, .tv_usec = 0 }; + ktime_t last_ackt = ktime_set(0,0); while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { @@ -2471,7 +2461,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) seq_rtt = -1; } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - skb_get_timestamp(skb, &tv); + last_ackt = skb->tstamp; } if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); @@ -2484,7 +2474,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - skb_get_timestamp(skb, &tv); + last_ackt = skb->tstamp; } tcp_dec_pcount_approx(&tp->fackets_out, skb); tcp_packets_out_dec(tp, skb); @@ -2494,13 +2484,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } if (acked&FLAG_ACKED) { + const struct tcp_congestion_ops *ca_ops + = inet_csk(sk)->icsk_ca_ops; + tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_ack_packets_out(sk); - if (rtt_sample && !(acked & FLAG_RETRANS_DATA_ACKED)) - (*rtt_sample)(sk, tcp_usrtt(&tv)); - if (icsk->icsk_ca_ops->pkts_acked) - icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked); + if (ca_ops->pkts_acked) + ca_ops->pkts_acked(sk, pkts_acked, last_ackt); } #if FASTRETRANS_DEBUG > 0 diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index f0ebaf0e21cb..b4e062ab24a1 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -218,7 +218,7 @@ static u32 tcp_lp_owd_calculator(struct sock *sk) * 3. calc smoothed OWD (SOWD). * Most ideas come from the original TCP-LP implementation. 
*/ -static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt) +static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt) { struct lp *lp = inet_csk_ca(sk); s64 mowd = tcp_lp_owd_calculator(sk); @@ -261,11 +261,13 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt) * newReno in increase case. * We work it out by following the idea from TCP-LP's paper directly */ -static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked) +static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, ktime_t last) { struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); + tcp_lp_rtt_sample(sk, ktime_to_ns(net_timedelta(last)) / NSEC_PER_USEC); + /* calc inference */ if (tcp_time_stamp > tp->rx_opt.rcv_tsecr) lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr); @@ -312,11 +314,11 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked) } static struct tcp_congestion_ops tcp_lp = { + .flags = TCP_CONG_RTT_STAMP, .init = tcp_lp_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_lp_cong_avoid, .min_cwnd = tcp_reno_min_cwnd, - .rtt_sample = tcp_lp_rtt_sample, .pkts_acked = tcp_lp_pkts_acked, .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3a60aea744ae..e70a6840cb64 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -409,7 +409,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, /* If congestion control is doing timestamping, we must * take such a timestamp before we potentially clone/copy. 
*/ - if (icsk->icsk_ca_ops->rtt_sample) + if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) __net_timestamp(skb); if (likely(clone_it)) { diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 87e72bef6d08..f4104eeb5f26 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -120,10 +120,13 @@ static void tcp_vegas_init(struct sock *sk) * o min-filter RTT samples from a much longer window (forever for now) * to find the propagation delay (baseRTT) */ -static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt) +static void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) { struct vegas *vegas = inet_csk_ca(sk); - u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ + u32 vrtt; + + /* Never allow zero rtt or baseRTT */ + vrtt = (ktime_to_ns(net_timedelta(last)) / NSEC_PER_USEC) + 1; /* Filter to find propagation delay: */ if (vrtt < vegas->baseRTT) @@ -353,11 +356,12 @@ static void tcp_vegas_get_info(struct sock *sk, u32 ext, } static struct tcp_congestion_ops tcp_vegas = { + .flags = TCP_CONG_RTT_STAMP, .init = tcp_vegas_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_vegas_cong_avoid, .min_cwnd = tcp_reno_min_cwnd, - .rtt_sample = tcp_vegas_rtt_calc, + .pkts_acked = tcp_vegas_pkts_acked, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, .get_info = tcp_vegas_get_info, diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index ce57bf302f6c..0b50d0607a0e 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -69,10 +69,13 @@ static void tcp_veno_init(struct sock *sk) } /* Do rtt sampling needed for Veno. 
*/ -static void tcp_veno_rtt_calc(struct sock *sk, u32 usrtt) +static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) { struct veno *veno = inet_csk_ca(sk); - u32 vrtt = usrtt + 1; /* Never allow zero rtt or basertt */ + u32 vrtt; + + /* Never allow zero rtt or baseRTT */ + vrtt = (ktime_to_ns(net_timedelta(last)) / NSEC_PER_USEC) + 1; /* Filter to find propagation delay: */ if (vrtt < veno->basertt) @@ -199,10 +202,11 @@ static u32 tcp_veno_ssthresh(struct sock *sk) } static struct tcp_congestion_ops tcp_veno = { + .flags = TCP_CONG_RTT_STAMP, .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, .cong_avoid = tcp_veno_cong_avoid, - .rtt_sample = tcp_veno_rtt_calc, + .pkts_acked = tcp_veno_pkts_acked, .set_state = tcp_veno_state, .cwnd_event = tcp_veno_cwnd_event, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index ae1026a67720..e61e09dd513e 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -100,7 +100,7 @@ static void westwood_filter(struct westwood *w, u32 delta) * Called after processing group of packets. * but all westwood needs is the last sample of srtt. 
*/ -static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt) +static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) { struct westwood *w = inet_csk_ca(sk); if (cnt > 0) diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 46dd1bee583a..81ef02c1649a 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -64,13 +64,15 @@ static void tcp_yeah_init(struct sock *sk) } -static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked) +static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, ktime_t last) { const struct inet_connection_sock *icsk = inet_csk(sk); struct yeah *yeah = inet_csk_ca(sk); if (icsk->icsk_ca_state == TCP_CA_Open) yeah->pkts_acked = pkts_acked; + + tcp_vegas_pkts_acked(sk, pkts_acked, last); } static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, @@ -237,11 +239,11 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) { } static struct tcp_congestion_ops tcp_yeah = { + .flags = TCP_CONG_RTT_STAMP, .init = tcp_yeah_init, .ssthresh = tcp_yeah_ssthresh, .cong_avoid = tcp_yeah_cong_avoid, .min_cwnd = tcp_reno_min_cwnd, - .rtt_sample = tcp_vegas_rtt_calc, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, .get_info = tcp_vegas_get_info, diff --git a/net/ipv4/tcp_yeah.h b/net/ipv4/tcp_yeah.h index a62d82038fd0..33ad5385c188 100644 --- a/net/ipv4/tcp_yeah.h +++ b/net/ipv4/tcp_yeah.h @@ -81,10 +81,13 @@ static void tcp_vegas_state(struct sock *sk, u8 ca_state) * o min-filter RTT samples from a much longer window (forever for now) * to find the propagation delay (baseRTT) */ -static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt) +static void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) { struct vegas *vegas = inet_csk_ca(sk); - u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ + u32 vrtt; + + /* Never allow zero rtt or baseRTT */ + vrtt = (ktime_to_ns(net_timedelta(last)) / NSEC_PER_USEC) + 1; /* Filter to find propagation delay: */ if (vrtt < vegas->baseRTT) -- 
cgit From 7752237e9f07b316f81aebdc43f0d7c9a4ba0acf Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 23 Apr 2007 22:28:23 -0700 Subject: [TCP] TCP YEAH: Use vegas, don't copy it. Rather than using a copy of vegas code, the YEAH code should just have it exported so there is common code. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_vegas.c | 31 +++++------- net/ipv4/tcp_vegas.h | 24 ++++++++++ net/ipv4/tcp_yeah.c | 53 ++++++++++--------- net/ipv4/tcp_yeah.h | 131 --------------------------------------------------- 4 files changed, 61 insertions(+), 178 deletions(-) create mode 100644 net/ipv4/tcp_vegas.h (limited to 'net/ipv4/tcp_yeah.c') diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index f4104eeb5f26..0f0ee7f732c3 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -38,6 +38,8 @@ #include +#include "tcp_vegas.h" + /* Default values of the Vegas variables, in fixed-point representation * with V_PARAM_SHIFT bits to the right of the binary point. */ @@ -54,17 +56,6 @@ module_param(gamma, int, 0644); MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); -/* Vegas variables */ -struct vegas { - u32 beg_snd_nxt; /* right edge during last RTT */ - u32 beg_snd_una; /* left edge during last RTT */ - u32 beg_snd_cwnd; /* saves the size of the cwnd */ - u8 doing_vegas_now;/* if true, do vegas for this RTT */ - u16 cntRTT; /* # of RTTs measured within last RTT */ - u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ - u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ -}; - /* There are several situations when we must "re-start" Vegas: * * o when a connection is established @@ -81,7 +72,7 @@ struct vegas { * Instead we must wait until the completion of an RTT during * which we actually receive ACKs.
*/ -static inline void vegas_enable(struct sock *sk) +static void vegas_enable(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct vegas *vegas = inet_csk_ca(sk); @@ -104,13 +95,14 @@ static inline void vegas_disable(struct sock *sk) vegas->doing_vegas_now = 0; } -static void tcp_vegas_init(struct sock *sk) +void tcp_vegas_init(struct sock *sk) { struct vegas *vegas = inet_csk_ca(sk); vegas->baseRTT = 0x7fffffff; vegas_enable(sk); } +EXPORT_SYMBOL_GPL(tcp_vegas_init); /* Do RTT sampling needed for Vegas. * Basically we: @@ -120,7 +112,7 @@ static void tcp_vegas_init(struct sock *sk) * o min-filter RTT samples from a much longer window (forever for now) * to find the propagation delay (baseRTT) */ -static void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) +void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) { struct vegas *vegas = inet_csk_ca(sk); u32 vrtt; @@ -138,8 +130,9 @@ static void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) vegas->minRTT = min(vegas->minRTT, vrtt); vegas->cntRTT++; } +EXPORT_SYMBOL_GPL(tcp_vegas_pkts_acked); -static void tcp_vegas_state(struct sock *sk, u8 ca_state) +void tcp_vegas_state(struct sock *sk, u8 ca_state) { if (ca_state == TCP_CA_Open) @@ -147,6 +140,7 @@ static void tcp_vegas_state(struct sock *sk, u8 ca_state) else vegas_disable(sk); } +EXPORT_SYMBOL_GPL(tcp_vegas_state); /* * If the connection is idle and we are restarting, @@ -157,12 +151,13 @@ static void tcp_vegas_state(struct sock *sk, u8 ca_state) * packets, _then_ we can make Vegas calculations * again. 
*/ -static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) +void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) tcp_vegas_init(sk); } +EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 seq_rtt, u32 in_flight, int flag) @@ -339,8 +334,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, } /* Extract info for Tcp socket info provided via netlink. */ -static void tcp_vegas_get_info(struct sock *sk, u32 ext, - struct sk_buff *skb) +void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) { const struct vegas *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { @@ -354,6 +348,7 @@ static void tcp_vegas_get_info(struct sock *sk, u32 ext, nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); } } +EXPORT_SYMBOL_GPL(tcp_vegas_get_info); static struct tcp_congestion_ops tcp_vegas = { .flags = TCP_CONG_RTT_STAMP, diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h new file mode 100644 index 000000000000..502fa8183634 --- /dev/null +++ b/net/ipv4/tcp_vegas.h @@ -0,0 +1,24 @@ +/* + * TCP Vegas congestion control interface + */ +#ifndef __TCP_VEGAS_H +#define __TCP_VEGAS_H 1 + +/* Vegas variables */ +struct vegas { + u32 beg_snd_nxt; /* right edge during last RTT */ + u32 beg_snd_una; /* left edge during last RTT */ + u32 beg_snd_cwnd; /* saves the size of the cwnd */ + u8 doing_vegas_now;/* if true, do vegas for this RTT */ + u16 cntRTT; /* # of RTTs measured within last RTT */ + u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ + u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ +}; + +extern void tcp_vegas_init(struct sock *sk); +extern void tcp_vegas_state(struct sock *sk, u8 ca_state); +extern void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, ktime_t last); +extern void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); 
+extern void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); + +#endif /* __TCP_VEGAS_H */ diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 81ef02c1649a..545ed237ab53 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -6,13 +6,14 @@ * http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf * */ +#include +#include +#include +#include -#include "tcp_yeah.h" +#include -/* Default values of the Vegas variables, in fixed-point representation - * with V_PARAM_SHIFT bits to the right of the binary point. - */ -#define V_PARAM_SHIFT 1 +#include "tcp_vegas.h" #define TCP_YEAH_ALPHA 80 //lin number of packets queued at the bottleneck #define TCP_YEAH_GAMMA 1 //lin fraction of queue to be removed per rtt @@ -26,14 +27,7 @@ /* YeAH variables */ struct yeah { - /* Vegas */ - u32 beg_snd_nxt; /* right edge during last RTT */ - u32 beg_snd_una; /* left edge during last RTT */ - u32 beg_snd_cwnd; /* saves the size of the cwnd */ - u8 doing_vegas_now;/* if true, do vegas for this RTT */ - u16 cntRTT; /* # of RTTs measured within last RTT */ - u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ - u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ + struct vegas vegas; /* must be first */ /* YeAH */ u32 lastQ; @@ -84,9 +78,10 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { + if (tp->snd_cwnd <= tp->snd_ssthresh) tcp_slow_start(tp); - } else if (!yeah->doing_reno_now) { + + else if (!yeah->doing_reno_now) { /* Scalable */ tp->snd_cwnd_cnt+=yeah->pkts_acked; @@ -110,19 +105,19 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, } } - /* The key players are v_beg_snd_una and v_beg_snd_nxt. + /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. * * These are so named because they represent the approximate values * of snd_una and snd_nxt at the beginning of the current RTT. 
More * precisely, they represent the amount of data sent during the RTT. * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, - * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding + * we will calculate that (v_beg_snd_nxt - v_vegas.beg_snd_una) outstanding * bytes of data have been ACKed during the course of the RTT, giving * an "actual" rate of: * - * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) + * (v_beg_snd_nxt - v_vegas.beg_snd_una) / (rtt duration) * - * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, + * Unfortunately, v_vegas.beg_snd_una is not exactly equal to snd_una, * because delayed ACKs can cover more than one segment, so they * don't line up nicely with the boundaries of RTTs. * @@ -132,7 +127,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, * So we keep track of our cwnd separately, in v_beg_snd_cwnd. */ - if (after(ack, yeah->beg_snd_nxt)) { + if (after(ack, yeah->vegas.beg_snd_nxt)) { /* We do the Vegas calculations only if we got enough RTT * samples that we can be reasonably sure that we got @@ -143,7 +138,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, * If we have 3 samples, we should be OK. */ - if (yeah->cntRTT > 2) { + if (yeah->vegas.cntRTT > 2) { u32 rtt, queue; u64 bw; @@ -158,18 +153,18 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, * of delayed ACKs, at the cost of noticing congestion * a bit later. */ - rtt = yeah->minRTT; + rtt = yeah->vegas.minRTT; /* Compute excess number of packets above bandwidth * Avoid doing full 64 bit divide.
*/ bw = tp->snd_cwnd; - bw *= rtt - yeah->baseRTT; + bw *= rtt - yeah->vegas.baseRTT; do_div(bw, rtt); queue = bw; if (queue > TCP_YEAH_ALPHA || - rtt - yeah->baseRTT > (yeah->baseRTT / TCP_YEAH_PHY)) { + rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) { if (queue > TCP_YEAH_ALPHA && tp->snd_cwnd > yeah->reno_count) { u32 reduction = min(queue / TCP_YEAH_GAMMA , @@ -208,13 +203,13 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, /* Save the extent of the current window so we can use this * at the end of the next RTT. */ - yeah->beg_snd_una = yeah->beg_snd_nxt; - yeah->beg_snd_nxt = tp->snd_nxt; - yeah->beg_snd_cwnd = tp->snd_cwnd; + yeah->vegas.beg_snd_una = yeah->vegas.beg_snd_nxt; + yeah->vegas.beg_snd_nxt = tp->snd_nxt; + yeah->vegas.beg_snd_cwnd = tp->snd_cwnd; /* Wipe the slate clean for the next RTT. */ - yeah->cntRTT = 0; - yeah->minRTT = 0x7fffffff; + yeah->vegas.cntRTT = 0; + yeah->vegas.minRTT = 0x7fffffff; } } diff --git a/net/ipv4/tcp_yeah.h b/net/ipv4/tcp_yeah.h index 33ad5385c188..ed3b7198f23c 100644 --- a/net/ipv4/tcp_yeah.h +++ b/net/ipv4/tcp_yeah.h @@ -5,134 +5,3 @@ #include #include - -/* Vegas variables */ -struct vegas { - u32 beg_snd_nxt; /* right edge during last RTT */ - u32 beg_snd_una; /* left edge during last RTT */ - u32 beg_snd_cwnd; /* saves the size of the cwnd */ - u8 doing_vegas_now;/* if true, do vegas for this RTT */ - u16 cntRTT; /* # of RTTs measured within last RTT */ - u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ - u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ -}; - -/* There are several situations when we must "re-start" Vegas: - * - * o when a connection is established - * o after an RTO - * o after fast recovery - * o when we send a packet and there is no outstanding - * unacknowledged data (restarting an idle connection) - * - * In these circumstances we cannot do a Vegas calculation at the - * end of the first RTT, because any calculation we do is 
using - * stale info -- both the saved cwnd and congestion feedback are - * stale. - * - * Instead we must wait until the completion of an RTT during - * which we actually receive ACKs. - */ -static inline void vegas_enable(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct vegas *vegas = inet_csk_ca(sk); - - /* Begin taking Vegas samples next time we send something. */ - vegas->doing_vegas_now = 1; - - /* Set the beginning of the next send window. */ - vegas->beg_snd_nxt = tp->snd_nxt; - - vegas->cntRTT = 0; - vegas->minRTT = 0x7fffffff; -} - -/* Stop taking Vegas samples for now. */ -static inline void vegas_disable(struct sock *sk) -{ - struct vegas *vegas = inet_csk_ca(sk); - - vegas->doing_vegas_now = 0; -} - -static void tcp_vegas_init(struct sock *sk) -{ - struct vegas *vegas = inet_csk_ca(sk); - - vegas->baseRTT = 0x7fffffff; - vegas_enable(sk); -} - -static void tcp_vegas_state(struct sock *sk, u8 ca_state) -{ - - if (ca_state == TCP_CA_Open) - vegas_enable(sk); - else - vegas_disable(sk); -} - -/* Do RTT sampling needed for Vegas. - * Basically we: - * o min-filter RTT samples from within an RTT to get the current - * propagation delay + queuing delay (we are min-filtering to try to - * avoid the effects of delayed ACKs) - * o min-filter RTT samples from a much longer window (forever for now) - * to find the propagation delay (baseRTT) - */ -static void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) -{ - struct vegas *vegas = inet_csk_ca(sk); - u32 vrtt; - - /* Never allow zero rtt or baseRTT */ - vrtt = (ktime_to_ns(net_timedelta(last)) / NSEC_PER_USEC) + 1; - - /* Filter to find propagation delay: */ - if (vrtt < vegas->baseRTT) - vegas->baseRTT = vrtt; - - /* Find the min RTT during the last RTT to find - * the current prop. 
delay + queuing delay: - */ - vegas->minRTT = min(vegas->minRTT, vrtt); - vegas->cntRTT++; -} - -/* - * If the connection is idle and we are restarting, - * then we don't want to do any Vegas calculations - * until we get fresh RTT samples. So when we - * restart, we reset our Vegas state to a clean - * slate. After we get acks for this flight of - * packets, _then_ we can make Vegas calculations - * again. - */ -static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) -{ - if (event == CA_EVENT_CWND_RESTART || - event == CA_EVENT_TX_START) - tcp_vegas_init(sk); -} - -/* Extract info for Tcp socket info provided via netlink. */ -static void tcp_vegas_get_info(struct sock *sk, u32 ext, - struct sk_buff *skb) -{ - const struct vegas *ca = inet_csk_ca(sk); - if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcpvegas_info *info; - - info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO, - sizeof(*info))); - - info->tcpv_enabled = ca->doing_vegas_now; - info->tcpv_rttcnt = ca->cntRTT; - info->tcpv_rtt = ca->baseRTT; - info->tcpv_minrtt = ca->minRTT; - rtattr_failure: ; - } -} - - -- cgit