Diffstat (limited to 'net/ipv4/tcp_bbr.c')
-rw-r--r--	net/ipv4/tcp_bbr.c	180
1 file changed, 159 insertions, 21 deletions
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 0f497fc49c3f..56be7d27f208 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -115,6 +115,14 @@ struct bbr {
 		unused_b:5;
 	u32	prior_cwnd;	/* prior cwnd upon entering loss recovery */
 	u32	full_bw;	/* recent bw, to estimate if pipe is full */
+
+	/* For tracking ACK aggregation: */
+	u64	ack_epoch_mstamp;	/* start of ACK sampling epoch */
+	u16	extra_acked[2];		/* max excess data ACKed in epoch */
+	u32	ack_epoch_acked:20,	/* packets (S)ACKed in sampling epoch */
+		extra_acked_win_rtts:5,	/* age of extra_acked, in round trips */
+		extra_acked_win_idx:1,	/* current index in extra_acked array */
+		unused_c:6;
 };
 
 #define CYCLE_LEN	8	/* number of phases in a pacing gain cycle */
@@ -182,6 +190,15 @@ static const u32 bbr_lt_bw_diff = 4000 / 8;
 /* If we estimate we're policed, use lt_bw for this many round trips: */
 static const u32 bbr_lt_bw_max_rtts = 48;
 
+/* Gain factor for adding extra_acked to target cwnd: */
+static const int bbr_extra_acked_gain = BBR_UNIT;
+/* Window length of extra_acked window. */
+static const u32 bbr_extra_acked_win_rtts = 5;
+/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */
+static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
+/* Time period for clamping cwnd increment due to ack aggregation */
+static const u32 bbr_extra_acked_max_us = 100 * 1000;
+
 static void bbr_check_probe_rtt_done(struct sock *sk);
 
 /* Do we estimate that STARTUP filled the pipe? */
@@ -208,6 +225,16 @@ static u32 bbr_bw(const struct sock *sk)
 	return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
 }
 
+/* Return maximum extra acked in past k-2k round trips,
+ * where k = bbr_extra_acked_win_rtts.
+ */
+static u16 bbr_extra_acked(const struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	return max(bbr->extra_acked[0], bbr->extra_acked[1]);
+}
+
 /* Return rate in bytes per second, optionally with a gain.
  * The order here is chosen carefully to avoid overflow of u64. This should
  * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
@@ -305,6 +332,8 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
 
 	if (event == CA_EVENT_TX_START && tp->app_limited) {
 		bbr->idle_restart = 1;
+		bbr->ack_epoch_mstamp = tp->tcp_mstamp;
+		bbr->ack_epoch_acked = 0;
 		/* Avoid pointless buffer overflows: pace at est. bw if we don't
 		 * need more speed (we're restarting from idle and app-limited).
 		 */
@@ -315,30 +344,19 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
 	}
 }
 
-/* Find target cwnd. Right-size the cwnd based on min RTT and the
- * estimated bottleneck bandwidth:
+/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth:
  *
- * cwnd = bw * min_rtt * gain = BDP * gain
+ * bdp = bw * min_rtt * gain
  *
  * The key factor, gain, controls the amount of queue. While a small gain
  * builds a smaller queue, it becomes more vulnerable to noise in RTT
  * measurements (e.g., delayed ACKs or other ACK compression effects). This
  * noise may cause BBR to under-estimate the rate.
- *
- * To achieve full performance in high-speed paths, we budget enough cwnd to
- * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
- *   - one skb in sending host Qdisc,
- *   - one skb in sending host TSO/GSO engine
- *   - one skb being received by receiver host LRO/GRO/delayed-ACK engine
- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
- * which allows 2 outstanding 2-packet sequences, to try to keep pipe
- * full even with ACK-every-other-packet delayed ACKs.
  */
-static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
+static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
 {
 	struct bbr *bbr = inet_csk_ca(sk);
-	u32 cwnd;
+	u32 bdp;
 	u64 w;
 
 	/* If we've never had a valid RTT sample, cap cwnd at the initial
@@ -353,7 +371,24 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
 	w = (u64)bw * bbr->min_rtt_us;
 
 	/* Apply a gain to the given value, then remove the BW_SCALE shift. */
-	cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
+	bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
+
+	return bdp;
+}
+
+/* To achieve full performance in high-speed paths, we budget enough cwnd to
+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
+ *   - one skb in sending host Qdisc,
+ *   - one skb in sending host TSO/GSO engine
+ *   - one skb being received by receiver host LRO/GRO/delayed-ACK engine
+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe
+ * full even with ACK-every-other-packet delayed ACKs.
+ */
+static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
 
 	/* Allow enough full-sized skbs in flight to utilize end systems. */
 	cwnd += 3 * bbr_tso_segs_goal(sk);
@@ -368,6 +403,17 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
 	return cwnd;
 }
 
+/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */
+static u32 bbr_inflight(struct sock *sk, u32 bw, int gain)
+{
+	u32 inflight;
+
+	inflight = bbr_bdp(sk, bw, gain);
+	inflight = bbr_quantization_budget(sk, inflight, gain);
+
+	return inflight;
+}
+
 /* With pacing at lower layers, there's often less data "in the network" than
  * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq),
  * we often have several skbs queued in the pacing layer with a pre-scheduled
@@ -401,6 +447,22 @@ static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
 	return inflight_at_edt - interval_delivered;
 }
 
+/* Find the cwnd increment based on estimate of ack aggregation */
+static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
+{
+	u32 max_aggr_cwnd, aggr_cwnd = 0;
+
+	if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) {
+		max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us)
+				/ BW_UNIT;
+		aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk))
+			     >> BBR_SCALE;
+		aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd);
+	}
+
+	return aggr_cwnd;
+}
+
 /* An optimization in BBR to reduce losses: On the first round of recovery, we
  * follow the packet conservation principle: send P packets per P packets acked.
  * After that, we slow-start and send at most 2*P packets per P packets acked.
@@ -461,8 +523,15 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
 	if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
 		goto done;
 
+	target_cwnd = bbr_bdp(sk, bw, gain);
+
+	/* Increment the cwnd to account for excess ACKed data that seems
+	 * due to aggregation (of data and/or ACKs) visible in the ACK stream.
+	 */
+	target_cwnd += bbr_ack_aggregation_cwnd(sk);
+	target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain);
+
 	/* If we're below target cwnd, slow start cwnd toward target cwnd. */
-	target_cwnd = bbr_target_cwnd(sk, bw, gain);
 	if (bbr_full_bw_reached(sk))  /* only cut cwnd if we filled the pipe */
 		cwnd = min(cwnd + acked, target_cwnd);
 	else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
@@ -503,14 +572,14 @@ static bool bbr_is_next_cycle_phase(struct sock *sk,
 	if (bbr->pacing_gain > BBR_UNIT)
 		return is_full_length &&
 			(rs->losses ||  /* perhaps pacing_gain*BDP won't fit */
-			 inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain));
+			 inflight >= bbr_inflight(sk, bw, bbr->pacing_gain));
 
 	/* A pacing_gain < 1.0 tries to drain extra queue we added if bw
 	 * probing didn't find more bw. If inflight falls to match BDP then we
 	 * estimate queue is drained; persisting would underutilize the pipe.
 	 */
 	return is_full_length ||
-		inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT);
+		inflight <= bbr_inflight(sk, bw, BBR_UNIT);
 }
 
 static void bbr_advance_cycle_phase(struct sock *sk)
@@ -727,6 +796,67 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
 	}
 }
 
+/* Estimates the windowed max degree of ack aggregation.
+ * This is used to provision extra in-flight data to keep sending during
+ * inter-ACK silences.
+ *
+ * Degree of ack aggregation is estimated as extra data acked beyond expected.
+ *
+ * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval"
+ * cwnd += max_extra_acked
+ *
+ * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms).
+ * Max filter is an approximate sliding window of 5-10 (packet timed) round
+ * trips.
+ */
+static void bbr_update_ack_aggregation(struct sock *sk,
+				       const struct rate_sample *rs)
+{
+	u32 epoch_us, expected_acked, extra_acked;
+	struct bbr *bbr = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 ||
+	    rs->delivered < 0 || rs->interval_us <= 0)
+		return;
+
+	if (bbr->round_start) {
+		bbr->extra_acked_win_rtts = min(0x1F,
+						bbr->extra_acked_win_rtts + 1);
+		if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) {
+			bbr->extra_acked_win_rtts = 0;
+			bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ?
+						   0 : 1;
+			bbr->extra_acked[bbr->extra_acked_win_idx] = 0;
+		}
+	}
+
+	/* Compute how many packets we expected to be delivered over epoch. */
+	epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp,
+				      bbr->ack_epoch_mstamp);
+	expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT;
+
+	/* Reset the aggregation epoch if ACK rate is below expected rate or
+	 * significantly large no. of ack received since epoch (potentially
+	 * quite old epoch).
+	 */
+	if (bbr->ack_epoch_acked <= expected_acked ||
+	    (bbr->ack_epoch_acked + rs->acked_sacked >=
+	     bbr_ack_epoch_acked_reset_thresh)) {
+		bbr->ack_epoch_acked = 0;
+		bbr->ack_epoch_mstamp = tp->delivered_mstamp;
+		expected_acked = 0;
+	}
+
+	/* Compute excess data delivered, beyond what was expected. */
+	bbr->ack_epoch_acked = min_t(u32, 0xFFFFF,
+				     bbr->ack_epoch_acked + rs->acked_sacked);
+	extra_acked = bbr->ack_epoch_acked - expected_acked;
+	extra_acked = min(extra_acked, tp->snd_cwnd);
+	if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx])
+		bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked;
+}
+
 /* Estimate when the pipe is full, using the change in delivery rate: BBR
  * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
  * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
@@ -762,11 +892,11 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
 	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
 		bbr->mode = BBR_DRAIN;	/* drain queue we created */
 		tcp_sk(sk)->snd_ssthresh =
-				bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT);
+				bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
 	}	/* fall through to check if in-flight is already small: */
 	if (bbr->mode == BBR_DRAIN &&
 	    bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
-	    bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
+	    bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
 		bbr_reset_probe_bw_mode(sk);  /* we estimate queue is drained */
 }
 
@@ -881,6 +1011,7 @@ static void bbr_update_gains(struct sock *sk)
 static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
 {
 	bbr_update_bw(sk, rs);
+	bbr_update_ack_aggregation(sk, rs);
 	bbr_update_cycle_phase(sk, rs);
 	bbr_check_full_bw_reached(sk, rs);
 	bbr_check_drain(sk, rs);
@@ -932,6 +1063,13 @@ static void bbr_init(struct sock *sk)
 	bbr_reset_lt_bw_sampling(sk);
 	bbr_reset_startup_mode(sk);
 
+	bbr->ack_epoch_mstamp = tp->tcp_mstamp;
+	bbr->ack_epoch_acked = 0;
+	bbr->extra_acked_win_rtts = 0;
+	bbr->extra_acked_win_idx = 0;
+	bbr->extra_acked[0] = 0;
+	bbr->extra_acked[1] = 0;
+
 	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
 }
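
To make the arithmetic in this patch easier to follow, below is a small standalone userspace C sketch of the ACK aggregation estimate (illustrative only, not kernel code: the struct aggr_state, update_extra_acked(), aggregation_cwnd(), BW_UNIT and EXTRA_ACKED_MAX_US names below are simplified stand-ins for the kernel's struct bbr fields and bbr_* helpers, and the per-round-trip rotation of the two extra_acked[] slots plus the 1<<20 epoch-reset threshold are omitted). It models the core idea: packets ACKed beyond bw * elapsed_epoch_time look like aggregation and become extra cwnd headroom, clamped by cwnd and by what bw can deliver in 100 ms.

/* Illustrative model of BBR's ACK aggregation estimate (not kernel API).
 * bw is in packets per microsecond, scaled by BW_UNIT fixed point.
 */
#include <stdint.h>
#include <stdio.h>

#define BW_UNIT            (1 << 24)     /* fixed-point scale for bw */
#define EXTRA_ACKED_MAX_US (100 * 1000)  /* clamp: 100 ms worth of data */

struct aggr_state {
	uint64_t epoch_start_us; /* start of current ACK sampling epoch */
	uint32_t epoch_acked;    /* packets (S)ACKed since epoch start */
	uint16_t extra_acked[2]; /* windowed max excess ACKed data */
	int      win_idx;        /* slot currently being filled */
};

/* Process one ACK arrival: "acked" packets newly (S)ACKed at time now_us,
 * with bandwidth estimate bw (pkts/usec << 24) and current cwnd in packets.
 */
static void update_extra_acked(struct aggr_state *st, uint64_t now_us,
			       uint32_t acked, uint64_t bw, uint32_t cwnd)
{
	uint64_t epoch_us = now_us - st->epoch_start_us;
	/* How much the bw model says should have been ACKed this epoch. */
	uint32_t expected = (uint32_t)((bw * epoch_us) / BW_UNIT);
	uint32_t extra;

	/* Restart the epoch if ACKs arrive no faster than expected. */
	if (st->epoch_acked <= expected) {
		st->epoch_acked = 0;
		st->epoch_start_us = now_us;
		expected = 0;
	}

	st->epoch_acked += acked;

	/* Excess beyond the bandwidth-predicted amount, capped by cwnd. */
	extra = st->epoch_acked - expected;
	if (extra > cwnd)
		extra = cwnd;
	if (extra > st->extra_acked[st->win_idx])
		st->extra_acked[st->win_idx] = (uint16_t)extra;
}

/* cwnd headroom for aggregation: max of the two slots, clamped to the
 * amount bw can deliver in EXTRA_ACKED_MAX_US.
 */
static uint32_t aggregation_cwnd(const struct aggr_state *st, uint64_t bw)
{
	uint32_t max_extra = st->extra_acked[0] > st->extra_acked[1] ?
			     st->extra_acked[0] : st->extra_acked[1];
	uint32_t max_allowed = (uint32_t)((bw * EXTRA_ACKED_MAX_US) / BW_UNIT);

	return max_extra < max_allowed ? max_extra : max_allowed;
}

int main(void)
{
	uint64_t bw = BW_UNIT / 100; /* 0.01 pkt/usec = 10,000 pkts/sec */
	struct aggr_state st = { 0 };

	/* 10 packets ACKed at t=1ms, then a 40-packet burst at t=2ms.
	 * At ~10 expected packets/ms, roughly 41 packets look aggregated.
	 */
	update_extra_acked(&st, 1000, 10, bw, 100);
	update_extra_acked(&st, 2000, 40, bw, 100);
	printf("aggregation cwnd bump: %u packets\n",
	       aggregation_cwnd(&st, bw));
	return 0;
}

In the patch itself this bump is only non-zero once bbr_full_bw_reached() is true (i.e., after STARTUP), and bbr_set_cwnd() composes the final target as bbr_quantization_budget(sk, bbr_bdp(sk, bw, gain) + bbr_ack_aggregation_cwnd(sk), gain), so the aggregation allowance rides on top of the BDP-derived cwnd rather than replacing it.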