Inter-revision diff: patch 12

Comparing v2 (message) to v1 (message)

--- v2
+++ v1
@@ -1,5 +1,27 @@
-Export tcp_mss_to_mtu(), so that congestion control modules can use
-this to help calculate a pacing rate.
+From: Yuchung Cheng <ycheng@google.com>
+
+This commit introduces an optional new "omnipotent" hook,
+cong_control(), for congestion control modules. The cong_control()
+function is called at the end of processing an ACK (i.e., after
+updating sequence numbers, the SACK scoreboard, and loss
+detection). At that moment we have precise delivery rate information
+the congestion control module can use to control the sending behavior
+(using cwnd, TSO skb size, and pacing rate) in any CA state.
+
+This function can also be used by a congestion control that prefers
+not to use the default cwnd reduction approach (i.e., the PRR
+algorithm) during CA_Recovery to control the cwnd and sending rate
+during loss recovery.
+
+We take advantage of the fact that recent changes defer the
+retransmission or transmission of new data (e.g. by F-RTO) in recovery
+until the new tcp_cong_control() function is run.
+
+With this commit, we only run tcp_update_pacing_rate() if the
+congestion control is not using this new API. New congestion controls
+which use the new API do not want the TCP stack to run the default
+pacing rate calculation and overwrite whatever pacing rate they have
+chosen at initialization time.
 
 Signed-off-by: Van Jacobson <vanj@google.com>
 Signed-off-by: Neal Cardwell <ncardwell@google.com>
@@ -8,20 +30,88 @@
 Signed-off-by: Eric Dumazet <edumazet@google.com>
 Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
 ---
- net/ipv4/tcp_output.c | 1 +
- 1 file changed, 1 insertion(+)
+ include/net/tcp.h    |  4 ++++
+ net/ipv4/tcp_cong.c  |  2 +-
+ net/ipv4/tcp_input.c | 17 ++++++++++++++---
+ 3 files changed, 19 insertions(+), 4 deletions(-)
 
-diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
-index 0bf3d48..7d025a7 100644
---- a/net/ipv4/tcp_output.c
-+++ b/net/ipv4/tcp_output.c
-@@ -1362,6 +1362,7 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)
+diff --git a/include/net/tcp.h b/include/net/tcp.h
+index c4d2e46..35ec286 100644
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -919,6 +919,10 @@ struct tcp_congestion_ops {
+ 	u32 (*tso_segs_goal)(struct sock *sk);
+ 	/* returns the multiplier used in tcp_sndbuf_expand (optional) */
+ 	u32 (*sndbuf_expand)(struct sock *sk);
++	/* call when packets are delivered to update cwnd and pacing rate,
++	 * after all the ca_state processing. (optional)
++	 */
++	void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
+ 	/* get info for inet_diag (optional) */
+ 	size_t (*get_info)(struct sock *sk, u32 ext, int *attr,
+ 			   union tcp_cc_info *info);
+diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
+index 882caa4..1294af4 100644
+--- a/net/ipv4/tcp_cong.c
++++ b/net/ipv4/tcp_cong.c
+@@ -69,7 +69,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+ 	int ret = 0;
+ 
+ 	/* all algorithms must implement ssthresh and cong_avoid ops */
+-	if (!ca->ssthresh || !ca->cong_avoid) {
++	if (!ca->ssthresh || !(ca->cong_avoid || ca->cong_control)) {
+ 		pr_err("%s does not implement required ops\n", ca->name);
+ 		return -EINVAL;
  	}
- 	return mtu;
- }
-+EXPORT_SYMBOL(tcp_mss_to_mtu);
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index a134e66..931fe32 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -2536,6 +2536,9 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
+ {
+ 	struct tcp_sock *tp = tcp_sk(sk);
  
- /* MTU probing init per socket */
- void tcp_mtup_init(struct sock *sk)
++	if (inet_csk(sk)->icsk_ca_ops->cong_control)
++		return;
++
+ 	/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
+ 	if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
+ 	    (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
+@@ -3312,8 +3315,15 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
+  * information. All transmission or retransmission are delayed afterwards.
+  */
+ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
+-			     int flag)
++			     int flag, const struct rate_sample *rs)
+ {
++	const struct inet_connection_sock *icsk = inet_csk(sk);
++
++	if (icsk->icsk_ca_ops->cong_control) {
++		icsk->icsk_ca_ops->cong_control(sk, rs);
++		return;
++	}
++
+ 	if (tcp_in_cwnd_reduction(sk)) {
+ 		/* Reduce cwnd if state mandates */
+ 		tcp_cwnd_reduction(sk, acked_sacked, flag);
+@@ -3683,7 +3693,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
+ 	delivered = tp->delivered - delivered;	/* freshly ACKed or SACKed */
+ 	lost = tp->lost - lost;			/* freshly marked lost */
+ 	tcp_rate_gen(sk, delivered, lost, &now, &rs);
+-	tcp_cong_control(sk, ack, delivered, flag);
++	tcp_cong_control(sk, ack, delivered, flag, &rs);
+ 	tcp_xmit_recovery(sk, rexmit);
+ 	return 1;
+ 
+@@ -5981,7 +5991,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
+ 		} else
+ 			tcp_init_metrics(sk);
+ 
+-		tcp_update_pacing_rate(sk);
++		if (!inet_csk(sk)->icsk_ca_ops->cong_control)
++			tcp_update_pacing_rate(sk);
+ 
+ 		/* Prevent spurious tcp_cwnd_restart() on first data packet */
+ 		tp->lsndtime = tcp_time_stamp;
 -- 
 2.8.0.rc3.226.g39d4020
Keyboard shortcuts
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help