--- v2
+++ v1
@@ -1,5 +1,27 @@
-Export tcp_mss_to_mtu(), so that congestion control modules can use
-this to help calculate a pacing rate.
+From: Yuchung Cheng <ycheng@google.com>
+
+This commit introduces an optional new "omnipotent" hook,
+cong_control(), for congestion control modules. The cong_control()
+function is called at the end of processing an ACK (i.e., after
+updating sequence numbers, the SACK scoreboard, and loss
+detection). At that moment we have precise delivery rate information
+the congestion control module can use to control the sending behavior
+(using cwnd, TSO skb size, and pacing rate) in any CA state.
+
+This function can also be used by a congestion control module that
+prefers not to use the default cwnd reduction approach (i.e., the PRR
+algorithm) during CA_Recovery, and instead wants to control the cwnd
+and sending rate itself during loss recovery.
+
+We take advantage of the fact that recent changes defer the
+retransmission or transmission of new data (e.g. by F-RTO) in recovery
+until the new tcp_cong_control() function is run.
+
+With this commit, we only run tcp_update_pacing_rate() if the
+congestion control is not using this new API. New congestion controls
+which use the new API do not want the TCP stack to run the default
+pacing rate calculation and overwrite whatever pacing rate they have
+chosen at initialization time.
Signed-off-by: Van Jacobson <vanj@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
@@ -8,20 +30,88 @@
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
---
- net/ipv4/tcp_output.c | 1 +
- 1 file changed, 1 insertion(+)
+ include/net/tcp.h | 4 ++++
+ net/ipv4/tcp_cong.c | 2 +-
+ net/ipv4/tcp_input.c | 17 ++++++++++++++---
+ 3 files changed, 19 insertions(+), 4 deletions(-)
-diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
-index 0bf3d48..7d025a7 100644
---- a/net/ipv4/tcp_output.c
-+++ b/net/ipv4/tcp_output.c
-@@ -1362,6 +1362,7 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)
+diff --git a/include/net/tcp.h b/include/net/tcp.h
+index c4d2e46..35ec286 100644
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -919,6 +919,10 @@ struct tcp_congestion_ops {
+ u32 (*tso_segs_goal)(struct sock *sk);
+ /* returns the multiplier used in tcp_sndbuf_expand (optional) */
+ u32 (*sndbuf_expand)(struct sock *sk);
++ /* call when packets are delivered to update cwnd and pacing rate,
++ * after all the ca_state processing. (optional)
++ */
++ void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
+ /* get info for inet_diag (optional) */
+ size_t (*get_info)(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info);
+diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
+index 882caa4..1294af4 100644
+--- a/net/ipv4/tcp_cong.c
++++ b/net/ipv4/tcp_cong.c
+@@ -69,7 +69,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+ int ret = 0;
+
+ /* all algorithms must implement ssthresh and cong_avoid ops */
+- if (!ca->ssthresh || !ca->cong_avoid) {
++ if (!ca->ssthresh || !(ca->cong_avoid || ca->cong_control)) {
+ pr_err("%s does not implement required ops\n", ca->name);
+ return -EINVAL;
}
- return mtu;
- }
-+EXPORT_SYMBOL(tcp_mss_to_mtu);
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index a134e66..931fe32 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -2536,6 +2536,9 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
- /* MTU probing init per socket */
- void tcp_mtup_init(struct sock *sk)
++ if (inet_csk(sk)->icsk_ca_ops->cong_control)
++ return;
++
+ /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
+ (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
+@@ -3312,8 +3315,15 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
+ * information. All transmission or retransmission are delayed afterwards.
+ */
+ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
+- int flag)
++ int flag, const struct rate_sample *rs)
+ {
++ const struct inet_connection_sock *icsk = inet_csk(sk);
++
++ if (icsk->icsk_ca_ops->cong_control) {
++ icsk->icsk_ca_ops->cong_control(sk, rs);
++ return;
++ }
++
+ if (tcp_in_cwnd_reduction(sk)) {
+ /* Reduce cwnd if state mandates */
+ tcp_cwnd_reduction(sk, acked_sacked, flag);
+@@ -3683,7 +3693,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
+ delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
+ lost = tp->lost - lost; /* freshly marked lost */
+ tcp_rate_gen(sk, delivered, lost, &now, &rs);
+- tcp_cong_control(sk, ack, delivered, flag);
++ tcp_cong_control(sk, ack, delivered, flag, &rs);
+ tcp_xmit_recovery(sk, rexmit);
+ return 1;
+
+@@ -5981,7 +5991,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
+ } else
+ tcp_init_metrics(sk);
+
+- tcp_update_pacing_rate(sk);
++ if (!inet_csk(sk)->icsk_ca_ops->cong_control)
++ tcp_update_pacing_rate(sk);
+
+ /* Prevent spurious tcp_cwnd_restart() on first data packet */
+ tp->lsndtime = tcp_time_stamp;
--
2.8.0.rc3.226.g39d4020