Inter-revision diff: patch 2

Comparing v4 (message) to v1 (message)

--- v4
+++ v1
@@ -1,21 +1,10 @@
-This commit introduces a generic library to estimate either the min or
-max value of a time-varying variable over a recent time window. This
-is code originally from Kathleen Nichols. The current form of the code
-is from Van Jacobson.
+Refactor the TCP min_rtt code to reuse the new win_minmax library in
+lib/win_minmax.c to simplify the TCP code.
 
-A single struct minmax_sample will track the estimated windowed-max
-value of the series if you call minmax_running_max() or the estimated
-windowed-min value of the series if you call minmax_running_min().
-
-Nearly equivalent code is already in place for minimum RTT estimation
-in the TCP stack. This commit extracts that code and generalizes it to
-handle both min and max. Moving the code here reduces the footprint
-and complexity of the TCP code base and makes the filter generally
-available for other parts of the codebase, including an upcoming TCP
-congestion control module.
-
-This library works well for time series where the measurements are
-smoothly increasing or decreasing.
+This is a pure refactor: the functionality is exactly the same. We
+just moved the windowed min code to make TCP easier to read and
+maintain, and to allow other parts of the kernel to use the windowed
+min/max filter code.
 
 Signed-off-by: Van Jacobson <vanj@google.com>
 Signed-off-by: Neal Cardwell <ncardwell@google.com>
@@ -24,172 +13,151 @@
 Signed-off-by: Eric Dumazet <edumazet@google.com>
 Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
 ---
- include/linux/win_minmax.h | 37 +++++++++++++++++
- lib/Makefile               |  2 +-
- lib/win_minmax.c           | 98 ++++++++++++++++++++++++++++++++++++++++++++++
- 3 files changed, 136 insertions(+), 1 deletion(-)
- create mode 100644 include/linux/win_minmax.h
- create mode 100644 lib/win_minmax.c
+ include/linux/tcp.h      |  5 ++--
+ include/net/tcp.h        |  2 +-
+ net/ipv4/tcp.c           |  2 +-
+ net/ipv4/tcp_input.c     | 64 ++++--------------------------------------------
+ net/ipv4/tcp_minisocks.c |  2 +-
+ 5 files changed, 10 insertions(+), 65 deletions(-)
 
-diff --git a/include/linux/win_minmax.h b/include/linux/win_minmax.h
-new file mode 100644
-index 0000000..5656960
---- /dev/null
-+++ b/include/linux/win_minmax.h
-@@ -0,0 +1,37 @@
-+/**
-+ * lib/minmax.c: windowed min/max tracker by Kathleen Nichols.
-+ *
-+ */
-+#ifndef MINMAX_H
-+#define MINMAX_H
+diff --git a/include/linux/tcp.h b/include/linux/tcp.h
+index c723a46..6433cc8 100644
+--- a/include/linux/tcp.h
++++ b/include/linux/tcp.h
+@@ -19,6 +19,7 @@
+ 
+ 
+ #include <linux/skbuff.h>
++#include <linux/win_minmax.h>
+ #include <net/sock.h>
+ #include <net/inet_connection_sock.h>
+ #include <net/inet_timewait_sock.h>
+@@ -234,9 +235,7 @@ struct tcp_sock {
+ 	u32	mdev_max_us;	/* maximal mdev for the last rtt period	*/
+ 	u32	rttvar_us;	/* smoothed mdev_max			*/
+ 	u32	rtt_seq;	/* sequence number to update rttvar	*/
+-	struct rtt_meas {
+-		u32 rtt, ts;	/* RTT in usec and sampling time in jiffies. */
+-	} rtt_min[3];
++	struct  minmax rtt_min;
+ 
+ 	u32	packets_out;	/* Packets which are "in flight"	*/
+ 	u32	retrans_out;	/* Retransmitted packets out		*/
+diff --git a/include/net/tcp.h b/include/net/tcp.h
+index fdfbedd..2f1648a 100644
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -671,7 +671,7 @@ static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
+ /* Minimum RTT in usec. ~0 means not available. */
+ static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
+ {
+-	return tp->rtt_min[0].rtt;
++	return minmax_get(&tp->rtt_min);
+ }
+ 
+ /* Compute the actual receive window we are currently advertising.
+diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
+index a13fcb3..5b0b49c 100644
+--- a/net/ipv4/tcp.c
++++ b/net/ipv4/tcp.c
+@@ -387,7 +387,7 @@ void tcp_init_sock(struct sock *sk)
+ 
+ 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
+ 	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+-	tp->rtt_min[0].rtt = ~0U;
++	minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U);
+ 
+ 	/* So many TCP implementations out there (incorrectly) count the
+ 	 * initial SYN frame in their delayed-ACK and congestion control
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 70b892d..ac5b38f 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -2879,67 +2879,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
+ 	*rexmit = REXMIT_LOST;
+ }
+ 
+-/* Kathleen Nichols' algorithm for tracking the minimum value of
+- * a data stream over some fixed time interval. (E.g., the minimum
+- * RTT over the past five minutes.) It uses constant space and constant
+- * time per update yet almost always delivers the same minimum as an
+- * implementation that has to keep all the data in the window.
+- *
+- * The algorithm keeps track of the best, 2nd best & 3rd best min
+- * values, maintaining an invariant that the measurement time of the
+- * n'th best >= n-1'th best. It also makes sure that the three values
+- * are widely separated in the time window since that bounds the worse
+- * case error when that data is monotonically increasing over the window.
+- *
+- * Upon getting a new min, we can forget everything earlier because it
+- * has no value - the new min is <= everything else in the window by
+- * definition and it's the most recent. So we restart fresh on every new min
+- * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
+- * best.
+- */
+ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
+ {
+-	const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
+-	struct rtt_meas *m = tcp_sk(sk)->rtt_min;
+-	struct rtt_meas rttm = {
+-		.rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1),
+-		.ts = now,
+-	};
+-	u32 elapsed;
+-
+-	/* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
+-	if (unlikely(rttm.rtt <= m[0].rtt))
+-		m[0] = m[1] = m[2] = rttm;
+-	else if (rttm.rtt <= m[1].rtt)
+-		m[1] = m[2] = rttm;
+-	else if (rttm.rtt <= m[2].rtt)
+-		m[2] = rttm;
+-
+-	elapsed = now - m[0].ts;
+-	if (unlikely(elapsed > wlen)) {
+-		/* Passed entire window without a new min so make 2nd choice
+-		 * the new min & 3rd choice the new 2nd. So forth and so on.
+-		 */
+-		m[0] = m[1];
+-		m[1] = m[2];
+-		m[2] = rttm;
+-		if (now - m[0].ts > wlen) {
+-			m[0] = m[1];
+-			m[1] = rttm;
+-			if (now - m[0].ts > wlen)
+-				m[0] = rttm;
+-		}
+-	} else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
+-		/* Passed a quarter of the window without a new min so
+-		 * take 2nd choice from the 2nd quarter of the window.
+-		 */
+-		m[2] = m[1] = rttm;
+-	} else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
+-		/* Passed half the window without a new min so take the 3rd
+-		 * choice from the last half of the window.
+-		 */
+-		m[2] = rttm;
+-	}
++	struct tcp_sock *tp = tcp_sk(sk);
++	u32 wlen = sysctl_tcp_min_rtt_wlen * HZ;
 +
-+#include <linux/types.h>
-+
-+/* A single data point for our parameterized min-max tracker */
-+struct minmax_sample {
-+	u32	t;	/* time measurement was taken */
-+	u32	v;	/* value measured */
-+};
-+
-+/* State for the parameterized min-max tracker */
-+struct minmax {
-+	struct minmax_sample s[3];
-+};
-+
-+static inline u32 minmax_get(const struct minmax *m)
-+{
-+	return m->s[0].v;
-+}
-+
-+static inline u32 minmax_reset(struct minmax *m, u32 t, u32 meas)
-+{
-+	struct minmax_sample val = { .t = t, .v = meas };
-+
-+	m->s[2] = m->s[1] = m->s[0] = val;
-+	return m->s[0].v;
-+}
-+
-+u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas);
-+u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas);
-+
-+#endif
-diff --git a/lib/Makefile b/lib/Makefile
-index 5dc77a8..df747e5 100644
---- a/lib/Makefile
-+++ b/lib/Makefile
-@@ -22,7 +22,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
- 	 sha1.o chacha20.o md5.o irq_regs.o argv_split.o \
- 	 flex_proportions.o ratelimit.o show_mem.o \
- 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
--	 earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o
-+	 earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o win_minmax.o
++	minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp,
++			   rtt_us ? : jiffies_to_usecs(1));
+ }
  
- lib-$(CONFIG_MMU) += ioremap.o
- lib-$(CONFIG_SMP) += cpumask.o
-diff --git a/lib/win_minmax.c b/lib/win_minmax.c
-new file mode 100644
-index 0000000..c8420d4
---- /dev/null
-+++ b/lib/win_minmax.c
-@@ -0,0 +1,98 @@
-+/**
-+ * lib/minmax.c: windowed min/max tracker
-+ *
-+ * Kathleen Nichols' algorithm for tracking the minimum (or maximum)
-+ * value of a data stream over some fixed time interval.  (E.g.,
-+ * the minimum RTT over the past five minutes.) It uses constant
-+ * space and constant time per update yet almost always delivers
-+ * the same minimum as an implementation that has to keep all the
-+ * data in the window.
-+ *
-+ * The algorithm keeps track of the best, 2nd best & 3rd best min
-+ * values, maintaining an invariant that the measurement time of
-+ * the n'th best >= n-1'th best. It also makes sure that the three
-+ * values are widely separated in the time window since that bounds
-+ * the worse case error when that data is monotonically increasing
-+ * over the window.
-+ *
-+ * Upon getting a new min, we can forget everything earlier because
-+ * it has no value - the new min is <= everything else in the window
-+ * by definition and it's the most recent. So we restart fresh on
-+ * every new min and overwrites 2nd & 3rd choices. The same property
-+ * holds for 2nd & 3rd best.
-+ */
-+#include <linux/module.h>
-+#include <linux/win_minmax.h>
-+
-+/* As time advances, update the 1st, 2nd, and 3rd choices. */
-+static u32 minmax_subwin_update(struct minmax *m, u32 win,
-+				const struct minmax_sample *val)
-+{
-+	u32 dt = val->t - m->s[0].t;
-+
-+	if (unlikely(dt > win)) {
-+		/*
-+		 * Passed entire window without a new val so make 2nd
-+		 * choice the new val & 3rd choice the new 2nd choice.
-+		 * we may have to iterate this since our 2nd choice
-+		 * may also be outside the window (we checked on entry
-+		 * that the third choice was in the window).
-+		 */
-+		m->s[0] = m->s[1];
-+		m->s[1] = m->s[2];
-+		m->s[2] = *val;
-+		if (unlikely(val->t - m->s[0].t > win)) {
-+			m->s[0] = m->s[1];
-+			m->s[1] = m->s[2];
-+			m->s[2] = *val;
-+		}
-+	} else if (unlikely(m->s[1].t == m->s[0].t) && dt > win/4) {
-+		/*
-+		 * We've passed a quarter of the window without a new val
-+		 * so take a 2nd choice from the 2nd quarter of the window.
-+		 */
-+		m->s[2] = m->s[1] = *val;
-+	} else if (unlikely(m->s[2].t == m->s[1].t) && dt > win/2) {
-+		/*
-+		 * We've passed half the window without finding a new val
-+		 * so take a 3rd choice from the last half of the window
-+		 */
-+		m->s[2] = *val;
-+	}
-+	return m->s[0].v;
-+}
-+
-+/* Check if new measurement updates the 1st, 2nd or 3rd choice max. */
-+u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas)
-+{
-+	struct minmax_sample val = { .t = t, .v = meas };
-+
-+	if (unlikely(val.v >= m->s[0].v) ||	  /* found new max? */
-+	    unlikely(val.t - m->s[2].t > win))	  /* nothing left in window? */
-+		return minmax_reset(m, t, meas);  /* forget earlier samples */
-+
-+	if (unlikely(val.v >= m->s[1].v))
-+		m->s[2] = m->s[1] = val;
-+	else if (unlikely(val.v >= m->s[2].v))
-+		m->s[2] = val;
-+
-+	return minmax_subwin_update(m, win, &val);
-+}
-+EXPORT_SYMBOL(minmax_running_max);
-+
-+/* Check if new measurement updates the 1st, 2nd or 3rd choice min. */
-+u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas)
-+{
-+	struct minmax_sample val = { .t = t, .v = meas };
-+
-+	if (unlikely(val.v <= m->s[0].v) ||	  /* found new min? */
-+	    unlikely(val.t - m->s[2].t > win))	  /* nothing left in window? */
-+		return minmax_reset(m, t, meas);  /* forget earlier samples */
-+
-+	if (unlikely(val.v <= m->s[1].v))
-+		m->s[2] = m->s[1] = val;
-+	else if (unlikely(val.v <= m->s[2].v))
-+		m->s[2] = val;
-+
-+	return minmax_subwin_update(m, win, &val);
-+}
+ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
+diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
+index f63c73d..5689471 100644
+--- a/net/ipv4/tcp_minisocks.c
++++ b/net/ipv4/tcp_minisocks.c
+@@ -464,7 +464,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
+ 
+ 		newtp->srtt_us = 0;
+ 		newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+-		newtp->rtt_min[0].rtt = ~0U;
++		minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U);
+ 		newicsk->icsk_rto = TCP_TIMEOUT_INIT;
+ 
+ 		newtp->packets_out = 0;
 -- 
 2.8.0.rc3.226.g39d4020
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help