All the mail mirrored from lore.kernel.org
 help / color / mirror / Atom feed
From: Leonard Crestez <cdleonard@gmail.com>
To: Neal Cardwell <ncardwell@google.com>,
	Matt Mathis <mattmathis@google.com>
Cc: "David S. Miller" <davem@davemloft.net>,
	Eric Dumazet <edumazet@google.com>,
	Willem de Bruijn <willemb@google.com>,
	Jakub Kicinski <kuba@kernel.org>,
	Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>,
	David Ahern <dsahern@kernel.org>,
	John Heffner <johnwheffner@gmail.com>,
	Leonard Crestez <lcrestez@drivenets.com>,
	Soheil Hassas Yeganeh <soheil@google.com>,
	Roopa Prabhu <roopa@cumulusnetworks.com>,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [RFC 2/3] tcp: Use mtu probes if RACK is enabled
Date: Tue, 11 May 2021 15:04:17 +0300	[thread overview]
Message-ID: <b3be1a00fa6e242709ce2cfbd10b09d22934e73e.1620733594.git.cdleonard@gmail.com> (raw)
In-Reply-To: <cover.1620733594.git.cdleonard@gmail.com>

RACK allows detecting a loss in min_rtt / 4 based on just one extra
packet. If RACK is enabled, use this instead of relying on fast retransmit.

Suggested-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Leonard Crestez <cdleonard@gmail.com>
---
 Documentation/networking/ip-sysctl.rst |  5 +++++
 include/net/netns/ipv4.h               |  1 +
 net/ipv4/sysctl_net_ipv4.c             |  7 +++++++
 net/ipv4/tcp_ipv4.c                    |  1 +
 net/ipv4/tcp_output.c                  | 22 +++++++++++++++++++++-
 5 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 108a5ee227d3..4f6ac69f61e7 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -325,10 +325,15 @@ tcp_mtu_probe_floor - INTEGER
 tcp_mtu_probe_autocork - BOOLEAN
 	Take into account mtu probe size when accumulating data via autocorking.
 
 	Default: 1
 
+tcp_mtu_probe_rack - BOOLEAN
+	Try to require less data after an MTU probe if RACK is also enabled
+
+	Default: 1
+
 tcp_min_snd_mss - INTEGER
 	TCP SYN and SYNACK messages usually advertise an ADVMSS option,
 	as described in RFC 1122 and RFC 6691.
 
 	If this ADVMSS option is smaller than tcp_min_snd_mss,
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 3a2d8bf2b20a..298e65d8605c 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -113,10 +113,11 @@ struct netns_ipv4 {
 	u8 sysctl_tcp_l3mdev_accept;
 #endif
 	u8 sysctl_tcp_mtu_probing;
 	int sysctl_tcp_mtu_probe_floor;
 	int sysctl_tcp_mtu_probe_autocork;
+	int sysctl_tcp_mtu_probe_rack;
 	int sysctl_tcp_base_mss;
 	int sysctl_tcp_min_snd_mss;
 	int sysctl_tcp_probe_threshold;
 	u32 sysctl_tcp_probe_interval;
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e19176c17973..f9366f35ff9c 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -834,10 +834,17 @@ static struct ctl_table ipv4_net_table[] = {
 		.data		= &init_net.ipv4.sysctl_tcp_mtu_probe_autocork,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "tcp_mtu_probe_rack",
+		.data		= &init_net.ipv4.sysctl_tcp_mtu_probe_rack,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{
 		.procname	= "tcp_probe_threshold",
 		.data		= &init_net.ipv4.sysctl_tcp_probe_threshold,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7e75423c08c9..4928fcd6e233 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2890,10 +2890,11 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
 	net->ipv4.sysctl_tcp_mtu_probe_autocork = 1;
+	net->ipv4.sysctl_tcp_mtu_probe_rack = 1;
 
 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5a320d792ec4..7cd1e8fd9749 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2311,27 +2311,47 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
 	}
 
 	return true;
 }
 
+static int tcp_mtu_probe_is_rack(const struct sock *sk)
+{
+	struct net *net = sock_net(sk);
+
+	return (net->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION &&
+			net->ipv4.sysctl_tcp_mtu_probe_rack);
+}
+
 /* Calculate the size of an MTU probe
  * Probing the MTU requires one packets which is larger that current MSS as well
  * as enough following mtu-sized packets to ensure that a probe loss can be
  * detected without a full Retransmit Time out.
  */
 int tcp_mtu_probe_size_needed(struct sock *sk, int *probe_size)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
 	int probe_size_val;
 	int size_needed;
 
 	/* This might be a little slow: */
 	probe_size_val = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high + icsk->icsk_mtup.search_low) >> 1);
 	if (probe_size)
 		*probe_size = probe_size_val;
-	size_needed = probe_size_val + (tp->reordering + 1) * tp->mss_cache;
+
+	if (tcp_mtu_probe_is_rack(sk)) {
+		/* RACK allows recovering in min_rtt / 4 based on just one extra packet
+		 * Use two to account for unrelated losses
+		 */
+		size_needed = probe_size_val + 2 * tp->mss_cache;
+	} else {
+		/* Without RACK send enough extra packets to trigger fast retransmit
+		 * This is dynamic DupThresh + 1
+		 */
+		size_needed = probe_size_val + (tp->reordering + 1) * tp->mss_cache;
+	}
 
 	return size_needed;
 }
 
 /* Create a new MTU probe if we are ready.
-- 
2.25.1


  parent reply	other threads:[~2021-05-11 12:04 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-05-11 12:04 [RFC 0/3] tcp: Improve mtu probe preconditions Leonard Crestez
2021-05-11 12:04 ` [RFC 1/3] tcp: Consider mtu probing for tcp_xmit_size_goal Leonard Crestez
2021-05-11 13:04   ` Eric Dumazet
2021-05-17 13:42     ` Leonard Crestez
2021-05-11 12:04 ` Leonard Crestez [this message]
2021-05-11 12:04 ` [RFC 3/3] tcp: Adjust congestion window handling for mtu probe Leonard Crestez

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=b3be1a00fa6e242709ce2cfbd10b09d22934e73e.1620733594.git.cdleonard@gmail.com \
    --to=cdleonard@gmail.com \
    --cc=davem@davemloft.net \
    --cc=dsahern@kernel.org \
    --cc=edumazet@google.com \
    --cc=johnwheffner@gmail.com \
    --cc=kuba@kernel.org \
    --cc=lcrestez@drivenets.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mattmathis@google.com \
    --cc=ncardwell@google.com \
    --cc=netdev@vger.kernel.org \
    --cc=roopa@cumulusnetworks.com \
    --cc=soheil@google.com \
    --cc=willemb@google.com \
    --cc=yoshfuji@linux-ipv6.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.