mirror of
https://github.com/Ysurac/openmptcprouter.git
synced 2025-02-15 04:42:02 +00:00
Update MPTCP
This commit is contained in:
parent
7c795b345d
commit
4393b4dcbc
1 changed files with 189 additions and 31 deletions
|
@ -23651,36 +23651,6 @@ diff -aurN linux-5.4.64/tools/include/uapi/linux/bpf.h linux-5.4.64.mptcp/tools/
|
||||||
|
|
||||||
BPF_TCP_MAX_STATES /* Leave at the end! */
|
BPF_TCP_MAX_STATES /* Leave at the end! */
|
||||||
};
|
};
|
||||||
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
|
|
||||||
index 37e229d2f615..b428f61d959c 100644
|
|
||||||
--- linux-5.4.64/net/ipv4/tcp_input.c
|
|
||||||
+++ linux-5.4.64.mptcp/net/ipv4/tcp_input.c
|
|
||||||
@@ -4842,7 +4842,24 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
|
|
||||||
* Out of sequence packets to the out_of_order_queue.
|
|
||||||
*/
|
|
||||||
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
|
|
||||||
- if (tcp_receive_window(tp) == 0) {
|
|
||||||
+ /* Receiving data on a zero window in MPTCP can occur due to
|
|
||||||
+ * reinjected data sent on another subflow filling the
|
|
||||||
+ * window. This semi-frequently occurs due to penalization
|
|
||||||
+ * while initially growing the congestion window.
|
|
||||||
+ * For the subflow, dropping the packet is seen (wrongly) as a
|
|
||||||
+ * loss, impacting the congestion control.
|
|
||||||
+ *
|
|
||||||
+ * To avoid this, accept the packet at the subflow level, and
|
|
||||||
+ * let the meta handle the segment.
|
|
||||||
+ * If it was a duplicate segment, or if it was a new segment
|
|
||||||
+ * somehow (a bug in the sender), it is up to the meta level to
|
|
||||||
+ * handle this and drop the segment. mptcp_data_ready is able to
|
|
||||||
+ * handle either case.
|
|
||||||
+ *
|
|
||||||
+ * We still check for rmem constraints, so there is no risk of
|
|
||||||
+ * queueing too much data.
|
|
||||||
+ */
|
|
||||||
+ if (tcp_receive_window(tp) == 0 && !mptcp(tp)) {
|
|
||||||
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
|
|
||||||
goto out_of_window;
|
|
||||||
}
|
|
||||||
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
|
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
|
||||||
index 221e055623c1..49555fee79b4 100644
|
index 221e055623c1..49555fee79b4 100644
|
||||||
--- a/net/ipv4/tcp_output.c
|
--- a/net/ipv4/tcp_output.c
|
||||||
|
@ -23824,4 +23794,192 @@ index 39a997f84209..a4d8c4a5e52d 100644
|
||||||
+
|
+
|
||||||
/* No splitting required, as we will only send one single segment */
|
/* No splitting required, as we will only send one single segment */
|
||||||
if (skb->len <= mss_now)
|
if (skb->len <= mss_now)
|
||||||
return skb;
|
return skb;
|
||||||
|
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
|
||||||
|
index 04fcc5219f7b..970fb566f94d 100644
|
||||||
|
--- a/include/linux/tcp.h
|
||||||
|
+++ b/include/linux/tcp.h
|
||||||
|
@@ -348,6 +348,7 @@ struct tcp_sock {
|
||||||
|
u32 rate_interval_us; /* saved rate sample: time elapsed */
|
||||||
|
|
||||||
|
u32 rcv_wnd; /* Current receiver window */
|
||||||
|
+ u32 rcv_right_edge; /* Highest announced right edge */
|
||||||
|
u32 write_seq; /* Tail(+1) of data held in tcp send buffer */
|
||||||
|
u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */
|
||||||
|
u32 pushed_seq; /* Last pushed seq, required to talk to windows */
|
||||||
|
diff --git a/include/net/tcp.h b/include/net/tcp.h
|
||||||
|
index 3e4f5179a835..93d53f5d5359 100644
|
||||||
|
--- a/include/net/tcp.h
|
||||||
|
+++ b/include/net/tcp.h
|
||||||
|
@@ -850,6 +850,30 @@ static inline u32 tcp_receive_window(const struct tcp_sock *tp)
|
||||||
|
return (u32) win;
|
||||||
|
}
|
||||||
|
|
||||||
|
+/* right edge only moves forward, even if window shrinks due
|
||||||
|
+ * to mptcp meta
|
||||||
|
+ */
|
||||||
|
+static inline void tcp_update_rcv_right_edge(struct tcp_sock *tp)
|
||||||
|
+{
|
||||||
|
+ if (after(tp->rcv_wup + tp->rcv_wnd, tp->rcv_right_edge))
|
||||||
|
+ tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+/* Compute receive window which will never shrink. The way MPTCP handles
|
||||||
|
+ * the receive window can cause the effective right edge to shrink,
|
||||||
|
+ * causing valid segments to become out of window.
|
||||||
|
+ * This function should be used when checking if a segment is valid for
|
||||||
|
+ * the max right edge announced.
|
||||||
|
+ */
|
||||||
|
+static inline u32 tcp_receive_window_no_shrink(const struct tcp_sock *tp)
|
||||||
|
+{
|
||||||
|
+ s32 win = tp->rcv_right_edge - tp->rcv_nxt;
|
||||||
|
+
|
||||||
|
+ if (win < 0)
|
||||||
|
+ win = 0;
|
||||||
|
+ return (u32) win;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
/* Choose a new window, without checks for shrinking, and without
|
||||||
|
* scaling applied to the result. The caller does these things
|
||||||
|
* if necessary. This is a "raw" window selection.
|
||||||
|
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
|
||||||
|
index 617c06364516..81d35b7b00c0 100644
|
||||||
|
--- a/net/ipv4/tcp.c
|
||||||
|
+++ b/net/ipv4/tcp.c
|
||||||
|
@@ -2824,6 +2824,7 @@ static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int l
|
||||||
|
|
||||||
|
tp->rcv_wnd = opt.rcv_wnd;
|
||||||
|
tp->rcv_wup = opt.rcv_wup;
|
||||||
|
+ tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
|
||||||
|
index 42d7a7d208df..88c211d28bc5 100644
|
||||||
|
--- a/net/ipv4/tcp_fastopen.c
|
||||||
|
+++ b/net/ipv4/tcp_fastopen.c
|
||||||
|
@@ -277,6 +277,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
|
||||||
|
|
||||||
|
tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
|
||||||
|
tp->rcv_wup = tp->rcv_nxt;
|
||||||
|
+ tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd;
|
||||||
|
|
||||||
|
meta_sk = child;
|
||||||
|
ret = mptcp_check_req_fastopen(meta_sk, req);
|
||||||
|
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
|
||||||
|
index 37e229d2f615..d968cc6fddf7 100644
|
||||||
|
--- a/net/ipv4/tcp_input.c
|
||||||
|
+++ b/net/ipv4/tcp_input.c
|
||||||
|
@@ -4122,7 +4122,7 @@ static inline bool tcp_paws_discard(const struct sock *sk,
|
||||||
|
static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
|
||||||
|
{
|
||||||
|
return !before(end_seq, tp->rcv_wup) &&
|
||||||
|
- !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
|
||||||
|
+ !after(seq, tp->rcv_nxt + tcp_receive_window_no_shrink(tp));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* When we get a reset we do this. */
|
||||||
|
@@ -4842,7 +4842,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
|
||||||
|
* Out of sequence packets to the out_of_order_queue.
|
||||||
|
*/
|
||||||
|
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
|
||||||
|
- if (tcp_receive_window(tp) == 0) {
|
||||||
|
+ if (tcp_receive_window_no_shrink(tp) == 0) {
|
||||||
|
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
|
||||||
|
goto out_of_window;
|
||||||
|
}
|
||||||
|
@@ -4903,7 +4903,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Out of window. F.e. zero window probe. */
|
||||||
|
- if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
|
||||||
|
+ if (!before(TCP_SKB_CB(skb)->seq,
|
||||||
|
+ tp->rcv_nxt + tcp_receive_window_no_shrink(tp)))
|
||||||
|
goto out_of_window;
|
||||||
|
|
||||||
|
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
|
||||||
|
@@ -4913,7 +4914,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
|
||||||
|
/* If window is closed, drop tail of packet. But after
|
||||||
|
* remembering D-SACK for its head made in previous line.
|
||||||
|
*/
|
||||||
|
- if (!tcp_receive_window(tp)) {
|
||||||
|
+ if (!tcp_receive_window_no_shrink(tp)) {
|
||||||
|
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
|
||||||
|
goto out_of_window;
|
||||||
|
}
|
||||||
|
@@ -6069,6 +6070,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
|
||||||
|
*/
|
||||||
|
WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
|
||||||
|
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
|
||||||
|
+ tcp_update_rcv_right_edge(tp);
|
||||||
|
|
||||||
|
/* RFC1323: The window in SYN & SYN/ACK segments is
|
||||||
|
* never scaled.
|
||||||
|
@@ -6187,6 +6189,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
|
||||||
|
WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
|
||||||
|
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
|
||||||
|
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
|
||||||
|
+ tcp_update_rcv_right_edge(tp);
|
||||||
|
|
||||||
|
/* RFC1323: The window in SYN & SYN/ACK segments is
|
||||||
|
* never scaled.
|
||||||
|
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
|
||||||
|
index c4b489bfa9ae..fa9f63e3caaa 100644
|
||||||
|
--- a/net/ipv4/tcp_minisocks.c
|
||||||
|
+++ b/net/ipv4/tcp_minisocks.c
|
||||||
|
@@ -550,6 +550,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
|
||||||
|
newtp->window_clamp = req->rsk_window_clamp;
|
||||||
|
newtp->rcv_ssthresh = req->rsk_rcv_wnd;
|
||||||
|
newtp->rcv_wnd = req->rsk_rcv_wnd;
|
||||||
|
+ newtp->rcv_right_edge = newtp->rcv_wnd + newtp->rcv_wup;
|
||||||
|
newtp->rx_opt.wscale_ok = ireq->wscale_ok;
|
||||||
|
if (newtp->rx_opt.wscale_ok) {
|
||||||
|
newtp->rx_opt.snd_wscale = ireq->snd_wscale;
|
||||||
|
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
|
||||||
|
index 221e055623c1..0f3bb4467133 100644
|
||||||
|
--- a/net/ipv4/tcp_output.c
|
||||||
|
+++ b/net/ipv4/tcp_output.c
|
||||||
|
@@ -283,6 +283,7 @@ u16 tcp_select_window(struct sock *sk)
|
||||||
|
|
||||||
|
tp->rcv_wnd = new_win;
|
||||||
|
tp->rcv_wup = tp->rcv_nxt;
|
||||||
|
+ tcp_update_rcv_right_edge(tp);
|
||||||
|
|
||||||
|
/* Make sure we do not exceed the maximum possible
|
||||||
|
* scaled window.
|
||||||
|
@@ -3484,6 +3485,8 @@ static void tcp_connect_init(struct sock *sk)
|
||||||
|
else
|
||||||
|
tp->rcv_tstamp = tcp_jiffies32;
|
||||||
|
tp->rcv_wup = tp->rcv_nxt;
|
||||||
|
+ /* force set rcv_right_edge here at start of connection */
|
||||||
|
+ tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd;
|
||||||
|
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
|
||||||
|
|
||||||
|
inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
|
||||||
|
diff --git a/net/mptcp/mptcp_ctrl.c b/net/mptcp/mptcp_ctrl.c
|
||||||
|
index a6bbb7a688ba..9210e755ae3d 100644
|
||||||
|
--- a/net/mptcp/mptcp_ctrl.c
|
||||||
|
+++ b/net/mptcp/mptcp_ctrl.c
|
||||||
|
@@ -1278,6 +1278,7 @@ void mptcp_initialize_recv_vars(struct tcp_sock *meta_tp, struct mptcp_cb *mpcb,
|
||||||
|
meta_tp->copied_seq = (u32)idsn;
|
||||||
|
meta_tp->rcv_nxt = (u32)idsn;
|
||||||
|
meta_tp->rcv_wup = (u32)idsn;
|
||||||
|
+ meta_tp->rcv_right_edge = meta_tp->rcv_wup + meta_tp->rcv_wnd;
|
||||||
|
|
||||||
|
meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1;
|
||||||
|
}
|
||||||
|
diff --git a/net/mptcp/mptcp_output.c b/net/mptcp/mptcp_output.c
|
||||||
|
index fc71d41c608d..bdea1a26e3fc 100644
|
||||||
|
--- a/net/mptcp/mptcp_output.c
|
||||||
|
+++ b/net/mptcp/mptcp_output.c
|
||||||
|
@@ -1229,6 +1229,10 @@ u16 mptcp_select_window(struct sock *sk)
|
||||||
|
|
||||||
|
meta_tp->rcv_wnd = tp->rcv_wnd;
|
||||||
|
meta_tp->rcv_wup = meta_tp->rcv_nxt;
|
||||||
|
+ /* no need to use tcp_update_rcv_right_edge, because at the meta level
|
||||||
|
+ * right edge cannot go back
|
||||||
|
+ */
|
||||||
|
+ meta_tp->rcv_right_edge = meta_tp->rcv_wnd + meta_tp->rcv_wup;
|
||||||
|
|
||||||
|
return new_win;
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue