mirror of
https://github.com/Ysurac/openmptcprouter.git
synced 2025-02-13 03:41:54 +00:00
Update MPTCP
This commit is contained in:
parent
7c795b345d
commit
4393b4dcbc
1 changed file with 189 additions and 31 deletions
|
@ -23651,36 +23651,6 @@ diff -aurN linux-5.4.64/tools/include/uapi/linux/bpf.h linux-5.4.64.mptcp/tools/
|
|||
|
||||
BPF_TCP_MAX_STATES /* Leave at the end! */
|
||||
};
|
||||
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
|
||||
index 37e229d2f615..b428f61d959c 100644
|
||||
--- linux-5.4.64/net/ipv4/tcp_input.c
|
||||
+++ linux-5.4.64.mptcp/net/ipv4/tcp_input.c
|
||||
@@ -4842,7 +4842,24 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
|
||||
* Out of sequence packets to the out_of_order_queue.
|
||||
*/
|
||||
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
|
||||
- if (tcp_receive_window(tp) == 0) {
|
||||
+ /* Receiving data on a zero window in MPTCP can occur due to
|
||||
+ * reinjected data sent on another subflow filling the
|
||||
+ * window. This semi-frequently occurs due to penalization
|
||||
+ * while initially growing the congestion window.
|
||||
+ * For the subflow, dropping the packet is seen (wrongly) as a
|
||||
+ * loss, impacting the congestion control.
|
||||
+ *
|
||||
+ * To avoid this, accept the packet at the subflow level, and
|
||||
+ * let the meta handle the segment.
|
||||
+ * If it was a duplicate segment, or if it was a new segment
|
||||
+ * somehow (a bug in the sender), it is up to the meta level to
|
||||
+ * handle this and drop the segment. mptcp_data_ready is able to
|
||||
+ * handle either case.
|
||||
+ *
|
||||
+ * We still check for rmem constraints, so there is no risk of
|
||||
+ * queueing too much data.
|
||||
+ */
|
||||
+ if (tcp_receive_window(tp) == 0 && !mptcp(tp)) {
|
||||
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
|
||||
goto out_of_window;
|
||||
}
|
||||
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
|
||||
index 221e055623c1..49555fee79b4 100644
|
||||
--- a/net/ipv4/tcp_output.c
|
||||
|
@ -23825,3 +23795,191 @@ index 39a997f84209..a4d8c4a5e52d 100644
|
|||
/* No splitting required, as we will only send one single segment */
|
||||
if (skb->len <= mss_now)
|
||||
return skb;
|
||||
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
|
||||
index 04fcc5219f7b..970fb566f94d 100644
|
||||
--- a/include/linux/tcp.h
|
||||
+++ b/include/linux/tcp.h
|
||||
@@ -348,6 +348,7 @@ struct tcp_sock {
|
||||
u32 rate_interval_us; /* saved rate sample: time elapsed */
|
||||
|
||||
u32 rcv_wnd; /* Current receiver window */
|
||||
+ u32 rcv_right_edge; /* Highest announced right edge */
|
||||
u32 write_seq; /* Tail(+1) of data held in tcp send buffer */
|
||||
u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */
|
||||
u32 pushed_seq; /* Last pushed seq, required to talk to windows */
|
||||
diff --git a/include/net/tcp.h b/include/net/tcp.h
|
||||
index 3e4f5179a835..93d53f5d5359 100644
|
||||
--- a/include/net/tcp.h
|
||||
+++ b/include/net/tcp.h
|
||||
@@ -850,6 +850,30 @@ static inline u32 tcp_receive_window(const struct tcp_sock *tp)
|
||||
return (u32) win;
|
||||
}
|
||||
|
||||
+/* The right edge only moves forward, even if the window shrinks due
|
||||
+ * to the MPTCP meta level.
|
||||
+ */
|
||||
+static inline void tcp_update_rcv_right_edge(struct tcp_sock *tp)
|
||||
+{
|
||||
+ if (after(tp->rcv_wup + tp->rcv_wnd, tp->rcv_right_edge))
|
||||
+ tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd;
|
||||
+}
|
||||
+
|
||||
+/* Compute receive window which will never shrink. The way MPTCP handles
|
||||
+ * the receive window can cause the effective right edge to shrink,
|
||||
+ * causing valid segments to become out of window.
|
||||
+ * This function should be used when checking if a segment is valid for
|
||||
+ * the max right edge announced.
|
||||
+ */
|
||||
+static inline u32 tcp_receive_window_no_shrink(const struct tcp_sock *tp)
|
||||
+{
|
||||
+ s32 win = tp->rcv_right_edge - tp->rcv_nxt;
|
||||
+
|
||||
+ if (win < 0)
|
||||
+ win = 0;
|
||||
+ return (u32) win;
|
||||
+}
|
||||
+
|
||||
/* Choose a new window, without checks for shrinking, and without
|
||||
* scaling applied to the result. The caller does these things
|
||||
* if necessary. This is a "raw" window selection.
|
||||
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
|
||||
index 617c06364516..81d35b7b00c0 100644
|
||||
--- a/net/ipv4/tcp.c
|
||||
+++ b/net/ipv4/tcp.c
|
||||
@@ -2824,6 +2824,7 @@ static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int l
|
||||
|
||||
tp->rcv_wnd = opt.rcv_wnd;
|
||||
tp->rcv_wup = opt.rcv_wup;
|
||||
+ tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd;
|
||||
|
||||
return 0;
|
||||
}
|
||||
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
|
||||
index 42d7a7d208df..88c211d28bc5 100644
|
||||
--- a/net/ipv4/tcp_fastopen.c
|
||||
+++ b/net/ipv4/tcp_fastopen.c
|
||||
@@ -277,6 +277,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
|
||||
|
||||
tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
|
||||
tp->rcv_wup = tp->rcv_nxt;
|
||||
+ tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd;
|
||||
|
||||
meta_sk = child;
|
||||
ret = mptcp_check_req_fastopen(meta_sk, req);
|
||||
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
|
||||
index 37e229d2f615..d968cc6fddf7 100644
|
||||
--- a/net/ipv4/tcp_input.c
|
||||
+++ b/net/ipv4/tcp_input.c
|
||||
@@ -4122,7 +4122,7 @@ static inline bool tcp_paws_discard(const struct sock *sk,
|
||||
static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
|
||||
{
|
||||
return !before(end_seq, tp->rcv_wup) &&
|
||||
- !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
|
||||
+ !after(seq, tp->rcv_nxt + tcp_receive_window_no_shrink(tp));
|
||||
}
|
||||
|
||||
/* When we get a reset we do this. */
|
||||
@@ -4842,7 +4842,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
|
||||
* Out of sequence packets to the out_of_order_queue.
|
||||
*/
|
||||
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
|
||||
- if (tcp_receive_window(tp) == 0) {
|
||||
+ if (tcp_receive_window_no_shrink(tp) == 0) {
|
||||
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
|
||||
goto out_of_window;
|
||||
}
|
||||
@@ -4903,7 +4903,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
|
||||
}
|
||||
|
||||
/* Out of window. F.e. zero window probe. */
|
||||
- if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
|
||||
+ if (!before(TCP_SKB_CB(skb)->seq,
|
||||
+ tp->rcv_nxt + tcp_receive_window_no_shrink(tp)))
|
||||
goto out_of_window;
|
||||
|
||||
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
|
||||
@@ -4913,7 +4914,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
|
||||
/* If window is closed, drop tail of packet. But after
|
||||
* remembering D-SACK for its head made in previous line.
|
||||
*/
|
||||
- if (!tcp_receive_window(tp)) {
|
||||
+ if (!tcp_receive_window_no_shrink(tp)) {
|
||||
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
|
||||
goto out_of_window;
|
||||
}
|
||||
@@ -6069,6 +6070,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
|
||||
*/
|
||||
WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
|
||||
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
|
||||
+ tcp_update_rcv_right_edge(tp);
|
||||
|
||||
/* RFC1323: The window in SYN & SYN/ACK segments is
|
||||
* never scaled.
|
||||
@@ -6187,6 +6189,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
|
||||
WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
|
||||
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
|
||||
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
|
||||
+ tcp_update_rcv_right_edge(tp);
|
||||
|
||||
/* RFC1323: The window in SYN & SYN/ACK segments is
|
||||
* never scaled.
|
||||
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
|
||||
index c4b489bfa9ae..fa9f63e3caaa 100644
|
||||
--- a/net/ipv4/tcp_minisocks.c
|
||||
+++ b/net/ipv4/tcp_minisocks.c
|
||||
@@ -550,6 +550,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
|
||||
newtp->window_clamp = req->rsk_window_clamp;
|
||||
newtp->rcv_ssthresh = req->rsk_rcv_wnd;
|
||||
newtp->rcv_wnd = req->rsk_rcv_wnd;
|
||||
+ newtp->rcv_right_edge = newtp->rcv_wnd + newtp->rcv_wup;
|
||||
newtp->rx_opt.wscale_ok = ireq->wscale_ok;
|
||||
if (newtp->rx_opt.wscale_ok) {
|
||||
newtp->rx_opt.snd_wscale = ireq->snd_wscale;
|
||||
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
|
||||
index 221e055623c1..0f3bb4467133 100644
|
||||
--- a/net/ipv4/tcp_output.c
|
||||
+++ b/net/ipv4/tcp_output.c
|
||||
@@ -283,6 +283,7 @@ u16 tcp_select_window(struct sock *sk)
|
||||
|
||||
tp->rcv_wnd = new_win;
|
||||
tp->rcv_wup = tp->rcv_nxt;
|
||||
+ tcp_update_rcv_right_edge(tp);
|
||||
|
||||
/* Make sure we do not exceed the maximum possible
|
||||
* scaled window.
|
||||
@@ -3484,6 +3485,8 @@ static void tcp_connect_init(struct sock *sk)
|
||||
else
|
||||
tp->rcv_tstamp = tcp_jiffies32;
|
||||
tp->rcv_wup = tp->rcv_nxt;
|
||||
+ /* force set rcv_right_edge here at start of connection */
|
||||
+ tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd;
|
||||
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
|
||||
|
||||
inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
|
||||
diff --git a/net/mptcp/mptcp_ctrl.c b/net/mptcp/mptcp_ctrl.c
|
||||
index a6bbb7a688ba..9210e755ae3d 100644
|
||||
--- a/net/mptcp/mptcp_ctrl.c
|
||||
+++ b/net/mptcp/mptcp_ctrl.c
|
||||
@@ -1278,6 +1278,7 @@ void mptcp_initialize_recv_vars(struct tcp_sock *meta_tp, struct mptcp_cb *mpcb,
|
||||
meta_tp->copied_seq = (u32)idsn;
|
||||
meta_tp->rcv_nxt = (u32)idsn;
|
||||
meta_tp->rcv_wup = (u32)idsn;
|
||||
+ meta_tp->rcv_right_edge = meta_tp->rcv_wup + meta_tp->rcv_wnd;
|
||||
|
||||
meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1;
|
||||
}
|
||||
diff --git a/net/mptcp/mptcp_output.c b/net/mptcp/mptcp_output.c
|
||||
index fc71d41c608d..bdea1a26e3fc 100644
|
||||
--- a/net/mptcp/mptcp_output.c
|
||||
+++ b/net/mptcp/mptcp_output.c
|
||||
@@ -1229,6 +1229,10 @@ u16 mptcp_select_window(struct sock *sk)
|
||||
|
||||
meta_tp->rcv_wnd = tp->rcv_wnd;
|
||||
meta_tp->rcv_wup = meta_tp->rcv_nxt;
|
||||
+ /* no need to use tcp_update_rcv_right_edge, because at the meta level
|
||||
+ * the right edge cannot move backwards
|
||||
+ */
|
||||
+ meta_tp->rcv_right_edge = meta_tp->rcv_wnd + meta_tp->rcv_wup;
|
||||
|
||||
return new_win;
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue