diff -aurN '--exclude=.git' mptcp-mptcp_trunk/include/net/mptcp.h mptcp/include/net/mptcp.h
--- mptcp-mptcp_trunk/include/net/mptcp.h	2020-02-20 18:07:47.000000000 +0100
+++ mptcp/include/net/mptcp.h	2020-05-14 15:15:39.929940266 +0200
@@ -102,7 +102,8 @@
 	u8	loc_id;
 	u8	rem_id;		/* Address-id in the MP_JOIN */
-	u8	dss_csum:1,
+	u16	dss_csum:1,
+		rem_key_set:1,
 		is_sub:1,	/* Is this a new subflow? */
 		low_prio:1,	/* Interface set to low-prio? */
 		rcv_low_prio:1,
@@ -240,7 +241,6 @@
 	struct module *owner;
 };
 
-#define MPTCP_SCHED_NAME_MAX 16
 struct mptcp_sched_ops {
 	struct list_head list;
 
@@ -272,6 +272,8 @@
 	u32 rcv_high_order[2];
 
 	u16	send_infinite_mapping:1,
+		send_mptcpv1_mpcapable:1,
+		rem_key_set:1,
 		in_time_wait:1,
 		list_rcvd:1, /* XXX TO REMOVE */
 		addr_signal:1, /* Path-manager wants us to call addr_signal */
@@ -354,6 +356,16 @@
 #define MPTCP_SUB_LEN_CAPABLE_ACK		20
 #define MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN		20
 
+#define MPTCPV1_SUB_LEN_CAPABLE_SYN		4
+#define MPTCPV1_SUB_LEN_CAPABLE_SYN_ALIGN	4
+#define MPTCPV1_SUB_LEN_CAPABLE_SYNACK		12
+#define MPTCPV1_SUB_LEN_CAPABLE_SYNACK_ALIGN	12
+#define MPTCPV1_SUB_LEN_CAPABLE_ACK		20
+#define MPTCPV1_SUB_LEN_CAPABLE_ACK_ALIGN	20
+#define MPTCPV1_SUB_LEN_CAPABLE_DATA		22
+#define MPTCPV1_SUB_LEN_CAPABLE_DATA_CSUM	24
+#define MPTCPV1_SUB_LEN_CAPABLE_DATA_ALIGN	24
+
 #define MPTCP_SUB_JOIN			1
 #define MPTCP_SUB_LEN_JOIN_SYN		12
 #define MPTCP_SUB_LEN_JOIN_SYN_ALIGN	12
@@ -450,14 +462,15 @@
 #define MPTCPHDR_SEQ		0x01 /* DSS.M option is present */
 #define MPTCPHDR_FIN		0x02 /* DSS.F option is present */
 #define MPTCPHDR_SEQ64_INDEX	0x04 /* index of seq in mpcb->snd_high_order */
+#define MPTCPHDR_MPC_DATA	0x08
 /* MPTCP flags: RX only */
-#define MPTCPHDR_ACK		0x08
-#define MPTCPHDR_SEQ64_SET	0x10 /* Did we received a 64-bit seq number? */
-#define MPTCPHDR_SEQ64_OFO	0x20 /* Is it not in our circular array? */
-#define MPTCPHDR_DSS_CSUM	0x40
+#define MPTCPHDR_ACK		0x10
+#define MPTCPHDR_SEQ64_SET	0x20 /* Did we received a 64-bit seq number? */
+#define MPTCPHDR_SEQ64_OFO	0x40 /* Is it not in our circular array? */
+#define MPTCPHDR_DSS_CSUM	0x80
 /* MPTCP flags: TX only */
-#define MPTCPHDR_INF		0x08
-#define MPTCP_REINJECT		0x10 /* Did we reinject this segment? */
+#define MPTCPHDR_INF		0x10
+#define MPTCP_REINJECT		0x20 /* Did we reinject this segment? */
 
 struct mptcp_option {
 	__u8	kind;
@@ -800,10 +813,11 @@
 void mptcp_close(struct sock *meta_sk, long timeout);
 bool mptcp_doit(struct sock *sk);
 int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key,
-			   __u8 mptcp_ver, u32 window);
+			   int rem_key_set, __u8 mptcp_ver, u32 window);
 int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req);
 int mptcp_check_req_master(struct sock *sk, struct sock *child,
 			   struct request_sock *req, const struct sk_buff *skb,
+			   const struct mptcp_options_received *mopt,
 			   int drop, u32 tsoff);
 struct sock *mptcp_check_req_child(struct sock *meta_sk,
 				   struct sock *child,
@@ -816,8 +830,8 @@
 		       int wscale_ok, __u8 *rcv_wscale,
 		       __u32 init_rcv_wnd);
 unsigned int mptcp_current_mss(struct sock *meta_sk);
-void mptcp_hmac_sha1(const u8 *key_1, const u8 *key_2, u32 *hash_out,
-		     int arg_num, ...);
+void mptcp_hmac(u8 ver, const u8 *key_1, const u8 *key_2, u32 *hash_out,
+		int arg_num, ...);
 void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk);
 void mptcp_fin(struct sock *meta_sk);
 void mptcp_meta_retransmit_timer(struct sock *meta_sk);
@@ -827,6 +841,8 @@
 void mptcp_sub_close(struct sock *sk, unsigned long delay);
 struct sock *mptcp_select_ack_sock(const struct sock *meta_sk);
 void mptcp_prepare_for_backlog(struct sock *sk, struct sk_buff *skb);
+void mptcp_initialize_recv_vars(struct tcp_sock *meta_tp, struct mptcp_cb *mpcb,
+				__u64 remote_key);
 int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb);
 void mptcp_ack_handler(struct timer_list *t);
 bool mptcp_check_rtt(const struct tcp_sock *tp, int time);
@@ -982,6 +998,11 @@
 	}
 }
 
+static inline bool mptcp_is_data_mpcapable(const struct sk_buff *skb)
+{
+	return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_MPC_DATA;
+}
+
 static inline bool mptcp_is_data_seq(const struct sk_buff *skb)
 {
 	return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ;
@@ -1399,6 +1420,7 @@
 					  const struct sock *child,
 					  const struct request_sock *req,
 					  const struct sk_buff *skb,
+					  const struct mptcp_options_received *mopt,
 					  int drop,
 					  u32 tsoff)
 {
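
For reference (not part of the patch): the new MPTCPV1_SUB_LEN_CAPABLE_* values above follow the RFC 8684 MP_CAPABLE layout -- the v1 SYN carries no key, the SYN/ACK carries one key, the third ACK carries both keys, and the data-carrying variant appends a 2-byte data-level length plus an optional 2-byte checksum. A minimal standalone C sketch restating that arithmetic:

#include <assert.h>

#define MPC_HDR	4	/* kind, length, subtype/version, flags */
#define MPC_KEY	8	/* one 64-bit key */

static_assert(MPC_HDR                   ==  4, "v1 SYN: header only");
static_assert(MPC_HDR + MPC_KEY         == 12, "v1 SYN/ACK: + sender key");
static_assert(MPC_HDR + 2 * MPC_KEY     == 20, "v1 ACK: + both keys");
static_assert(MPC_HDR + 2 * MPC_KEY + 2 == 22, "v1 MP_CAPABLE with data");
static_assert(MPC_HDR + 2 * MPC_KEY + 4 == 24, "v1 MP_CAPABLE with data + csum");

The relayout of the MPTCPHDR_* bits exists only to free 0x08 for MPTCPHDR_MPC_DATA in the shared flag space; the RX-only and TX-only groups keep their relative order.
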
diff -aurN '--exclude=.git' mptcp-mptcp_trunk/include/net/tcp.h mptcp/include/net/tcp.h
--- mptcp-mptcp_trunk/include/net/tcp.h	2020-02-20 18:07:47.000000000 +0100
+++ mptcp/include/net/tcp.h	2020-05-14 15:15:27.126152589 +0200
@@ -343,7 +343,6 @@
 struct mptcp_options_received;
 
 void tcp_cleanup_rbuf(struct sock *sk, int copied);
-void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited);
 int tcp_close_state(struct sock *sk);
 void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
 			 const struct sk_buff *skb);
@@ -583,6 +582,7 @@
 /* From syncookies.c */
 struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 				 struct request_sock *req,
+				 const struct mptcp_options_received *mopt,
 				 struct dst_entry *dst, u32 tsoff);
 int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
 		      u32 cookie);
@@ -2126,7 +2126,6 @@
 	void (*retransmit_timer)(struct sock *sk);
 	void (*time_wait)(struct sock *sk, int state, int timeo);
 	void (*cleanup_rbuf)(struct sock *sk, int copied);
-	void (*cwnd_validate)(struct sock *sk, bool is_cwnd_limited);
 	int (*set_cong_ctrl)(struct sock *sk, const char *name, bool load,
 			     bool reinit, bool cap_net_admin);
 };
diff -aurN '--exclude=.git' mptcp-mptcp_trunk/net/ipv4/syncookies.c mptcp/net/ipv4/syncookies.c
--- mptcp-mptcp_trunk/net/ipv4/syncookies.c	2020-02-20 18:07:47.000000000 +0100
+++ mptcp/net/ipv4/syncookies.c	2020-05-14 15:15:27.126152589 +0200
@@ -203,6 +203,7 @@
 
 struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 				 struct request_sock *req,
+				 const struct mptcp_options_received *mopt,
 				 struct dst_entry *dst, u32 tsoff)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
@@ -219,7 +220,7 @@
 	if (!child)
 		goto listen_overflow;
 
-	ret = mptcp_check_req_master(sk, child, req, skb, 0, tsoff);
+	ret = mptcp_check_req_master(sk, child, req, skb, mopt, 0, tsoff);
 	if (ret < 0)
 		return NULL;
 
@@ -428,7 +429,7 @@
 	ireq->rcv_wscale  = rcv_wscale;
 	ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst);
 
-	ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst, tsoff);
+	ret = tcp_get_cookie_sock(sk, skb, req, &mopt, &rt->dst, tsoff);
 	/* ip_queue_xmit() depends on our flow being setup
 	 * Normal sockets get it right from inet_csk_route_child_sock()
 	 */
diff -aurN '--exclude=.git' mptcp-mptcp_trunk/net/ipv4/tcp.c mptcp/net/ipv4/tcp.c
--- mptcp-mptcp_trunk/net/ipv4/tcp.c	2020-02-20 18:07:47.000000000 +0100
+++ mptcp/net/ipv4/tcp.c	2020-05-11 09:40:04.803741955 +0200
@@ -415,7 +415,6 @@
 	.retransmit_timer	= tcp_retransmit_timer,
 	.time_wait		= tcp_time_wait,
 	.cleanup_rbuf		= tcp_cleanup_rbuf,
-	.cwnd_validate		= tcp_cwnd_validate,
 	.set_cong_ctrl		= __tcp_set_congestion_control,
 };
diff -aurN '--exclude=.git' mptcp-mptcp_trunk/net/ipv4/tcp_minisocks.c mptcp/net/ipv4/tcp_minisocks.c
--- mptcp-mptcp_trunk/net/ipv4/tcp_minisocks.c	2020-02-20 18:07:47.000000000 +0100
+++ mptcp/net/ipv4/tcp_minisocks.c	2020-05-14 15:15:27.138152390 +0200
@@ -828,7 +828,7 @@
 		goto listen_overflow;
 
 	if (own_req && !is_meta_sk(sk)) {
-		int ret = mptcp_check_req_master(sk, child, req, skb, 1, 0);
+		int ret = mptcp_check_req_master(sk, child, req, skb, &mopt, 1, 0);
 
 		if (ret < 0)
 			goto listen_overflow;
diff -aurN '--exclude=.git' mptcp-mptcp_trunk/net/ipv4/tcp_output.c mptcp/net/ipv4/tcp_output.c
--- mptcp-mptcp_trunk/net/ipv4/tcp_output.c	2020-02-20 18:07:47.000000000 +0100
+++ mptcp/net/ipv4/tcp_output.c	2020-05-11 09:40:04.803741955 +0200
@@ -825,8 +825,8 @@
 		if (mptcp(tp))
 			tcp_tsq_write(meta_sk);
 	} else {
-		if (!test_and_set_bit(TCP_TSQ_DEFERRED, &meta_sk->sk_tsq_flags))
-			sock_hold(meta_sk);
+		if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
+			sock_hold(sk);
 		if ((mptcp(tp)) && (sk->sk_state != TCP_CLOSE))
 			mptcp_tsq_flags(sk);
 	}
@@ -1672,7 +1672,7 @@
 	tp->snd_cwnd_stamp = tcp_jiffies32;
 }
 
-void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
+static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
 {
 	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -2512,8 +2512,7 @@
 		if (push_one != 2)
 			tcp_schedule_loss_probe(sk, false);
 		is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
-		if (tp->ops->cwnd_validate)
-			tp->ops->cwnd_validate(sk, is_cwnd_limited);
+		tcp_cwnd_validate(sk, is_cwnd_limited);
 
 		return false;
 	}
 	return !tp->packets_out && !tcp_write_queue_empty(sk);
diff -aurN '--exclude=.git' mptcp-mptcp_trunk/net/ipv6/syncookies.c mptcp/net/ipv6/syncookies.c
--- mptcp-mptcp_trunk/net/ipv6/syncookies.c	2020-02-20 18:07:47.000000000 +0100
+++ mptcp/net/ipv6/syncookies.c	2020-05-14 15:15:27.142152325 +0200
@@ -267,7 +267,7 @@
 	ireq->rcv_wscale = rcv_wscale;
 	ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst);
 
-	ret = tcp_get_cookie_sock(sk, skb, req, dst, tsoff);
+	ret = tcp_get_cookie_sock(sk, skb, req, &mopt, dst, tsoff);
 out:
 	return ret;
 out_free:
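
The mopt plumbing through tcp_get_cookie_sock() and tcp_check_req() is driven by a protocol difference, not a refactoring: in MPTCP v0 the peer's key is already on the SYN, while in v1 the SYN carries no key, so a passive opener only learns it from the final ACK of the handshake -- exactly the packet the syncookie path reconstructs the connection from. A toy sketch of that timing (illustrative names, not kernel API):

#include <stdbool.h>

enum mptcp_ver { MPTCP_V0, MPTCP_V1 };
enum hs_pkt { HS_SYN, HS_SYNACK, HS_FINAL_ACK };

/* When has a passive opener seen the peer's key? */
static bool peer_key_known(enum mptcp_ver ver, enum hs_pkt last_rcvd)
{
	if (ver == MPTCP_V0)
		return last_rcvd >= HS_SYN;	/* key rides on the SYN itself */
	return last_rcvd >= HS_FINAL_ACK;	/* v1 SYN has no key */
}

This is also why the request-socket code in mptcp_ctrl.c below only latches the remote key immediately for MPTCP_VERSION_0, and why the rem_key_set flag is tracked at all.
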
diff -aurN '--exclude=.git' mptcp-mptcp_trunk/net/mptcp/mptcp_ctrl.c mptcp/net/mptcp/mptcp_ctrl.c
--- mptcp-mptcp_trunk/net/mptcp/mptcp_ctrl.c	2020-02-20 18:07:47.000000000 +0100
+++ mptcp/net/mptcp/mptcp_ctrl.c	2020-05-14 15:15:39.953939868 +0200
@@ -27,6 +27,8 @@
  *	2 of the License, or (at your option) any later version.
  */
 
+#include <crypto/sha.h>
+
 #include
 #include
 #include
@@ -77,7 +79,7 @@
 struct static_key mptcp_static_key = STATIC_KEY_INIT_FALSE;
 EXPORT_SYMBOL(mptcp_static_key);
 
-static void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn);
+static void mptcp_key_hash(u8 version, u64 key, u32 *token, u64 *idsn);
 
 static int proc_mptcp_path_manager(struct ctl_table *ctl, int write,
 				   void __user *buffer, size_t *lenp,
@@ -286,7 +288,7 @@
 #endif
 	}
 
-	mptcp_key_sha1(mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL);
+	mptcp_key_hash(mtreq->mptcp_ver, mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL);
 }
 
 /* New MPTCP-connection request, prepare a new token for the meta-socket that
@@ -319,7 +321,11 @@
 	spin_unlock(&mptcp_tk_hashlock);
 	local_bh_enable();
 	rcu_read_unlock();
-	mtreq->mptcp_rem_key = mopt->mptcp_sender_key;
+
+	if (mtreq->mptcp_ver == MPTCP_VERSION_0) {
+		mtreq->mptcp_rem_key = mopt->mptcp_sender_key;
+		mtreq->rem_key_set = 1;
+	}
 }
 
 static int mptcp_reqsk_new_cookie(struct request_sock *req,
@@ -355,7 +361,10 @@
 	local_bh_enable();
 	rcu_read_unlock();
 
-	mtreq->mptcp_rem_key = mopt->mptcp_sender_key;
+	if (mtreq->mptcp_ver == MPTCP_VERSION_0) {
+		mtreq->mptcp_rem_key = mopt->mptcp_sender_key;
+		mtreq->rem_key_set = 1;
+	}
 
 	return true;
 }
@@ -380,8 +389,7 @@
 					     mptcp_seed++);
 #endif
 
-	mptcp_key_sha1(tp->mptcp_loc_key,
-		       &tp->mptcp_loc_token, NULL);
+	mptcp_key_hash(tp->mptcp_ver, tp->mptcp_loc_key, &tp->mptcp_loc_token, NULL);
 }
 
 #ifdef CONFIG_JUMP_LABEL
@@ -835,6 +843,71 @@
 siphash_key_t mptcp_secret __read_mostly;
 u32 mptcp_seed = 0;
 
+#define SHA256_DIGEST_WORDS (SHA256_DIGEST_SIZE / 4)
+
+static void mptcp_key_sha256(const u64 key, u32 *token, u64 *idsn)
+{
+	u32 mptcp_hashed_key[SHA256_DIGEST_WORDS];
+	struct sha256_state state;
+
+	sha256_init(&state);
+	sha256_update(&state, (const u8 *)&key, sizeof(key));
+	sha256_final(&state, (u8 *)mptcp_hashed_key);
+
+	if (token)
+		*token = mptcp_hashed_key[0];
+	if (idsn)
+		*idsn = ntohll(*((__be64 *)&mptcp_hashed_key[6]));
+}
+
+static void mptcp_hmac_sha256(const u8 *key_1, const u8 *key_2, u32 *hash_out,
+			      int arg_num, va_list list)
+{
+	u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE];
+	__be32 output[SHA256_DIGEST_WORDS];
+	struct sha256_state state;
+	int index, msg_length;
+	int length = 0;
+	u8 *msg;
+	int i;
+
+	/* Generate key xored with ipad */
+	memset(input, 0x36, SHA256_BLOCK_SIZE);
+	for (i = 0; i < 8; i++)
+		input[i] ^= key_1[i];
+	for (i = 0; i < 8; i++)
+		input[i + 8] ^= key_2[i];
+
+	index = SHA256_BLOCK_SIZE;
+	msg_length = 0;
+	for (i = 0; i < arg_num; i++) {
+		length = va_arg(list, int);
+		msg = va_arg(list, u8 *);
+		BUG_ON(index + length >= sizeof(input)); /* Message is too long */
+		memcpy(&input[index], msg, length);
+		index += length;
+		msg_length += length;
+	}
+
+	sha256_init(&state);
+	sha256_update(&state, input, SHA256_BLOCK_SIZE + msg_length);
+	sha256_final(&state, &input[SHA256_BLOCK_SIZE]);
+
+	/* Prepare second part of hmac */
+	memset(input, 0x5C, SHA256_BLOCK_SIZE);
+	for (i = 0; i < 8; i++)
+		input[i] ^= key_1[i];
+	for (i = 0; i < 8; i++)
+		input[i + 8] ^= key_2[i];
+
+	sha256_init(&state);
+	sha256_update(&state, input, sizeof(input));
+	sha256_final(&state, (u8 *)output);
+
+	for (i = 0; i < 5; i++)
+		hash_out[i] = output[i];
+}
+
 static void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn)
 {
 	u32 workspace[SHA_WORKSPACE_WORDS];
@@ -864,8 +937,16 @@
 		*idsn = ntohll(*((__be64 *)&mptcp_hashed_key[3]));
 }
 
-void mptcp_hmac_sha1(const u8 *key_1, const u8 *key_2, u32 *hash_out,
-		     int arg_num, ...)
+static void mptcp_key_hash(u8 version, u64 key, u32 *token, u64 *idsn)
+{
+	if (version == MPTCP_VERSION_0)
+		mptcp_key_sha1(key, token, idsn);
+	else if (version >= MPTCP_VERSION_1)
+		mptcp_key_sha256(key, token, idsn);
+}
+
+static void mptcp_hmac_sha1(const u8 *key_1, const u8 *key_2, u32 *hash_out,
+			    int arg_num, va_list list)
 {
 	u32 workspace[SHA_WORKSPACE_WORDS];
 	u8 input[128]; /* 2 512-bit blocks */
 	int i;
 	int index;
 	int length;
 	u8 *msg;
-	va_list list;
 
 	memset(workspace, 0, sizeof(workspace));
@@ -884,7 +964,6 @@
 	for (i = 0; i < 8; i++)
 		input[i + 8] ^= key_2[i];
 
-	va_start(list, arg_num);
 	index = 64;
 	for (i = 0; i < arg_num; i++) {
 		length = va_arg(list, int);
@@ -893,7 +972,6 @@
 		memcpy(&input[index], msg, length);
 		index += length;
 	}
-	va_end(list);
 
 	input[index] = 0x80; /* Padding: First bit after message = 1 */
 	memset(&input[index + 1], 0, (126 - index));
@@ -936,7 +1014,20 @@
 	for (i = 0; i < 5; i++)
 		hash_out[i] = (__force u32)cpu_to_be32(hash_out[i]);
 }
-EXPORT_SYMBOL(mptcp_hmac_sha1);
+
+void mptcp_hmac(u8 ver, const u8 *key_1, const u8 *key_2, u32 *hash_out,
+		int arg_num, ...)
+{
+	va_list args;
+
+	va_start(args, arg_num);
+	if (ver == MPTCP_VERSION_0)
+		mptcp_hmac_sha1(key_1, key_2, hash_out, arg_num, args);
+	else if (ver >= MPTCP_VERSION_1)
+		mptcp_hmac_sha256(key_1, key_2, hash_out, arg_num, args);
+	va_end(args);
+}
+EXPORT_SYMBOL(mptcp_hmac);
 
 static void mptcp_mpcb_inherit_sockopts(struct sock *meta_sk, struct sock *master_sk)
 {
@@ -1169,14 +1260,33 @@
 	.set_cong_ctrl = __tcp_set_congestion_control,
 };
 
+void mptcp_initialize_recv_vars(struct tcp_sock *meta_tp, struct mptcp_cb *mpcb,
+				__u64 remote_key)
+{
+	u64 idsn;
+
+	mpcb->mptcp_rem_key = remote_key;
+	mpcb->rem_key_set = 1;
+	mptcp_key_hash(mpcb->mptcp_ver, mpcb->mptcp_rem_key, &mpcb->mptcp_rem_token, &idsn);
+
+	idsn++;
+	mpcb->rcv_high_order[0] = idsn >> 32;
+	mpcb->rcv_high_order[1] = mpcb->rcv_high_order[0] + 1;
+	meta_tp->copied_seq = (u32)idsn;
+	meta_tp->rcv_nxt = (u32)idsn;
+	meta_tp->rcv_wup = (u32)idsn;
+
+	meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1;
+}
+
 static int mptcp_alloc_mpcb(struct sock *meta_sk, __u64 remote_key,
-			    __u8 mptcp_ver, u32 window)
+			    int rem_key_set, __u8 mptcp_ver, u32 window)
 {
 	struct mptcp_cb *mpcb;
 	struct sock *master_sk;
 	struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
 	struct tcp_sock *master_tp, *meta_tp = tcp_sk(meta_sk);
-	u64 snd_idsn, rcv_idsn;
+	u64 snd_idsn;
 
 	dst_release(meta_sk->sk_rx_dst);
 	meta_sk->sk_rx_dst = NULL;
@@ -1204,17 +1314,11 @@
 	mpcb->mptcp_loc_token = meta_tp->mptcp_loc_token;
 
 	/* Generate Initial data-sequence-numbers */
-	mptcp_key_sha1(mpcb->mptcp_loc_key, NULL, &snd_idsn);
+	mptcp_key_hash(mpcb->mptcp_ver, mpcb->mptcp_loc_key, NULL, &snd_idsn);
 	snd_idsn++;
 	mpcb->snd_high_order[0] = snd_idsn >> 32;
 	mpcb->snd_high_order[1] = mpcb->snd_high_order[0] - 1;
 
-	mpcb->mptcp_rem_key = remote_key;
-	mptcp_key_sha1(mpcb->mptcp_rem_key, &mpcb->mptcp_rem_token, &rcv_idsn);
-	rcv_idsn++;
-	mpcb->rcv_high_order[0] = rcv_idsn >> 32;
-	mpcb->rcv_high_order[1] = mpcb->rcv_high_order[0] + 1;
-
 	mpcb->meta_sk = meta_sk;
 	mpcb->master_sk = master_sk;
@@ -1326,11 +1430,9 @@
 	meta_tp->pushed_seq = meta_tp->write_seq;
 	meta_tp->snd_up = meta_tp->write_seq;
 
-	meta_tp->copied_seq = (u32)rcv_idsn;
-	meta_tp->rcv_nxt = (u32)rcv_idsn;
-	meta_tp->rcv_wup = (u32)rcv_idsn;
+	if (rem_key_set)
+		mptcp_initialize_recv_vars(meta_tp, mpcb, remote_key);
 
-	meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1;
 	meta_tp->snd_wnd = window;
 	meta_tp->retrans_stamp = 0; /* Set in tcp_connect() */
@@ -2077,12 +2179,12 @@
 }
 
 int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key,
-			   __u8 mptcp_ver, u32 window)
+			   int rem_key_set, __u8 mptcp_ver, u32 window)
 {
 	struct tcp_sock *master_tp;
 	struct sock *master_sk;
 
-	if (mptcp_alloc_mpcb(meta_sk, remote_key, mptcp_ver, window))
+	if (mptcp_alloc_mpcb(meta_sk, remote_key, rem_key_set, mptcp_ver, window))
 		goto err_alloc_mpcb;
 
 	master_sk = tcp_sk(meta_sk)->mpcb->master_sk;
@@ -2110,6 +2212,7 @@
 }
 
 static int __mptcp_check_req_master(struct sock *child,
+				    const struct mptcp_options_received *mopt,
 				    struct request_sock *req)
 {
 	struct tcp_sock *child_tp = tcp_sk(child);
@@ -2121,6 +2224,8 @@
 	if (!inet_rsk(req)->mptcp_rqsk)
 		return 1;
 
+	mtreq = mptcp_rsk(req);
+
 	if (!inet_rsk(req)->saw_mpc) {
 		/* Fallback to regular TCP, because we saw one SYN without
 		 * MP_CAPABLE. In tcp_check_req we continue the regular path.
@@ -2132,15 +2237,21 @@
 		return 1;
 	}
 
+	/* mopt can be NULL when coming from FAST-OPEN */
+	if (mopt && mopt->saw_mpc && mtreq->mptcp_ver == MPTCP_VERSION_1) {
+		mtreq->mptcp_rem_key = mopt->mptcp_sender_key;
+		mtreq->rem_key_set = 1;
+	}
+
 	MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
 
 	/* Just set this values to pass them to mptcp_alloc_mpcb */
-	mtreq = mptcp_rsk(req);
 	child_tp->mptcp_loc_key = mtreq->mptcp_loc_key;
 	child_tp->mptcp_loc_token = mtreq->mptcp_loc_token;
 
 	if (mptcp_create_master_sk(meta_sk, mtreq->mptcp_rem_key,
-				   mtreq->mptcp_ver, child_tp->snd_wnd)) {
+				   mtreq->rem_key_set, mtreq->mptcp_ver,
+				   child_tp->snd_wnd)) {
 		inet_csk_prepare_forced_close(meta_sk);
 		tcp_done(meta_sk);
 
@@ -2175,7 +2286,7 @@
 	u32 new_mapping;
 	int ret;
 
-	ret = __mptcp_check_req_master(child, req);
+	ret = __mptcp_check_req_master(child, NULL, req);
 	if (ret)
 		return ret;
 
@@ -2218,12 +2329,13 @@
 int mptcp_check_req_master(struct sock *sk, struct sock *child,
 			   struct request_sock *req, const struct sk_buff *skb,
+			   const struct mptcp_options_received *mopt,
 			   int drop, u32 tsoff)
 {
 	struct sock *meta_sk = child;
 	int ret;
 
-	ret = __mptcp_check_req_master(child, req);
+	ret = __mptcp_check_req_master(child, mopt, req);
 	if (ret)
 		return ret;
 	child = tcp_sk(child)->mpcb->master_sk;
@@ -2281,11 +2393,10 @@
 		goto teardown;
 	}
 
-	mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
-			(u8 *)&mpcb->mptcp_loc_key,
-			(u32 *)hash_mac_check, 2,
-			4, (u8 *)&mtreq->mptcp_rem_nonce,
-			4, (u8 *)&mtreq->mptcp_loc_nonce);
+	mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_rem_key,
+		   (u8 *)&mpcb->mptcp_loc_key, (u32 *)hash_mac_check, 2,
+		   4, (u8 *)&mtreq->mptcp_rem_nonce,
+		   4, (u8 *)&mtreq->mptcp_loc_nonce);
 	if (memcmp(hash_mac_check, (char *)&mopt->mptcp_recv_mac, 20)) {
 		MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINACKMAC);
@@ -2547,11 +2658,10 @@
 
 	mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce;
 
-	mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
-			(u8 *)&mpcb->mptcp_rem_key,
-			(u32 *)mptcp_hash_mac, 2,
-			4, (u8 *)&mtreq->mptcp_loc_nonce,
-			4, (u8 *)&mtreq->mptcp_rem_nonce);
+	mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_loc_key,
+		   (u8 *)&mpcb->mptcp_rem_key, (u32 *)mptcp_hash_mac, 2,
+		   4, (u8 *)&mtreq->mptcp_loc_nonce,
+		   4, (u8 *)&mtreq->mptcp_rem_nonce);
 	mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac;
 
 	mtreq->rem_id = mopt.rem_id;
@@ -2591,11 +2701,13 @@
 	/* Absolutely need to always initialize this. */
 	mtreq->hash_entry.pprev = NULL;
 
+	mtreq->mptcp_ver = mopt->mptcp_ver;
 	mtreq->mptcp_rem_key = mopt->mptcp_sender_key;
 	mtreq->mptcp_loc_key = mopt->mptcp_receiver_key;
+	mtreq->rem_key_set = 1;
 
 	/* Generate the token */
-	mptcp_key_sha1(mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL);
+	mptcp_key_hash(mtreq->mptcp_ver, mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL);
 
 	rcu_read_lock();
 	local_bh_disable();
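
The SHA-256 code above mirrors the v1 key-to-token derivation of RFC 8684: the token is the most-significant 32 bits of SHA-256 over the 8-byte key, and the initial data sequence number is the least-significant 64 bits (mptcp_hashed_key[0] and words 6-7 in mptcp_key_sha256()). A userspace sketch of the same computation, using OpenSSL as a stand-in for the kernel's sha256 helpers (illustrative, not the kernel API):

#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <openssl/sha.h>

static void v1_token_idsn(uint64_t wire_key, uint32_t *token, uint64_t *idsn)
{
	unsigned char digest[SHA256_DIGEST_LENGTH];
	uint64_t idsn_be;

	SHA256((const unsigned char *)&wire_key, sizeof(wire_key), digest);

	memcpy(token, digest, sizeof(*token));		/* opaque 32-bit token */
	memcpy(&idsn_be, digest + 24, sizeof(idsn_be));
	*idsn = be64toh(idsn_be);			/* like ntohll() above */
}

int main(void)
{
	uint32_t token;
	uint64_t idsn;

	v1_token_idsn(0, &token, &idsn);	/* a real key comes from the handshake */
	printf("token %08x idsn %016llx\n", token, (unsigned long long)idsn);
	return 0;
}

mptcp_hmac_sha256() itself is standard HMAC-SHA256 with the 16-byte key key_1||key_2 written inline: XORing the key bytes into the 0x36/0x5C-filled blocks is equivalent to the usual zero-padded-key construction, with the output truncated by the callers.
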
diff -aurN '--exclude=.git' mptcp-mptcp_trunk/net/mptcp/mptcp_fullmesh.c mptcp/net/mptcp/mptcp_fullmesh.c
--- mptcp-mptcp_trunk/net/mptcp/mptcp_fullmesh.c	2020-02-20 18:07:47.000000000 +0100
+++ mptcp/net/mptcp/mptcp_fullmesh.c	2020-05-14 15:15:39.957939801 +0200
@@ -1596,11 +1596,10 @@
 		u8 no_key[8];
 
 		*(u64 *)no_key = 0;
-		mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
-				(u8 *)no_key,
-				(u32 *)mptcp_hash_mac, 2,
-				1, (u8 *)&mptcp_local->locaddr4[ind].loc4_id,
-				4, (u8 *)&opts->add_addr4.addr.s_addr);
+		mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_loc_key,
+			   (u8 *)no_key, (u32 *)mptcp_hash_mac, 2,
+			   1, (u8 *)&mptcp_local->locaddr4[ind].loc4_id,
+			   4, (u8 *)&opts->add_addr4.addr.s_addr);
 
 		opts->add_addr4.trunc_mac = *(u64 *)mptcp_hash_mac;
 	}
@@ -1639,11 +1638,10 @@
 		u8 no_key[8];
 
 		*(u64 *)no_key = 0;
-		mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
-				(u8 *)no_key,
-				(u32 *)mptcp_hash_mac, 2,
-				1, (u8 *)&mptcp_local->locaddr6[ind].loc6_id,
-				16, (u8 *)&opts->add_addr6.addr.s6_addr);
+		mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_loc_key,
+			   (u8 *)no_key, (u32 *)mptcp_hash_mac, 2,
+			   1, (u8 *)&mptcp_local->locaddr6[ind].loc6_id,
+			   16, (u8 *)&opts->add_addr6.addr.s6_addr);
 
 		opts->add_addr6.trunc_mac = *(u64 *)mptcp_hash_mac;
 	}
diff -aurN '--exclude=.git' mptcp-mptcp_trunk/net/mptcp/mptcp_input.c mptcp/net/mptcp/mptcp_input.c
--- mptcp-mptcp_trunk/net/mptcp/mptcp_input.c	2020-02-20 18:07:47.000000000 +0100
+++ mptcp/net/mptcp/mptcp_input.c	2020-05-14 15:15:39.965939670 +0200
@@ -176,6 +176,10 @@
 }
 
 /* Inspired by tcp_rcv_state_process */
+/* Returns 0 if processing the packet can continue
+ *	   -1 if connection was closed with an active reset
+ *	   1 if connection was closed and processing should stop.
+ */
 static int mptcp_rcv_state_process(struct sock *meta_sk, struct sock *sk,
 				   const struct sk_buff *skb, u32 data_seq,
 				   u16 data_len)
@@ -216,7 +220,7 @@
 			mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
 			tcp_done(meta_sk);
 			__NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
-			return 1;
+			return -1;
 		}
 
 		tmo = tcp_fin_time(meta_sk);
@@ -259,7 +263,7 @@
 				__NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
 				mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
 				tcp_reset(meta_sk);
-				return 1;
+				return -1;
 			}
 		}
 		break;
@@ -344,6 +348,17 @@
 					      sizeof(data_seq), csum_tcp);
 
 			dss_csum_added = 1; /* Just do it once */
+		} else if (mptcp_is_data_mpcapable(tmp) && !dss_csum_added) {
+			u32 offset = skb_transport_offset(tmp) + TCP_SKB_CB(tmp)->dss_off;
+			__be64 data_seq = htonll(tp->mptcp->map_data_seq);
+			__be32 rel_seq = htonl(tp->mptcp->map_subseq - tp->mptcp->rcv_isn);
+
+			csum_tcp = csum_partial(&data_seq, sizeof(data_seq), csum_tcp);
+			csum_tcp = csum_partial(&rel_seq, sizeof(rel_seq), csum_tcp);
+
+			csum_tcp = skb_checksum(tmp, offset, 4, csum_tcp);
+
+			dss_csum_added = 1;
 		}
 		last = tmp;
 		iter++;
@@ -554,11 +569,12 @@
 	 * this segment, this path has to fallback to infinite or be torn down.
 	 */
 	if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) &&
+	    !mptcp_is_data_mpcapable(skb) &&
 	    !tp->mptcp->mapping_present && !mpcb->infinite_mapping_rcv) {
-		pr_debug("%s %#x will fallback - pi %d from %pS, seq %u\n",
+		pr_debug("%s %#x will fallback - pi %d from %pS, seq %u mptcp-flags %#x\n",
 			 __func__, mpcb->mptcp_loc_token,
 			 tp->mptcp->path_index, __builtin_return_address(0),
-			 TCP_SKB_CB(skb)->seq);
+			 TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->mptcp_flags);
 
 		if (!is_master_tp(tp)) {
 			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBDATASUB);
@@ -666,25 +682,36 @@
 		return 0;
 	}
 
-	/* No mapping here? Exit - it is either already set or still on its way */
-	if (!mptcp_is_data_seq(skb)) {
-		/* Too many packets without a mapping - this subflow is broken */
+	if (!tp->mptcp->mapping_present && mptcp_is_data_mpcapable(skb)) {
+		__u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off);
+
+		sub_seq = 1 + tp->mptcp->rcv_isn;
+		data_seq = meta_tp->rcv_nxt;
+		data_len = get_unaligned_be16(ptr);
+	} else if (!mptcp_is_data_seq(skb)) {
+		/* No mapping here?
+		 * Exit - it is either already set or still on its way
+		 */
 		if (!tp->mptcp->mapping_present &&
 		    tp->rcv_nxt - tp->copied_seq > 65536) {
+			/* Too many packets without a mapping,
+			 * this subflow is broken
+			 */
 			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
 			mptcp_send_reset(sk);
 			return 1;
 		}
 
 		return 0;
+	} else {
+		/* Well, then the DSS-mapping is there. So, read it! */
+		ptr = mptcp_skb_set_data_seq(skb, &data_seq, mpcb);
+		ptr++;
+		sub_seq = get_unaligned_be32(ptr) + tp->mptcp->rcv_isn;
+		ptr++;
+		data_len = get_unaligned_be16(ptr);
 	}
 
-	ptr = mptcp_skb_set_data_seq(skb, &data_seq, mpcb);
-	ptr++;
-	sub_seq = get_unaligned_be32(ptr) + tp->mptcp->rcv_isn;
-	ptr++;
-	data_len = get_unaligned_be16(ptr);
-
 	/* If it's an empty skb with DATA_FIN, sub_seq must get fixed.
 	 * The draft sets it to 0, but we really would like to have the
 	 * real value, to have an easy handling afterwards here in this
@@ -1397,7 +1424,7 @@
 }
 
 /* Handle the DATA_ACK */
-static void mptcp_data_ack(struct sock *sk, const struct sk_buff *skb)
+static int mptcp_data_ack(struct sock *sk, const struct sk_buff *skb)
 {
 	struct sock *meta_sk = mptcp_meta_sk(sk);
 	struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
@@ -1425,7 +1452,7 @@
 	 * set by mptcp_clean_rtx_infinite.
 	 */
 	if (!(tcb->mptcp_flags & MPTCPHDR_ACK) && !tp->mpcb->infinite_mapping_snd)
-		return;
+		return 0;
 
 	if (unlikely(!tp->mptcp->fully_established) &&
 	    tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq)
@@ -1439,7 +1466,7 @@
 	 * processing.
 	 */
 	if (meta_sk->sk_state == TCP_CLOSE)
-		return;
+		return 0;
 
 	/* Get the data_seq */
 	if (mptcp_is_data_seq(skb)) {
@@ -1463,6 +1490,9 @@
 	if (after(data_ack, meta_tp->snd_nxt))
 		goto exit;
 
+	/* First valid DATA_ACK, we can stop sending the special MP_CAPABLE */
+	tp->mpcb->send_mptcpv1_mpcapable = 0;
+
 	/*** Now, update the window - inspired by tcp_ack_update_window ***/
 	nwin = ntohs(tcp_hdr(skb)->window);
 
@@ -1520,14 +1550,19 @@
 		meta_sk->sk_write_space(meta_sk);
 	}
 
-	if (meta_sk->sk_state != TCP_ESTABLISHED &&
-	    mptcp_rcv_state_process(meta_sk, sk, skb, data_seq, data_len))
-		return;
+	if (meta_sk->sk_state != TCP_ESTABLISHED) {
+		int ret = mptcp_rcv_state_process(meta_sk, sk, skb, data_seq, data_len);
+
+		if (ret < 0)
+			return 1;
+		else if (ret > 0)
+			return 0;
+	}
 
 exit:
 	mptcp_push_pending_frames(meta_sk);
 
-	return;
+	return 0;
 
 no_queue:
 	if (tcp_send_head(meta_sk))
@@ -1535,7 +1570,7 @@
 
 	mptcp_push_pending_frames(meta_sk);
 
-	return;
+	return 0;
 }
 
 void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk)
@@ -1604,6 +1639,7 @@
 				   struct tcp_sock *tp)
 {
 	const struct mptcp_option *mp_opt = (struct mptcp_option *)ptr;
+	const struct tcphdr *th = tcp_hdr(skb);
 
 	/* If the socket is mp-capable we would have a mopt. */
 	if (!mopt)
@@ -1614,9 +1650,21 @@
 	{
 		const struct mp_capable *mpcapable = (struct mp_capable *)ptr;
 
-		if (opsize != MPTCP_SUB_LEN_CAPABLE_SYN &&
-		    opsize != MPTCP_SUB_LEN_CAPABLE_ACK) {
-			mptcp_debug("%s: mp_capable: bad option size %d\n",
+		if (mpcapable->ver == MPTCP_VERSION_0 &&
+		    ((th->syn && opsize != MPTCP_SUB_LEN_CAPABLE_SYN) ||
+		     (!th->syn && th->ack && opsize != MPTCP_SUB_LEN_CAPABLE_ACK))) {
+			mptcp_debug("%s: mp_capable v0: bad option size %d\n",
+				    __func__, opsize);
+			break;
+		}
+
+		if (mpcapable->ver == MPTCP_VERSION_1 &&
+		    ((th->syn && !th->ack && opsize != MPTCPV1_SUB_LEN_CAPABLE_SYN) ||
+		     (th->syn && th->ack && opsize != MPTCPV1_SUB_LEN_CAPABLE_SYNACK) ||
+		     (!th->syn && th->ack && opsize != MPTCPV1_SUB_LEN_CAPABLE_ACK &&
+		      opsize != MPTCPV1_SUB_LEN_CAPABLE_DATA &&
+		      opsize != MPTCPV1_SUB_LEN_CAPABLE_DATA_CSUM))) {
+			mptcp_debug("%s: mp_capable v1: bad option size %d\n",
 				    __func__, opsize);
 			break;
 		}
@@ -1640,10 +1688,38 @@
 		mopt->saw_mpc = 1;
 		mopt->dss_csum = sysctl_mptcp_checksum || mpcapable->a;
 
-		if (opsize >= MPTCP_SUB_LEN_CAPABLE_SYN)
-			mopt->mptcp_sender_key = mpcapable->sender_key;
-		if (opsize == MPTCP_SUB_LEN_CAPABLE_ACK)
-			mopt->mptcp_receiver_key = mpcapable->receiver_key;
+		if (mpcapable->ver == MPTCP_VERSION_0) {
+			if (opsize == MPTCP_SUB_LEN_CAPABLE_SYN)
+				mopt->mptcp_sender_key = mpcapable->sender_key;
+
+			if (opsize == MPTCP_SUB_LEN_CAPABLE_ACK) {
+				mopt->mptcp_sender_key = mpcapable->sender_key;
+				mopt->mptcp_receiver_key = mpcapable->receiver_key;
+			}
+		} else if (mpcapable->ver == MPTCP_VERSION_1) {
+			if (opsize == MPTCPV1_SUB_LEN_CAPABLE_SYNACK)
+				mopt->mptcp_sender_key = mpcapable->sender_key;
+
+			if (opsize == MPTCPV1_SUB_LEN_CAPABLE_ACK) {
+				mopt->mptcp_sender_key = mpcapable->sender_key;
+				mopt->mptcp_receiver_key = mpcapable->receiver_key;
+			}
+
+			if (opsize == MPTCPV1_SUB_LEN_CAPABLE_DATA ||
+			    opsize == MPTCPV1_SUB_LEN_CAPABLE_DATA_CSUM) {
+				mopt->mptcp_sender_key = mpcapable->sender_key;
+				mopt->mptcp_receiver_key = mpcapable->receiver_key;
+
+				TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_MPC_DATA;
+
+				ptr += sizeof(struct mp_capable);
+				TCP_SKB_CB(skb)->dss_off = (ptr - skb_transport_header(skb));
+
+				/* Is a check-sum present? */
+				if (opsize == MPTCPV1_SUB_LEN_CAPABLE_DATA_CSUM)
+					TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_DSS_CSUM;
+			}
+		}
 
 		mopt->mptcp_ver = mpcapable->ver;
 		break;
@@ -1917,12 +1993,11 @@
 		} else if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2) {
 			msg_parts = 3;
 		}
-		mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
-				(u8 *)no_key,
-				(u32 *)hash_mac_check, msg_parts,
-				1, (u8 *)&mpadd->addr_id,
-				4, (u8 *)&mpadd->u.v4.addr.s_addr,
-				2, (u8 *)&mpadd->u.v4.port);
+		mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_rem_key,
+			   (u8 *)no_key, (u32 *)hash_mac_check, msg_parts,
+			   1, (u8 *)&mpadd->addr_id,
+			   4, (u8 *)&mpadd->u.v4.addr.s_addr,
+			   2, (u8 *)&mpadd->u.v4.port);
 		if (memcmp(hash_mac_check, recv_hmac, 8) != 0)
 			/* ADD_ADDR2 discarded */
 			return;
@@ -1952,12 +2027,11 @@
 		} else if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6_VER1 + 2) {
 			msg_parts = 3;
 		}
-		mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
-				(u8 *)no_key,
-				(u32 *)hash_mac_check, msg_parts,
-				1, (u8 *)&mpadd->addr_id,
-				16, (u8 *)&mpadd->u.v6.addr.s6_addr,
-				2, (u8 *)&mpadd->u.v6.port);
+		mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_rem_key,
+			   (u8 *)no_key, (u32 *)hash_mac_check, msg_parts,
+			   1, (u8 *)&mpadd->addr_id,
+			   16, (u8 *)&mpadd->u.v6.addr.s6_addr,
+			   2, (u8 *)&mpadd->u.v6.port);
 		if (memcmp(hash_mac_check, recv_hmac, 8) != 0)
 			/* ADD_ADDR2 discarded */
 			return;
@@ -2115,6 +2189,10 @@
 	if (sk->sk_state == TCP_RST_WAIT && !th->rst)
 		return true;
 
+	if (mopt->saw_mpc && !tp->mpcb->rem_key_set)
+		mptcp_initialize_recv_vars(mptcp_meta_tp(tp), tp->mpcb,
+					   mopt->mptcp_sender_key);
+
 	if (unlikely(mopt->mp_fail))
 		mptcp_mp_fail_rcvd(sk, th);
 
@@ -2122,7 +2200,8 @@
 	 * If a checksum is not present when its use has been negotiated, the
 	 * receiver MUST close the subflow with a RST as it is considered broken.
 	 */
-	if (mptcp_is_data_seq(skb) && tp->mpcb->dss_csum &&
+	if ((mptcp_is_data_seq(skb) || mptcp_is_data_mpcapable(skb)) &&
+	    tp->mpcb->dss_csum &&
 	    !(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_DSS_CSUM)) {
 		mptcp_send_reset(sk);
 		return true;
@@ -2171,7 +2250,8 @@
 		mopt->saw_low_prio = 0;
 	}
 
-	mptcp_data_ack(sk, skb);
+	if (mptcp_data_ack(sk, skb))
+		return true;
 
 	mptcp_path_array_check(mptcp_meta_sk(sk));
 	/* Socket may have been mp_killed by a REMOVE_ADDR */
@@ -2297,11 +2377,10 @@
 		u8 hash_mac_check[20];
 		struct mptcp_cb *mpcb = tp->mpcb;
 
-		mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
-				(u8 *)&mpcb->mptcp_loc_key,
-				(u32 *)hash_mac_check, 2,
-				4, (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
-				4, (u8 *)&tp->mptcp->mptcp_loc_nonce);
+		mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_rem_key,
+			   (u8 *)&mpcb->mptcp_loc_key, (u32 *)hash_mac_check, 2,
+			   4, (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
+			   4, (u8 *)&tp->mptcp->mptcp_loc_nonce);
 		if (memcmp(hash_mac_check,
 			   (char *)&tp->mptcp->rx_opt.mptcp_recv_tmac, 8)) {
 			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKMAC);
@@ -2315,11 +2394,11 @@
 		tp->mptcp->pre_established = 1;
 		tp->mptcp->rcv_low_prio = tp->mptcp->rx_opt.low_prio;
 
-		mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
-				(u8 *)&mpcb->mptcp_rem_key,
-				(u32 *)&tp->mptcp->sender_mac[0], 2,
-				4, (u8 *)&tp->mptcp->mptcp_loc_nonce,
-				4, (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce);
+		mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_loc_key,
+			   (u8 *)&mpcb->mptcp_rem_key,
+			   (u32 *)&tp->mptcp->sender_mac[0], 2,
+			   4, (u8 *)&tp->mptcp->mptcp_loc_nonce,
+			   4, (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce);
 
 		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
 	} else if (mopt->saw_mpc) {
@@ -2329,8 +2408,13 @@
 		if (mopt->mptcp_ver > tcp_sk(sk)->mptcp_ver)
 			/* TODO Consider adding new MPTCP_INC_STATS entry */
 			goto fallback;
 
+		if (tcp_sk(sk)->mptcp_ver == MPTCP_VERSION_1 &&
+		    mopt->mptcp_ver < MPTCP_VERSION_1)
+			/* TODO Consider adding new MPTCP_INC_STATS entry */
+			/* TODO - record this in the cache - use v0 next time */
+			goto fallback;
 
-		if (mptcp_create_master_sk(sk, mopt->mptcp_sender_key,
+		if (mptcp_create_master_sk(sk, mopt->mptcp_sender_key, 1,
 					   mopt->mptcp_ver, ntohs(tcp_hdr(skb)->window)))
 			return 2;
@@ -2358,6 +2442,9 @@
 		if (tp->mpcb->dss_csum)
 			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CSUMENABLED);
 
+		if (tp->mpcb->mptcp_ver >= MPTCP_VERSION_1)
+			tp->mpcb->send_mptcpv1_mpcapable = 1;
+
 		tp->mptcp->include_mpc = 1;
 
 		/* Ensure that fastopen is handled at the meta-level. */
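
The parser's opsize checks above are easier to read as a table. A condensed, self-contained restatement (constants inlined; illustrative, not the kernel code):

#include <stdbool.h>
#include <stdint.h>

static bool mp_capable_size_ok(uint8_t ver, bool syn, bool ack, int opsize)
{
	if (ver == 0) {
		if (syn)
			return opsize == 12;	/* MPTCP_SUB_LEN_CAPABLE_SYN */
		if (!syn && ack)
			return opsize == 20;	/* MPTCP_SUB_LEN_CAPABLE_ACK */
		return false;
	}
	if (ver == 1) {
		if (syn && !ack)
			return opsize == 4;	/* no key on the v1 SYN */
		if (syn && ack)
			return opsize == 12;	/* sender key only */
		if (!syn && ack)
			return opsize == 20 ||	/* third ACK */
			       opsize == 22 ||	/* MP_CAPABLE + data */
			       opsize == 24;	/* MP_CAPABLE + data + csum */
		return false;
	}
	return false;
}

The 22/24-byte forms are the reliability mechanism for the v1 key exchange: the final handshake ACK may be lost, so the keys are repeated on the first data segment (MPTCPHDR_MPC_DATA) until a valid DATA_ACK shows the peer has them -- which is what send_mptcpv1_mpcapable and the new return value of mptcp_data_ack() coordinate.
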
diff -aurN '--exclude=.git' mptcp-mptcp_trunk/net/mptcp/mptcp_ipv4.c mptcp/net/mptcp/mptcp_ipv4.c
--- mptcp-mptcp_trunk/net/mptcp/mptcp_ipv4.c	2020-02-20 18:07:47.000000000 +0100
+++ mptcp/net/mptcp/mptcp_ipv4.c	2020-05-14 15:15:27.158152059 +0200
@@ -106,6 +106,9 @@
 	int loc_id;
 	bool low_prio = false;
 
+	if (!mpcb->rem_key_set)
+		return -1;
+
 	/* We need to do this as early as possible. Because, if we fail later
 	 * (e.g., get_local_id), then reqsk_free tries to remove the
 	 * request-socket from the htb in mptcp_hash_request_remove as pprev
diff -aurN '--exclude=.git' mptcp-mptcp_trunk/net/mptcp/mptcp_ipv6.c mptcp/net/mptcp/mptcp_ipv6.c
--- mptcp-mptcp_trunk/net/mptcp/mptcp_ipv6.c	2020-02-20 18:07:47.000000000 +0100
+++ mptcp/net/mptcp/mptcp_ipv6.c	2020-05-14 15:15:27.170151859 +0200
@@ -135,6 +135,9 @@
 	int loc_id;
 	bool low_prio = false;
 
+	if (!mpcb->rem_key_set)
+		return -1;
+
 	/* We need to do this as early as possible. Because, if we fail later
 	 * (e.g., get_local_id), then reqsk_free tries to remove the
 	 * request-socket from the htb in mptcp_hash_request_remove as pprev
diff -aurN '--exclude=.git' mptcp-mptcp_trunk/net/mptcp/mptcp_output.c mptcp/net/mptcp/mptcp_output.c
--- mptcp-mptcp_trunk/net/mptcp/mptcp_output.c	2020-02-20 18:07:47.000000000 +0100
+++ mptcp/net/mptcp/mptcp_output.c	2020-05-14 15:15:27.170151859 +0200
@@ -479,30 +479,78 @@
 	ptr += mptcp_write_dss_mapping(tp, skb, ptr);
 }
 
+/* Write the MP_CAPABLE with data-option */
+static int mptcp_write_mpcapable_data(const struct tcp_sock *tp,
+				      struct sk_buff *skb,
+				      __be32 *ptr)
+{
+	struct mp_capable *mpc = (struct mp_capable *)ptr;
+	u8 length;
+
+	if (tp->mpcb->dss_csum)
+		length = MPTCPV1_SUB_LEN_CAPABLE_DATA_CSUM;
+	else
+		length = MPTCPV1_SUB_LEN_CAPABLE_DATA;
+
+	mpc->kind = TCPOPT_MPTCP;
+	mpc->len = length;
+	mpc->sub = MPTCP_SUB_CAPABLE;
+	mpc->ver = MPTCP_VERSION_1;
+	mpc->a = tp->mpcb->dss_csum;
+	mpc->b = 0;
+	mpc->rsv = 0;
+	mpc->h = 1;
+
+	ptr++;
+	memcpy(ptr, TCP_SKB_CB(skb)->dss, mptcp_dss_len);
+
+	mpc->sender_key = tp->mpcb->mptcp_loc_key;
+	mpc->receiver_key = tp->mpcb->mptcp_rem_key;
+
+	/* dss is in a union with inet_skb_parm and
+	 * the IP layer expects zeroed IPCB fields.
+	 */
+	memset(TCP_SKB_CB(skb)->dss, 0, mptcp_dss_len);
+
+	return MPTCPV1_SUB_LEN_CAPABLE_DATA_ALIGN / sizeof(*ptr);
+}
+
 /* Write the saved DSS mapping to the header */
 static int mptcp_write_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb,
 				    __be32 *ptr)
 {
+	int length;
 	__be32 *start = ptr;
 
-	memcpy(ptr, TCP_SKB_CB(skb)->dss, mptcp_dss_len);
+	if (tp->mpcb->rem_key_set) {
+		memcpy(ptr, TCP_SKB_CB(skb)->dss, mptcp_dss_len);
+
+		/* update the data_ack */
+		start[1] = htonl(mptcp_meta_tp(tp)->rcv_nxt);
+
+		length = mptcp_dss_len / sizeof(*ptr);
+	} else {
+		memcpy(ptr, TCP_SKB_CB(skb)->dss, MPTCP_SUB_LEN_DSS_ALIGN);
 
-	/* update the data_ack */
-	start[1] = htonl(mptcp_meta_tp(tp)->rcv_nxt);
+		ptr++;
+		memcpy(ptr, TCP_SKB_CB(skb)->dss + 2, MPTCP_SUB_LEN_SEQ_ALIGN);
+
+		length = (MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_SEQ_ALIGN) / sizeof(*ptr);
+	}
 
 	/* dss is in a union with inet_skb_parm and
 	 * the IP layer expects zeroed IPCB fields.
 	 */
 	memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len);
 
-	return mptcp_dss_len/sizeof(*ptr);
+	return length;
 }
 
 static bool mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, int reinject)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct sock *meta_sk = mptcp_meta_sk(sk);
-	const struct mptcp_cb *mpcb = tp->mpcb;
+	struct mptcp_cb *mpcb = tp->mpcb;
 	struct tcp_skb_cb *tcb;
 	struct sk_buff *subskb = NULL;
@@ -544,6 +592,11 @@
 
 	mptcp_save_dss_data_seq(tp, subskb);
 
+	if (mpcb->send_mptcpv1_mpcapable) {
+		TCP_SKB_CB(subskb)->mptcp_flags |= MPTCPHDR_MPC_DATA;
+		mpcb->send_mptcpv1_mpcapable = 0;
+	}
+
 	tcb->seq = tp->write_seq;
 
 	/* Take into account seg len */
@@ -851,10 +904,7 @@
 		if (!mptcp_skb_entail(subsk, skb, reinject))
 			break;
-		/* Nagle is handled at the MPTCP-layer, so
-		 * always push on the subflow
-		 */
-		__tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH);
+
 		if (reinject <= 0)
 			tcp_update_skb_after_send(meta_sk, skb, meta_tp->tcp_wstamp_ns);
 		meta_tp->lsndtime = tcp_jiffies32;
@@ -886,14 +936,12 @@
 		if (!(path_mask & mptcp_pi_to_flag(subtp->mptcp->path_index)))
 			continue;
 
-		/* We have pushed data on this subflow. We ignore the call to
-		 * cwnd_validate in tcp_write_xmit as is_cwnd_limited will never
-		 * be true (we never push more than what the cwnd can accept).
-		 * We need to ensure that we call tcp_cwnd_validate with
-		 * is_cwnd_limited set to true if we have filled the cwnd.
+		mss_now = tcp_current_mss(subsk);
+
+		/* Nagle is handled at the MPTCP-layer, so
+		 * always push on the subflow
 		 */
-		tcp_cwnd_validate(subsk, tcp_packets_in_flight(subtp) >=
-				  subtp->snd_cwnd);
+		__tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH);
 	}
 
 	return !meta_tp->packets_out && tcp_send_head(meta_sk);
@@ -988,8 +1036,13 @@
 	opts->options |= OPTION_MPTCP;
 	if (is_master_tp(tp)) {
 		opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYN;
-		opts->mptcp_ver = tcp_sk(sk)->mptcp_ver;
-		*remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
+		opts->mptcp_ver = tp->mptcp_ver;
+
+		if (tp->mptcp_ver >= MPTCP_VERSION_1)
+			*remaining -= MPTCPV1_SUB_LEN_CAPABLE_SYN_ALIGN;
+		else
+			*remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
+
 		opts->mp_capable.sender_key = tp->mptcp_loc_key;
 		opts->dss_csum = !!sysctl_mptcp_checksum;
 	} else {
@@ -1017,7 +1070,11 @@
 		opts->mptcp_ver = mtreq->mptcp_ver;
 		opts->mp_capable.sender_key = mtreq->mptcp_loc_key;
 		opts->dss_csum = !!sysctl_mptcp_checksum || mtreq->dss_csum;
-		*remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
+		if (mtreq->mptcp_ver >= MPTCP_VERSION_1) {
+			*remaining -= MPTCPV1_SUB_LEN_CAPABLE_SYNACK_ALIGN;
+		} else {
+			*remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
+		}
 	} else {
 		opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYNACK;
 		opts->mp_join_syns.sender_truncated_mac =
@@ -1080,7 +1137,12 @@
 
 		opts->options |= OPTION_MPTCP;
 		opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_ACK;
-		*size += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN;
+
+		if (mpcb->mptcp_ver >= MPTCP_VERSION_1)
+			*size += MPTCPV1_SUB_LEN_CAPABLE_ACK_ALIGN;
+		else
+			*size += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN;
+
 		opts->mptcp_ver = mpcb->mptcp_ver;
 		opts->mp_capable.sender_key = mpcb->mptcp_loc_key;
 		opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
@@ -1111,14 +1173,20 @@
 	/* If !skb, we come from tcp_current_mss and thus we always
 	 * assume that the DSS-option will be set for the data-packet.
 	 */
-	if (skb && !mptcp_is_data_seq(skb)) {
+	if (skb && !mptcp_is_data_seq(skb) && mpcb->rem_key_set) {
 		*size += MPTCP_SUB_LEN_ACK_ALIGN;
+	} else if ((skb && mptcp_is_data_mpcapable(skb)) ||
+		   (!skb && tp->mpcb->send_mptcpv1_mpcapable)) {
+		*size += MPTCPV1_SUB_LEN_CAPABLE_DATA_ALIGN;
 	} else {
 		/* Doesn't matter, if csum included or not. It will be
 		 * either 10 or 12, and thus aligned = 12
 		 */
-		*size += MPTCP_SUB_LEN_ACK_ALIGN +
-			 MPTCP_SUB_LEN_SEQ_ALIGN;
+		if (mpcb->rem_key_set)
+			*size += MPTCP_SUB_LEN_ACK_ALIGN +
+				 MPTCP_SUB_LEN_SEQ_ALIGN;
+		else
+			*size += MPTCP_SUB_LEN_SEQ_ALIGN;
 	}
 
 	*size += MPTCP_SUB_LEN_DSS_ALIGN;
@@ -1171,18 +1239,36 @@
 
 	mpc->kind = TCPOPT_MPTCP;
 
-	if ((OPTION_TYPE_SYN & opts->mptcp_options) ||
-	    (OPTION_TYPE_SYNACK & opts->mptcp_options)) {
-		mpc->sender_key = opts->mp_capable.sender_key;
-		mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN;
+	if (OPTION_TYPE_SYN & opts->mptcp_options) {
 		mpc->ver = opts->mptcp_ver;
-		ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2;
-	} else if (OPTION_TYPE_ACK & opts->mptcp_options) {
+
+		if (mpc->ver >= MPTCP_VERSION_1) {
+			mpc->len = MPTCPV1_SUB_LEN_CAPABLE_SYN;
+			ptr += MPTCPV1_SUB_LEN_CAPABLE_SYN_ALIGN >> 2;
+		} else {
+			mpc->sender_key = opts->mp_capable.sender_key;
+			mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN;
+			ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2;
+		}
+	} else if (OPTION_TYPE_SYNACK & opts->mptcp_options) {
+		mpc->ver = opts->mptcp_ver;
+
+		if (mpc->ver >= MPTCP_VERSION_1) {
+			mpc->len = MPTCPV1_SUB_LEN_CAPABLE_SYNACK;
+			ptr += MPTCPV1_SUB_LEN_CAPABLE_SYNACK_ALIGN >> 2;
+		} else {
+			mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN;
+			ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2;
+		}
+
 		mpc->sender_key = opts->mp_capable.sender_key;
-		mpc->receiver_key = opts->mp_capable.receiver_key;
+	} else if (OPTION_TYPE_ACK & opts->mptcp_options) {
 		mpc->len = MPTCP_SUB_LEN_CAPABLE_ACK;
 		mpc->ver = opts->mptcp_ver;
 		ptr += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN >> 2;
+
+		mpc->sender_key = opts->mp_capable.sender_key;
+		mpc->receiver_key = opts->mp_capable.receiver_key;
 	}
 
 	mpc->sub = MPTCP_SUB_CAPABLE;
@@ -1312,8 +1398,10 @@
 	}
 
 	if (OPTION_DATA_ACK & opts->mptcp_options) {
-		if (!mptcp_is_data_seq(skb))
+		if (!mptcp_is_data_seq(skb) && tp->mpcb->rem_key_set)
 			ptr += mptcp_write_dss_data_ack(tp, skb, ptr);
+		else if (mptcp_is_data_mpcapable(skb))
+			ptr += mptcp_write_mpcapable_data(tp, skb, ptr);
 		else
 			ptr += mptcp_write_dss_data_seq(tp, skb, ptr);
 	}
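
For orientation, the option that mptcp_write_mpcapable_data() emits has the following on-wire shape (field names illustrative; the kernel reuses struct mp_capable plus the DSS scratch space in tcb->dss):

#include <assert.h>
#include <stdint.h>

struct mpc_data_opt {
	uint8_t		kind;		/* TCPOPT_MPTCP */
	uint8_t		len;		/* 22, or 24 with checksum */
	uint8_t		sub_ver;	/* subtype MP_CAPABLE, version 1 */
	uint8_t		flags;		/* a = checksum, h = HMAC-SHA256 */
	uint64_t	sender_key;
	uint64_t	receiver_key;
	uint16_t	data_len;	/* data-level length */
	uint16_t	csum;		/* present only when len == 24 */
} __attribute__((packed));

static_assert(sizeof(struct mpc_data_opt) == 24,
	      "matches MPTCPV1_SUB_LEN_CAPABLE_DATA_ALIGN");

There is no explicit data sequence number in this form: the receiver derives the mapping implicitly (sub_seq = rcv_isn + 1, data_seq = meta rcv_nxt), which is exactly what the new else-if branch added in mptcp_input.c does.
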
diff -aurN '--exclude=.git' mptcp-mptcp_trunk/net/mptcp/mptcp_redundant.c mptcp/net/mptcp/mptcp_redundant.c
--- mptcp-mptcp_trunk/net/mptcp/mptcp_redundant.c	2020-02-20 18:07:47.000000000 +0100
+++ mptcp/net/mptcp/mptcp_redundant.c	2020-05-14 15:11:23.662202401 +0200
@@ -187,7 +187,9 @@
 {
 	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
 
-	if (red_p->skb && !after(red_p->skb_end_seq, meta_tp->snd_una))
+	if (red_p->skb &&
+	    (!after(red_p->skb_end_seq, meta_tp->snd_una) ||
+	     after(red_p->skb_end_seq, meta_tp->snd_nxt)))
 		red_p->skb = NULL;
 }
 
@@ -197,9 +199,13 @@
 				struct sock *meta_sk)
 {
 	struct sk_buff *skb;
-
-	if (!previous)
+	if (!previous) {
+		if (tcp_rtx_queue_head(meta_sk))
+			return tcp_rtx_queue_head(meta_sk);
+
 		return skb_peek(queue);
+	}
+
 	/* sk_data->skb stores the last scheduled packet for this subflow.
 	 * If sk_data->skb was scheduled but not sent (e.g., due to nagle),
@@ -246,7 +252,8 @@
 	*limit = 0;
 
 	if (skb_queue_empty(&mpcb->reinject_queue) &&
-	    skb_queue_empty(&meta_sk->sk_write_queue))
+	    skb_queue_empty(&meta_sk->sk_write_queue) &&
+	    tcp_rtx_queue_empty(meta_sk))
 		/* Nothing to send */
 		return NULL;
diff -aurN '--exclude=.git' mptcp-mptcp_trunk/net/mptcp/mptcp_sched.c mptcp/net/mptcp/mptcp_sched.c
--- mptcp-mptcp_trunk/net/mptcp/mptcp_sched.c	2020-02-20 18:07:47.000000000 +0100
+++ mptcp/net/mptcp/mptcp_sched.c	2020-05-11 09:40:13.463584360 +0200
@@ -76,7 +76,7 @@
 	 */
 	space = (tp->snd_cwnd - in_flight) * tp->mss_cache;
 
-	if (tp->write_seq - tp->snd_nxt > space)
+	if (tp->write_seq - tp->snd_nxt >= space)
 		return true;
 
 	if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
@@ -391,10 +391,11 @@
 			      unsigned int *limit)
 {
 	struct sk_buff *skb = __mptcp_next_segment(meta_sk, reinject);
-	unsigned int mss_now;
+	unsigned int mss_now, in_flight_space;
+	int remaining_in_flight_space;
+	u32 max_len, max_segs, window;
 	struct tcp_sock *subtp;
 	u16 gso_max_segs;
-	u32 max_len, max_segs, window, needed;
 
 	/* As we set it, we have to reset it as well. */
 	*limit = 0;
@@ -424,9 +425,6 @@
 	/* The following is similar to tcp_mss_split_point, but
 	 * we do not care about nagle, because we will anyways
 	 * use TCP_NAGLE_PUSH, which overrides this.
-	 *
-	 * So, we first limit according to the cwnd/gso-size and then according
-	 * to the subflow's window.
 	 */
 
 	gso_max_segs = (*subsk)->sk_gso_max_segs;
@@ -436,16 +434,30 @@
 	if (!max_segs)
 		return NULL;
 
-	max_len = mss_now * max_segs;
-	window = tcp_wnd_end(subtp) - subtp->write_seq;
+	/* max_len is what would fit in the cwnd (respecting the 2GSO-limit of
+	 * tcp_cwnd_test), but ignoring whatever was already queued.
+	 */
+	max_len = min(mss_now * max_segs, skb->len);
 
-	needed = min(skb->len, window);
-	if (max_len <= skb->len)
-		/* Take max_win, which is actually the cwnd/gso-size */
-		*limit = max_len;
+	in_flight_space = (subtp->snd_cwnd - tcp_packets_in_flight(subtp)) * mss_now;
+	remaining_in_flight_space = (int)in_flight_space - (subtp->write_seq - subtp->snd_nxt);
+
+	if (remaining_in_flight_space <= 0)
+		WARN_ONCE(1, "in_flight %u cwnd %u wseq %u snxt %u mss_now %u cache %u",
+			  tcp_packets_in_flight(subtp), subtp->snd_cwnd,
+			  subtp->write_seq, subtp->snd_nxt, mss_now, subtp->mss_cache);
 	else
-		/* Or, take the window */
-		*limit = needed;
+		/* max_len now fits exactly in the write-queue, taking into
+		 * account what was already queued.
+		 */
+		max_len = min_t(u32, max_len, remaining_in_flight_space);
+
+	window = tcp_wnd_end(subtp) - subtp->write_seq;
+
+	/* max_len now also respects the announced receive-window */
+	max_len = min(max_len, window);
+
+	*limit = max_len;
 
 	return skb;
 }
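
The rewritten limit computation in mptcp_sched.c accounts for data already queued on the subflow (write_seq - snd_nxt), which the old cwnd/gso-only bound ignored. A toy version of the same arithmetic (illustrative, standalone):

#include <stdint.h>
#include <stdio.h>

static uint32_t sched_limit(uint32_t cwnd, uint32_t in_flight, uint32_t mss,
			    uint32_t write_seq, uint32_t snd_nxt,
			    uint32_t skb_len, uint32_t rcv_window)
{
	uint32_t space = (cwnd - in_flight) * mss;	/* room in the cwnd */
	int64_t remaining = (int64_t)space - (write_seq - snd_nxt);
	uint32_t max_len = skb_len;

	if (remaining > 0 && (uint64_t)remaining < max_len)
		max_len = (uint32_t)remaining;	/* minus what is already queued */
	if (rcv_window < max_len)
		max_len = rcv_window;		/* respect the announced window */
	return max_len;
}

int main(void)
{
	/* cwnd of 10 MSS, 4 in flight, 2 MSS queued: 4 more MSS may be handed over */
	printf("%u\n", sched_limit(10, 4, 1460, 2 * 1460, 0, 65535, 64240));
	return 0;
}

The '>' to '>=' change in the subflow-availability test above is the matching fix: with '>', a subflow whose queue exactly filled its congestion-window space was still reported as available, letting the scheduler assign one segment too many.
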