mirror of https://github.com/Ysurac/openmptcprouter.git
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 165abcb656c5..5d06ce2df29c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2748,6 +2748,10 @@
allocations which rules out almost all kernel
allocations. Use with caution!

+ mptcp_htable_entries=
+ [KNL,NET] Set number of hash buckets for MPTCP token
+ hashtables.
+
MTD_Partition= [MTD]
Format: <name>,<region-number>,<size>,<offset>

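For orientation only, and not taken from this patch: a parameter like the one documented above is passed on the kernel command line as "mptcp_htable_entries=8192" and is conventionally picked up by a __setup() handler. A rough sketch of such a handler, with purely hypothetical names, could look like this:

#include <linux/init.h>
#include <linux/kernel.h>

/* Hypothetical example - the real parsing code lives elsewhere in the series. */
static unsigned long mptcp_htable_entries_cmdline __initdata;

static int __init set_mptcp_htable_entries(char *str)
{
	if (!str)
		return 0;
	mptcp_htable_entries_cmdline = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mptcp_htable_entries=", set_mptcp_htable_entries);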
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 8af3771a3ebf..e8fecb8f6370 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -818,6 +818,18 @@ tcp_rx_skb_cache - BOOLEAN

Default: 0 (disabled)

+MPTCP variables:
+
+mptcp_enabled - INTEGER
+ Enable or disable Multipath TCP for new connections.
+ Possible values are:
+
+ 0: Multipath TCP is disabled on all TCP-sockets that are newly created.
+ 1: Multipath TCP is enabled by default on all new TCP-sockets. Note that
+ existing sockets in LISTEN-state will still use regular TCP.
+ 2: Enables Multipath TCP only upon the request of the application
+ through the socket-option MPTCP_ENABLED.
+
UDP variables:

udp_l3mdev_accept - BOOLEAN
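To make the "2" setting above concrete: an application then has to opt in per socket via the MPTCP_ENABLED socket option mentioned in the text. A minimal userspace sketch follows; the option level (IPPROTO_TCP) and the fallback value 42 are assumptions, only the symbolic name comes from the documentation above.

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

#ifndef MPTCP_ENABLED
#define MPTCP_ENABLED 42	/* assumed value - check the patch set's UAPI headers */
#endif

static int open_mptcp_socket(void)
{
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

	if (fd < 0)
		return -1;
	/* On a kernel without this patch the call simply fails and plain TCP is used. */
	setsockopt(fd, IPPROTO_TCP, MPTCP_ENABLED, &one, sizeof(one));
	return fd;
}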
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 535ee41ee421..9f82f93e6e77 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -3950,7 +3950,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
*/
memset(&tmp_opt, 0, sizeof(tmp_opt));
tcp_clear_options(&tmp_opt);
- tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL);
+ tcp_parse_options(&init_net, skb, &tmp_opt, NULL, 0, NULL, NULL);

req = __skb_push(skb, sizeof(*req));
memset(req, 0, sizeof(*req));
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b04b5bd43f54..57e35d51db8c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -717,7 +717,7 @@ struct sk_buff {
* want to keep them across layers you have to do a skb_clone()
* first. This is owned by whoever has the skb queued ATM.
*/
- char cb[48] __aligned(8);
+ char cb[80] __aligned(8);

union {
struct {
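The cb[] enlargement above makes room for the MPTCP bookkeeping this series adds to struct tcp_skb_cb (see the TCP_SKB_CB(skb)->mptcp_flags users later in the diff). For orientation, the kernel's usual compile-time guard for that invariant, not part of this hunk, is:

BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof_field(struct sk_buff, cb));	/* FIELD_SIZEOF() in older trees */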
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
|
|
index 358deb4ff830..aebfedba9838 100644
|
|
--- a/include/linux/tcp.h
|
|
+++ b/include/linux/tcp.h
|
|
@@ -54,7 +54,7 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
|
|
/* TCP Fast Open */
|
|
#define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */
|
|
#define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */
|
|
-#define TCP_FASTOPEN_COOKIE_SIZE 8 /* the size employed by this impl. */
|
|
+#define TCP_FASTOPEN_COOKIE_SIZE 4 /* the size employed by this impl. */
|
|
|
|
/* TCP Fast Open Cookie as stored in memory */
|
|
struct tcp_fastopen_cookie {
|
|
@@ -74,6 +74,56 @@ struct tcp_sack_block {
|
|
u32 end_seq;
|
|
};
|
|
|
|
+struct tcp_out_options {
|
|
+ u16 options; /* bit field of OPTION_* */
|
|
+ u16 mss; /* 0 to disable */
|
|
+ u8 ws; /* window scale, 0 to disable */
|
|
+ u8 num_sack_blocks; /* number of SACK blocks to include */
|
|
+ u8 hash_size; /* bytes in hash_location */
|
|
+ __u8 *hash_location; /* temporary pointer, overloaded */
|
|
+ __u32 tsval, tsecr; /* need to include OPTION_TS */
|
|
+ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
|
|
+#ifdef CONFIG_MPTCP
|
|
+ u16 mptcp_options; /* bit field of MPTCP related OPTION_* */
|
|
+ u8 dss_csum:1, /* dss-checksum required? */
|
|
+ add_addr_v4:1,
|
|
+ add_addr_v6:1,
|
|
+ mptcp_ver:4;
|
|
+
|
|
+ union {
|
|
+ struct {
|
|
+ __u64 sender_key; /* sender's key for mptcp */
|
|
+ __u64 receiver_key; /* receiver's key for mptcp */
|
|
+ } mp_capable;
|
|
+
|
|
+ struct {
|
|
+ __u64 sender_truncated_mac;
|
|
+ __u32 sender_nonce;
|
|
+ /* random number of the sender */
|
|
+ __u32 token; /* token for mptcp */
|
|
+ u8 low_prio:1;
|
|
+ } mp_join_syns;
|
|
+ };
|
|
+
|
|
+ struct {
|
|
+ __u64 trunc_mac;
|
|
+ struct in_addr addr;
|
|
+ u16 port;
|
|
+ u8 addr_id;
|
|
+ } add_addr4;
|
|
+
|
|
+ struct {
|
|
+ __u64 trunc_mac;
|
|
+ struct in6_addr addr;
|
|
+ u16 port;
|
|
+ u8 addr_id;
|
|
+ } add_addr6;
|
|
+
|
|
+ u16 remove_addrs; /* list of address id */
|
|
+ u8 addr_id; /* address id (mp_join or add_address) */
|
|
+#endif /* CONFIG_MPTCP */
|
|
+};
|
|
+
|
|
/*These are used to set the sack_ok field in struct tcp_options_received */
|
|
#define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */
|
|
#define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/
|
|
@@ -97,6 +147,9 @@ struct tcp_options_received {
|
|
u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
|
|
};
|
|
|
|
+struct mptcp_cb;
|
|
+struct mptcp_tcp_sock;
|
|
+
|
|
static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
|
|
{
|
|
rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
|
|
@@ -135,6 +188,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
|
|
return (struct tcp_request_sock *)req;
|
|
}
|
|
|
|
+struct tcp_md5sig_key;
|
|
+
|
|
struct tcp_sock {
|
|
/* inet_connection_sock has to be the first member of tcp_sock */
|
|
struct inet_connection_sock inet_conn;
|
|
@@ -295,6 +350,7 @@ struct tcp_sock {
|
|
u32 rate_interval_us; /* saved rate sample: time elapsed */
|
|
|
|
u32 rcv_wnd; /* Current receiver window */
|
|
+ u32 rcv_right_edge; /* Highest announced right edge */
|
|
u32 write_seq; /* Tail(+1) of data held in tcp send buffer */
|
|
u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */
|
|
u32 pushed_seq; /* Last pushed seq, required to talk to windows */
|
|
@@ -397,6 +453,44 @@ struct tcp_sock {
|
|
*/
|
|
struct request_sock __rcu *fastopen_rsk;
|
|
u32 *saved_syn;
|
|
+
|
|
+ /* MPTCP/TCP-specific callbacks */
|
|
+ const struct tcp_sock_ops *ops;
|
|
+
|
|
+ struct mptcp_cb *mpcb;
|
|
+ struct sock *meta_sk;
|
|
+ /* We keep these flags even if CONFIG_MPTCP is not enabled, because
|
|
+ * it allows checking MPTCP capability just by checking the mpc flag,
|
|
+ * rather than adding ifdefs everywhere.
|
|
+ */
|
|
+ u32 mpc:1, /* Other end is multipath capable */
|
|
+ inside_tk_table:1, /* Is the tcp_sock inside the token-table? */
|
|
+ send_mp_fclose:1,
|
|
+ request_mptcp:1, /* Did we send out an MP_CAPABLE?
|
|
+ * (this speeds up mptcp_doit() in tcp_recvmsg)
|
|
+ */
|
|
+ pf:1, /* Potentially Failed state: when this flag is set, we
|
|
+ * stop using the subflow
|
|
+ */
|
|
+ mp_killed:1, /* Killed with a tcp_done in mptcp? */
|
|
+ is_master_sk:1,
|
|
+ close_it:1, /* Must close socket in mptcp_data_ready? */
|
|
+ closing:1,
|
|
+ mptcp_ver:4,
|
|
+ mptcp_sched_setsockopt:1,
|
|
+ mptcp_pm_setsockopt:1,
|
|
+ record_master_info:1,
|
|
+ tcp_disconnect:1;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+#ifdef CONFIG_MPTCP
|
|
+#define MPTCP_SCHED_NAME_MAX 16
|
|
+#define MPTCP_PM_NAME_MAX 16
|
|
+ struct hlist_nulls_node tk_table;
|
|
+ u32 mptcp_loc_token;
|
|
+ u64 mptcp_loc_key;
|
|
+ char mptcp_sched_name[MPTCP_SCHED_NAME_MAX];
|
|
+ char mptcp_pm_name[MPTCP_PM_NAME_MAX];
|
|
+#endif /* CONFIG_MPTCP */
|
|
};
|
|
|
|
enum tsq_enum {
|
|
@@ -408,6 +502,8 @@ enum tsq_enum {
|
|
TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call
|
|
* tcp_v{4|6}_mtu_reduced()
|
|
*/
|
|
+ MPTCP_PATH_MANAGER_DEFERRED, /* MPTCP deferred creation of new subflows */
|
|
+ MPTCP_SUB_DEFERRED, /* A subflow got deferred - process them */
|
|
};
|
|
|
|
enum tsq_flags {
|
|
@@ -417,6 +513,8 @@ enum tsq_flags {
|
|
TCPF_WRITE_TIMER_DEFERRED = (1UL << TCP_WRITE_TIMER_DEFERRED),
|
|
TCPF_DELACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED),
|
|
TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED),
|
|
+ TCPF_PATH_MANAGER_DEFERRED = (1UL << MPTCP_PATH_MANAGER_DEFERRED),
|
|
+ TCPF_SUB_DEFERRED = (1UL << MPTCP_SUB_DEFERRED),
|
|
};
|
|
|
|
static inline struct tcp_sock *tcp_sk(const struct sock *sk)
|
|
@@ -440,6 +538,7 @@ struct tcp_timewait_sock {
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
struct tcp_md5sig_key *tw_md5_key;
|
|
#endif
|
|
+ struct mptcp_tw *mptcp_tw;
|
|
};
|
|
|
|
static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
|
|
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index ae2ba897675c..aa91a56bd7af 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -2,6 +2,7 @@
#ifndef _INET_COMMON_H
#define _INET_COMMON_H

+#include <net/sock.h>
#include <linux/indirect_call_wrapper.h>

extern const struct proto_ops inet_stream_ops;
@@ -16,6 +17,8 @@
struct sockaddr;
struct socket;

+int inet_create(struct net *net, struct socket *sock, int protocol, int kern);
+int inet6_create(struct net *net, struct socket *sock, int protocol, int kern);
int inet_release(struct socket *sock);
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags);
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 13792c0ef46e..e99cc510610f 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -25,6 +25,7 @@

struct inet_bind_bucket;
struct tcp_congestion_ops;
+struct tcp_options_received;

/*
* Pointers to address related TCP functions
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 34c4436fd18f..828f79528b32 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -79,7 +79,7 @@ struct inet_request_sock {
#define ireq_state req.__req_common.skc_state
#define ireq_family req.__req_common.skc_family

- u16 snd_wscale : 4,
+ u32 snd_wscale : 4,
rcv_wscale : 4,
tstamp_ok : 1,
sack_ok : 1,
@@ -87,6 +87,8 @@ struct inet_request_sock {
ecn_ok : 1,
acked : 1,
no_srccheck: 1,
+ mptcp_rqsk : 1,
+ saw_mpc : 1,
smc_ok : 1;
u32 ir_mark;
union {
diff --git a/include/net/mptcp.h b/include/net/mptcp.h
new file mode 100644
index 000000000000..196b8939cbab
--- /dev/null
+++ b/include/net/mptcp.h
@@ -0,0 +1,1577 @@
+/*
|
|
+ * MPTCP implementation
|
|
+ *
|
|
+ * Initial Design & Implementation:
|
|
+ * Sébastien Barré <sebastien.barre@uclouvain.be>
|
|
+ *
|
|
+ * Current Maintainer & Author:
|
|
+ * Christoph Paasch <christoph.paasch@uclouvain.be>
|
|
+ *
|
|
+ * Additional authors:
|
|
+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
|
|
+ * Gregory Detal <gregory.detal@uclouvain.be>
|
|
+ * Fabien Duchêne <fabien.duchene@uclouvain.be>
|
|
+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
|
|
+ * Lavkesh Lahngir <lavkesh51@gmail.com>
|
|
+ * Andreas Ripke <ripke@neclab.eu>
|
|
+ * Vlad Dogaru <vlad.dogaru@intel.com>
|
|
+ * Octavian Purdila <octavian.purdila@intel.com>
|
|
+ * John Ronan <jronan@tssg.org>
|
|
+ * Catalin Nicutar <catalin.nicutar@gmail.com>
|
|
+ * Brandon Heller <brandonh@stanford.edu>
|
|
+ *
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU General Public License
|
|
+ * as published by the Free Software Foundation; either version
|
|
+ * 2 of the License, or (at your option) any later version.
|
|
+ */
|
|
+
|
|
+#ifndef _MPTCP_H
|
|
+#define _MPTCP_H
|
|
+
|
|
+#include <linux/inetdevice.h>
|
|
+#include <linux/ipv6.h>
|
|
+#include <linux/list.h>
|
|
+#include <linux/net.h>
|
|
+#include <linux/netpoll.h>
|
|
+#include <linux/siphash.h>
|
|
+#include <linux/skbuff.h>
|
|
+#include <linux/socket.h>
|
|
+#include <linux/tcp.h>
|
|
+#include <linux/kernel.h>
|
|
+
|
|
+#include <asm/byteorder.h>
|
|
+#include <asm/unaligned.h>
|
|
+#include <crypto/hash.h>
|
|
+#include <crypto/sha.h>
|
|
+#include <net/tcp.h>
|
|
+
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ #define ntohll(x) be64_to_cpu(x)
|
|
+ #define htonll(x) cpu_to_be64(x)
|
|
+#elif defined(__BIG_ENDIAN_BITFIELD)
|
|
+ #define ntohll(x) (x)
|
|
+ #define htonll(x) (x)
|
|
+#endif
|
|
+
|
|
+struct mptcp_loc4 {
|
|
+ u8 loc4_id;
|
|
+ u8 low_prio:1;
|
|
+ int if_idx;
|
|
+ struct in_addr addr;
|
|
+};
|
|
+
|
|
+struct mptcp_rem4 {
|
|
+ u8 rem4_id;
|
|
+ __be16 port;
|
|
+ struct in_addr addr;
|
|
+};
|
|
+
|
|
+struct mptcp_loc6 {
|
|
+ u8 loc6_id;
|
|
+ u8 low_prio:1;
|
|
+ int if_idx;
|
|
+ struct in6_addr addr;
|
|
+};
|
|
+
|
|
+struct mptcp_rem6 {
|
|
+ u8 rem6_id;
|
|
+ __be16 port;
|
|
+ struct in6_addr addr;
|
|
+};
|
|
+
|
|
+struct mptcp_request_sock {
|
|
+ struct tcp_request_sock req;
|
|
+ struct hlist_nulls_node hash_entry;
|
|
+
|
|
+ union {
|
|
+ struct {
|
|
+ /* Only on initial subflows */
|
|
+ u64 mptcp_loc_key;
|
|
+ u64 mptcp_rem_key;
|
|
+ u32 mptcp_loc_token;
|
|
+ };
|
|
+
|
|
+ struct {
|
|
+ /* Only on additional subflows */
|
|
+ u32 mptcp_rem_nonce;
|
|
+ u32 mptcp_loc_nonce;
|
|
+ u64 mptcp_hash_tmac;
|
|
+ };
|
|
+ };
|
|
+
|
|
+ u8 loc_id;
|
|
+ u8 rem_id; /* Address-id in the MP_JOIN */
|
|
+ u16 dss_csum:1,
|
|
+ rem_key_set:1,
|
|
+ is_sub:1, /* Is this a new subflow? */
|
|
+ low_prio:1, /* Interface set to low-prio? */
|
|
+ rcv_low_prio:1,
|
|
+ mptcp_ver:4;
|
|
+};
|
|
+
|
|
+struct mptcp_options_received {
|
|
+ u16 saw_mpc:1,
|
|
+ dss_csum:1,
|
|
+ drop_me:1,
|
|
+
|
|
+ is_mp_join:1,
|
|
+ join_ack:1,
|
|
+
|
|
+ saw_low_prio:2, /* 0x1 - low-prio set for this subflow
|
|
+ * 0x2 - low-prio set for another subflow
|
|
+ */
|
|
+ low_prio:1,
|
|
+
|
|
+ saw_add_addr:2, /* Saw at least one add_addr option:
|
|
+ * 0x1: IPv4 - 0x2: IPv6
|
|
+ */
|
|
+ more_add_addr:1, /* Saw one more add-addr. */
|
|
+
|
|
+ saw_rem_addr:1, /* Saw at least one rem_addr option */
|
|
+ more_rem_addr:1, /* Saw one more rem-addr. */
|
|
+
|
|
+ mp_fail:1,
|
|
+ mp_fclose:1;
|
|
+ u8 rem_id; /* Address-id in the MP_JOIN */
|
|
+ u8 prio_addr_id; /* Address-id in the MP_PRIO */
|
|
+
|
|
+ const unsigned char *add_addr_ptr; /* Pointer to add-address option */
|
|
+ const unsigned char *rem_addr_ptr; /* Pointer to rem-address option */
|
|
+
|
|
+ u32 data_ack;
|
|
+ u32 data_seq;
|
|
+ u16 data_len;
|
|
+
|
|
+ u8 mptcp_ver; /* MPTCP version */
|
|
+
|
|
+ /* Key inside the option (from mp_capable or fast_close) */
|
|
+ u64 mptcp_sender_key;
|
|
+ u64 mptcp_receiver_key;
|
|
+
|
|
+ u32 mptcp_rem_token; /* Remote token */
|
|
+
|
|
+ u32 mptcp_recv_nonce;
|
|
+ u64 mptcp_recv_tmac;
|
|
+ u8 mptcp_recv_mac[20];
|
|
+};
|
|
+
|
|
+struct mptcp_tcp_sock {
|
|
+ struct hlist_node node;
|
|
+ struct hlist_node cb_list;
|
|
+ struct mptcp_options_received rx_opt;
|
|
+
|
|
+ /* Those three fields record the current mapping */
|
|
+ u64 map_data_seq;
|
|
+ u32 map_subseq;
|
|
+ u16 map_data_len;
|
|
+ u16 slave_sk:1,
|
|
+ fully_established:1,
|
|
+ second_packet:1,
|
|
+ attached:1,
|
|
+ send_mp_fail:1,
|
|
+ include_mpc:1,
|
|
+ mapping_present:1,
|
|
+ map_data_fin:1,
|
|
+ low_prio:1, /* use this socket as backup */
|
|
+ rcv_low_prio:1, /* Peer sent low-prio option to us */
|
|
+ send_mp_prio:1, /* Trigger to send mp_prio on this socket */
|
|
+ pre_established:1; /* State between sending 3rd ACK and
|
|
+ * receiving the fourth ack of new subflows.
|
|
+ */
|
|
+
|
|
+ /* isn: needed to translate abs to relative subflow seqnums */
|
|
+ u32 snt_isn;
|
|
+ u32 rcv_isn;
|
|
+ u8 path_index;
|
|
+ u8 loc_id;
|
|
+ u8 rem_id;
|
|
+ u8 sk_err;
|
|
+
|
|
+#define MPTCP_SCHED_SIZE 16
|
|
+ u8 mptcp_sched[MPTCP_SCHED_SIZE] __aligned(8);
|
|
+
|
|
+ int init_rcv_wnd;
|
|
+ u32 infinite_cutoff_seq;
|
|
+ struct delayed_work work;
|
|
+ u32 mptcp_loc_nonce;
|
|
+ struct tcp_sock *tp;
|
|
+ u32 last_end_data_seq;
|
|
+
|
|
+ /* MP_JOIN subflow: timer for retransmitting the 3rd ack */
|
|
+ struct timer_list mptcp_ack_timer;
|
|
+
|
|
+ /* HMAC of the third ack */
|
|
+ char sender_mac[SHA256_DIGEST_SIZE];
|
|
+};
|
|
+
|
|
+struct mptcp_tw {
|
|
+ struct list_head list;
|
|
+ u64 loc_key;
|
|
+ u64 rcv_nxt;
|
|
+ struct mptcp_cb __rcu *mpcb;
|
|
+ u8 meta_tw:1,
|
|
+ in_list:1;
|
|
+};
|
|
+
|
|
+#define MPTCP_PM_NAME_MAX 16
|
|
+struct mptcp_pm_ops {
|
|
+ struct list_head list;
|
|
+
|
|
+ /* Signal the creation of a new MPTCP-session. */
|
|
+ void (*new_session)(const struct sock *meta_sk);
|
|
+ void (*release_sock)(struct sock *meta_sk);
|
|
+ void (*fully_established)(struct sock *meta_sk);
|
|
+ void (*close_session)(struct sock *meta_sk);
|
|
+ void (*new_remote_address)(struct sock *meta_sk);
|
|
+ int (*get_local_id)(const struct sock *meta_sk, sa_family_t family,
|
|
+ union inet_addr *addr, bool *low_prio);
|
|
+ void (*addr_signal)(struct sock *sk, unsigned *size,
|
|
+ struct tcp_out_options *opts, struct sk_buff *skb);
|
|
+ void (*add_raddr)(struct mptcp_cb *mpcb, const union inet_addr *addr,
|
|
+ sa_family_t family, __be16 port, u8 id);
|
|
+ void (*rem_raddr)(struct mptcp_cb *mpcb, u8 rem_id);
|
|
+ void (*init_subsocket_v4)(struct sock *sk, struct in_addr addr);
|
|
+ void (*init_subsocket_v6)(struct sock *sk, struct in6_addr addr);
|
|
+ void (*established_subflow)(struct sock *sk);
|
|
+ void (*delete_subflow)(struct sock *sk);
|
|
+ void (*prio_changed)(struct sock *sk, int low_prio);
|
|
+
|
|
+ char name[MPTCP_PM_NAME_MAX];
|
|
+ struct module *owner;
|
|
+};
|
|
+
|
|
+struct mptcp_sched_ops {
|
|
+ struct list_head list;
|
|
+
|
|
+ struct sock * (*get_subflow)(struct sock *meta_sk,
|
|
+ struct sk_buff *skb,
|
|
+ bool zero_wnd_test);
|
|
+ struct sk_buff * (*next_segment)(struct sock *meta_sk,
|
|
+ int *reinject,
|
|
+ struct sock **subsk,
|
|
+ unsigned int *limit);
|
|
+ void (*init)(struct sock *sk);
|
|
+ void (*release)(struct sock *sk);
|
|
+
|
|
+ char name[MPTCP_SCHED_NAME_MAX];
|
|
+ struct module *owner;
|
|
+};
|
|
+
|
|
+struct mptcp_cb {
|
|
+ /* list of sockets in this multipath connection */
|
|
+ struct hlist_head conn_list;
|
|
+ /* list of sockets that need a call to release_cb */
|
|
+ struct hlist_head callback_list;
|
|
+
|
|
+ /* Lock used for protecting the different rcu-lists of mptcp_cb */
|
|
+ spinlock_t mpcb_list_lock;
|
|
+
|
|
+ /* High-order bits of 64-bit sequence numbers */
|
|
+ u32 snd_high_order[2];
|
|
+ u32 rcv_high_order[2];
|
|
+
|
|
+ u16 send_infinite_mapping:1,
|
|
+ send_mptcpv1_mpcapable:1,
|
|
+ rem_key_set:1,
|
|
+ in_time_wait:1,
|
|
+ list_rcvd:1, /* XXX TO REMOVE */
|
|
+ addr_signal:1, /* Path-manager wants us to call addr_signal */
|
|
+ dss_csum:1,
|
|
+ server_side:1,
|
|
+ infinite_mapping_rcv:1,
|
|
+ infinite_mapping_snd:1,
|
|
+ dfin_combined:1, /* Was the DFIN combined with subflow-fin? */
|
|
+ passive_close:1,
|
|
+ snd_hiseq_index:1, /* Index in snd_high_order of snd_nxt */
|
|
+ rcv_hiseq_index:1, /* Index in rcv_high_order of rcv_nxt */
|
|
+ tcp_ca_explicit_set:1; /* was meta CC set by app? */
|
|
+
|
|
+#define MPTCP_SCHED_DATA_SIZE 8
|
|
+ u8 mptcp_sched[MPTCP_SCHED_DATA_SIZE] __aligned(8);
|
|
+ const struct mptcp_sched_ops *sched_ops;
|
|
+
|
|
+ struct sk_buff_head reinject_queue;
|
|
+ /* First cache-line boundary is here minus 8 bytes. But from the
|
|
+ * reinject-queue only the next and prev pointers are regularly
|
|
+ * accessed. Thus, the whole data-path is on a single cache-line.
|
|
+ */
|
|
+
|
|
+ u64 csum_cutoff_seq;
|
|
+ u64 infinite_rcv_seq;
|
|
+
|
|
+ /***** Start of fields, used for connection closure */
|
|
+ unsigned char mptw_state;
|
|
+ u8 dfin_path_index;
|
|
+
|
|
+ struct list_head tw_list;
|
|
+
|
|
+ /***** Start of fields, used for subflow establishment and closure */
|
|
+ refcount_t mpcb_refcnt;
|
|
+
|
|
+ /* Mutex needed, because otherwise mptcp_close will complain that the
|
|
+ * socket is owned by the user.
|
|
+ * E.g., mptcp_sub_close_wq is taking the meta-lock.
|
|
+ */
|
|
+ struct mutex mpcb_mutex;
|
|
+
|
|
+ /***** Start of fields, used for subflow establishment */
|
|
+ struct sock *meta_sk;
|
|
+
|
|
+ /* Master socket, also part of the conn_list, this
|
|
+ * socket is the one that the application sees.
|
|
+ */
|
|
+ struct sock *master_sk;
|
|
+
|
|
+ __u64 mptcp_loc_key;
|
|
+ __u64 mptcp_rem_key;
|
|
+ __u32 mptcp_loc_token;
|
|
+ __u32 mptcp_rem_token;
|
|
+
|
|
+#define MPTCP_PM_SIZE 608
|
|
+ u8 mptcp_pm[MPTCP_PM_SIZE] __aligned(8);
|
|
+ const struct mptcp_pm_ops *pm_ops;
|
|
+
|
|
+ unsigned long path_index_bits;
|
|
+
|
|
+ __u8 mptcp_ver;
|
|
+
|
|
+ /* Original snd/rcvbuf of the initial subflow.
|
|
+ * Used for the new subflows on the server-side to allow correct
|
|
+ * autotuning
|
|
+ */
|
|
+ int orig_sk_rcvbuf;
|
|
+ int orig_sk_sndbuf;
|
|
+ u32 orig_window_clamp;
|
|
+
|
|
+ struct tcp_info *master_info;
|
|
+};
|
|
+
|
|
+#define MPTCP_VERSION_0 0
|
|
+#define MPTCP_VERSION_1 1
|
|
+
|
|
+#define MPTCP_SUB_CAPABLE 0
|
|
+#define MPTCP_SUB_LEN_CAPABLE_SYN 12
|
|
+#define MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN 12
|
|
+#define MPTCP_SUB_LEN_CAPABLE_ACK 20
|
|
+#define MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN 20
|
|
+
|
|
+#define MPTCPV1_SUB_LEN_CAPABLE_SYN 4
|
|
+#define MPTCPV1_SUB_LEN_CAPABLE_SYN_ALIGN 4
|
|
+#define MPTCPV1_SUB_LEN_CAPABLE_SYNACK 12
|
|
+#define MPTCPV1_SUB_LEN_CAPABLE_SYNACK_ALIGN 12
|
|
+#define MPTCPV1_SUB_LEN_CAPABLE_ACK 20
|
|
+#define MPTCPV1_SUB_LEN_CAPABLE_ACK_ALIGN 20
|
|
+#define MPTCPV1_SUB_LEN_CAPABLE_DATA 22
|
|
+#define MPTCPV1_SUB_LEN_CAPABLE_DATA_CSUM 24
|
|
+#define MPTCPV1_SUB_LEN_CAPABLE_DATA_ALIGN 24
|
|
+
|
|
+#define MPTCP_SUB_JOIN 1
|
|
+#define MPTCP_SUB_LEN_JOIN_SYN 12
|
|
+#define MPTCP_SUB_LEN_JOIN_SYN_ALIGN 12
|
|
+#define MPTCP_SUB_LEN_JOIN_SYNACK 16
|
|
+#define MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN 16
|
|
+#define MPTCP_SUB_LEN_JOIN_ACK 24
|
|
+#define MPTCP_SUB_LEN_JOIN_ACK_ALIGN 24
|
|
+
|
|
+#define MPTCP_SUB_DSS 2
|
|
+#define MPTCP_SUB_LEN_DSS 4
|
|
+#define MPTCP_SUB_LEN_DSS_ALIGN 4
|
|
+
|
|
+/* Lengths for seq and ack are the ones without the generic MPTCP-option header,
|
|
+ * as they are part of the DSS-option.
|
|
+ * To get the total length, just add the different options together.
|
|
+ */
|
|
+#define MPTCP_SUB_LEN_SEQ 10
|
|
+#define MPTCP_SUB_LEN_SEQ_CSUM 12
|
|
+#define MPTCP_SUB_LEN_SEQ_ALIGN 12
|
|
+
|
|
+#define MPTCP_SUB_LEN_SEQ_64 14
|
|
+#define MPTCP_SUB_LEN_SEQ_CSUM_64 16
|
|
+#define MPTCP_SUB_LEN_SEQ_64_ALIGN 16
|
|
+
|
|
+#define MPTCP_SUB_LEN_ACK 4
|
|
+#define MPTCP_SUB_LEN_ACK_ALIGN 4
|
|
+
|
|
+#define MPTCP_SUB_LEN_ACK_64 8
|
|
+#define MPTCP_SUB_LEN_ACK_64_ALIGN 8
|
|
+
|
|
+/* This is the "default" option-length we will send out most often.
|
|
+ * MPTCP DSS-header
|
|
+ * 32-bit data sequence number
|
|
+ * 32-bit data ack
|
|
+ *
|
|
+ * It is necessary to calculate the effective MSS we will be using when
|
|
+ * sending data.
|
|
+ */
|
|
+#define MPTCP_SUB_LEN_DSM_ALIGN (MPTCP_SUB_LEN_DSS_ALIGN + \
|
|
+ MPTCP_SUB_LEN_SEQ_ALIGN + \
|
|
+ MPTCP_SUB_LEN_ACK_ALIGN)
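With the ALIGN values defined above, MPTCP_SUB_LEN_DSM_ALIGN works out to 4 + 12 + 4 = 20 bytes: a typical data segment carrying a 32-bit DSS mapping plus a 32-bit data ack consumes 20 bytes of TCP option space, which is what the effective-MSS calculation mentioned in the comment has to account for.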
|
|
+
|
|
+#define MPTCP_SUB_ADD_ADDR 3
|
|
+#define MPTCP_SUB_LEN_ADD_ADDR4 8
|
|
+#define MPTCP_SUB_LEN_ADD_ADDR4_VER1 16
|
|
+#define MPTCP_SUB_LEN_ADD_ADDR6 20
|
|
+#define MPTCP_SUB_LEN_ADD_ADDR6_VER1 28
|
|
+#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN 8
|
|
+#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN_VER1 16
|
|
+#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN 20
|
|
+#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN_VER1 28
|
|
+
|
|
+#define MPTCP_SUB_REMOVE_ADDR 4
|
|
+#define MPTCP_SUB_LEN_REMOVE_ADDR 4
|
|
+
|
|
+#define MPTCP_SUB_PRIO 5
|
|
+#define MPTCP_SUB_LEN_PRIO 3
|
|
+#define MPTCP_SUB_LEN_PRIO_ADDR 4
|
|
+#define MPTCP_SUB_LEN_PRIO_ALIGN 4
|
|
+
|
|
+#define MPTCP_SUB_FAIL 6
|
|
+#define MPTCP_SUB_LEN_FAIL 12
|
|
+#define MPTCP_SUB_LEN_FAIL_ALIGN 12
|
|
+
|
|
+#define MPTCP_SUB_FCLOSE 7
|
|
+#define MPTCP_SUB_LEN_FCLOSE 12
|
|
+#define MPTCP_SUB_LEN_FCLOSE_ALIGN 12
|
|
+
|
|
+
|
|
+#define OPTION_MPTCP (1 << 5)
|
|
+
|
|
+/* Max number of fastclose retransmissions */
|
|
+#define MPTCP_FASTCLOSE_RETRIES 3
|
|
+
|
|
+#ifdef CONFIG_MPTCP
|
|
+
|
|
+/* Used for checking if the mptcp initialization has been successful */
|
|
+extern bool mptcp_init_failed;
|
|
+
|
|
+/* MPTCP options */
|
|
+#define OPTION_TYPE_SYN (1 << 0)
|
|
+#define OPTION_TYPE_SYNACK (1 << 1)
|
|
+#define OPTION_TYPE_ACK (1 << 2)
|
|
+#define OPTION_MP_CAPABLE (1 << 3)
|
|
+#define OPTION_DATA_ACK (1 << 4)
|
|
+#define OPTION_ADD_ADDR (1 << 5)
|
|
+#define OPTION_MP_JOIN (1 << 6)
|
|
+#define OPTION_MP_FAIL (1 << 7)
|
|
+#define OPTION_MP_FCLOSE (1 << 8)
|
|
+#define OPTION_REMOVE_ADDR (1 << 9)
|
|
+#define OPTION_MP_PRIO (1 << 10)
|
|
+
|
|
+/* MPTCP flags: both TX and RX */
|
|
+#define MPTCPHDR_SEQ 0x01 /* DSS.M option is present */
|
|
+#define MPTCPHDR_FIN 0x02 /* DSS.F option is present */
|
|
+#define MPTCPHDR_SEQ64_INDEX 0x04 /* index of seq in mpcb->snd_high_order */
|
|
+#define MPTCPHDR_MPC_DATA 0x08
|
|
+/* MPTCP flags: RX only */
|
|
+#define MPTCPHDR_ACK 0x10
|
|
+#define MPTCPHDR_SEQ64_SET 0x20 /* Did we receive a 64-bit seq number? */
|
|
+#define MPTCPHDR_SEQ64_OFO 0x40 /* Is it not in our circular array? */
|
|
+#define MPTCPHDR_DSS_CSUM 0x80
|
|
+/* MPTCP flags: TX only */
|
|
+#define MPTCPHDR_INF 0x10
|
|
+#define MPTCP_REINJECT 0x20 /* Did we reinject this segment? */
|
|
+
|
|
+struct mptcp_option {
|
|
+ __u8 kind;
|
|
+ __u8 len;
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ __u8 ver:4,
|
|
+ sub:4;
|
|
+#elif defined(__BIG_ENDIAN_BITFIELD)
|
|
+ __u8 sub:4,
|
|
+ ver:4;
|
|
+#else
|
|
+#error "Adjust your <asm/byteorder.h> defines"
|
|
+#endif
|
|
+};
|
|
+
|
|
+struct mp_capable {
|
|
+ __u8 kind;
|
|
+ __u8 len;
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ __u8 ver:4,
|
|
+ sub:4;
|
|
+ __u8 h:1,
|
|
+ rsv:5,
|
|
+ b:1,
|
|
+ a:1;
|
|
+#elif defined(__BIG_ENDIAN_BITFIELD)
|
|
+ __u8 sub:4,
|
|
+ ver:4;
|
|
+ __u8 a:1,
|
|
+ b:1,
|
|
+ rsv:5,
|
|
+ h:1;
|
|
+#else
|
|
+#error "Adjust your <asm/byteorder.h> defines"
|
|
+#endif
|
|
+ __u64 sender_key;
|
|
+ __u64 receiver_key;
|
|
+} __attribute__((__packed__));
|
|
+
|
|
+struct mp_join {
|
|
+ __u8 kind;
|
|
+ __u8 len;
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ __u8 b:1,
|
|
+ rsv:3,
|
|
+ sub:4;
|
|
+#elif defined(__BIG_ENDIAN_BITFIELD)
|
|
+ __u8 sub:4,
|
|
+ rsv:3,
|
|
+ b:1;
|
|
+#else
|
|
+#error "Adjust your <asm/byteorder.h> defines"
|
|
+#endif
|
|
+ __u8 addr_id;
|
|
+ union {
|
|
+ struct {
|
|
+ u32 token;
|
|
+ u32 nonce;
|
|
+ } syn;
|
|
+ struct {
|
|
+ __u64 mac;
|
|
+ u32 nonce;
|
|
+ } synack;
|
|
+ struct {
|
|
+ __u8 mac[20];
|
|
+ } ack;
|
|
+ } u;
|
|
+} __attribute__((__packed__));
|
|
+
|
|
+struct mp_dss {
|
|
+ __u8 kind;
|
|
+ __u8 len;
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ __u16 rsv1:4,
|
|
+ sub:4,
|
|
+ A:1,
|
|
+ a:1,
|
|
+ M:1,
|
|
+ m:1,
|
|
+ F:1,
|
|
+ rsv2:3;
|
|
+#elif defined(__BIG_ENDIAN_BITFIELD)
|
|
+ __u16 sub:4,
|
|
+ rsv1:4,
|
|
+ rsv2:3,
|
|
+ F:1,
|
|
+ m:1,
|
|
+ M:1,
|
|
+ a:1,
|
|
+ A:1;
|
|
+#else
|
|
+#error "Adjust your <asm/byteorder.h> defines"
|
|
+#endif
|
|
+};
|
|
+
|
|
+struct mp_add_addr {
|
|
+ __u8 kind;
|
|
+ __u8 len;
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ union {
|
|
+ struct {
|
|
+ __u8 ipver:4,
|
|
+ sub:4;
|
|
+ } v0;
|
|
+ struct {
|
|
+ __u8 echo:1,
|
|
+ rsv:3,
|
|
+ sub:4;
|
|
+ } v1;
|
|
+ } u_bit;
|
|
+#elif defined(__BIG_ENDIAN_BITFIELD)
|
|
+ union {
|
|
+ struct {
|
|
+ __u8 sub:4,
|
|
+ ipver:4;
|
|
+ } v0;
|
|
+ struct {
|
|
+ __u8 sub:4,
|
|
+ rsv:3,
|
|
+ echo:1;
|
|
+ } v1;
|
|
+ } u_bit;
|
|
+#else
|
|
+#error "Adjust your <asm/byteorder.h> defines"
|
|
+#endif
|
|
+ __u8 addr_id;
|
|
+ union {
|
|
+ struct {
|
|
+ struct in_addr addr;
|
|
+ __be16 port;
|
|
+ __u8 mac[8];
|
|
+ } v4;
|
|
+ struct {
|
|
+ struct in6_addr addr;
|
|
+ __be16 port;
|
|
+ __u8 mac[8];
|
|
+ } v6;
|
|
+ } u;
|
|
+} __attribute__((__packed__));
|
|
+
|
|
+struct mp_remove_addr {
|
|
+ __u8 kind;
|
|
+ __u8 len;
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ __u8 rsv:4,
|
|
+ sub:4;
|
|
+#elif defined(__BIG_ENDIAN_BITFIELD)
|
|
+ __u8 sub:4,
|
|
+ rsv:4;
|
|
+#else
|
|
+#error "Adjust your <asm/byteorder.h> defines"
|
|
+#endif
|
|
+ /* list of addr_id */
|
|
+ __u8 addrs_id;
|
|
+};
|
|
+
|
|
+struct mp_fail {
|
|
+ __u8 kind;
|
|
+ __u8 len;
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ __u16 rsv1:4,
|
|
+ sub:4,
|
|
+ rsv2:8;
|
|
+#elif defined(__BIG_ENDIAN_BITFIELD)
|
|
+ __u16 sub:4,
|
|
+ rsv1:4,
|
|
+ rsv2:8;
|
|
+#else
|
|
+#error "Adjust your <asm/byteorder.h> defines"
|
|
+#endif
|
|
+ __be64 data_seq;
|
|
+} __attribute__((__packed__));
|
|
+
|
|
+struct mp_fclose {
|
|
+ __u8 kind;
|
|
+ __u8 len;
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ __u16 rsv1:4,
|
|
+ sub:4,
|
|
+ rsv2:8;
|
|
+#elif defined(__BIG_ENDIAN_BITFIELD)
|
|
+ __u16 sub:4,
|
|
+ rsv1:4,
|
|
+ rsv2:8;
|
|
+#else
|
|
+#error "Adjust your <asm/byteorder.h> defines"
|
|
+#endif
|
|
+ __u64 key;
|
|
+} __attribute__((__packed__));
|
|
+
|
|
+struct mp_prio {
|
|
+ __u8 kind;
|
|
+ __u8 len;
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ __u8 b:1,
|
|
+ rsv:3,
|
|
+ sub:4;
|
|
+#elif defined(__BIG_ENDIAN_BITFIELD)
|
|
+ __u8 sub:4,
|
|
+ rsv:3,
|
|
+ b:1;
|
|
+#else
|
|
+#error "Adjust your <asm/byteorder.h> defines"
|
|
+#endif
|
|
+ __u8 addr_id;
|
|
+} __attribute__((__packed__));
|
|
+
|
|
+struct mptcp_hashtable {
|
|
+ struct hlist_nulls_head *hashtable;
|
|
+ unsigned int mask;
|
|
+};
|
|
+
|
|
+static inline int mptcp_sub_len_dss(const struct mp_dss *m, const int csum)
|
|
+{
|
|
+ return 4 + m->A * (4 + m->a * 4) + m->M * (10 + m->m * 4 + csum * 2);
|
|
+}
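As a worked example of the length computation above: a DSS option with a 32-bit data ack and a 32-bit mapping but no checksum (A=1, a=0, M=1, m=0, csum=0) yields 4 + 4 + 10 = 18 bytes, matching MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK + MPTCP_SUB_LEN_SEQ defined earlier.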
|
|
+
|
|
+#define MPTCP_ENABLE 0x01
|
|
+#define MPTCP_SOCKOPT 0x02
|
|
+#define MPTCP_CLIENT_DISABLE 0x04
|
|
+#define MPTCP_SERVER_DISABLE 0x08
|
|
+
|
|
+extern int sysctl_mptcp_enabled;
|
|
+extern int sysctl_mptcp_version;
|
|
+extern int sysctl_mptcp_checksum;
|
|
+extern int sysctl_mptcp_debug;
|
|
+extern int sysctl_mptcp_syn_retries;
|
|
+
|
|
+extern struct workqueue_struct *mptcp_wq;
|
|
+
|
|
+#define mptcp_debug(fmt, args...) \
|
|
+ do { \
|
|
+ if (unlikely(sysctl_mptcp_debug)) \
|
|
+ pr_err(fmt, ##args); \
|
|
+ } while (0)
|
|
+
|
|
+static inline struct sock *mptcp_to_sock(const struct mptcp_tcp_sock *mptcp)
|
|
+{
|
|
+ return (struct sock *)mptcp->tp;
|
|
+}
|
|
+
|
|
+#define mptcp_for_each_sub(__mpcb, __mptcp) \
|
|
+ hlist_for_each_entry_rcu(__mptcp, &((__mpcb)->conn_list), node)
|
|
+
|
|
+/* Must be called with the appropriate lock held */
|
|
+#define mptcp_for_each_sub_safe(__mpcb, __mptcp, __tmp) \
|
|
+ hlist_for_each_entry_safe(__mptcp, __tmp, &((__mpcb)->conn_list), node)
|
|
+
|
|
+/* Iterates over all bits set to 1 in a bitset */
|
|
+#define mptcp_for_each_bit_set(b, i) \
|
|
+ for (i = ffs(b) - 1; i >= 0; i = ffs(b >> (i + 1) << (i + 1)) - 1)
|
|
+
|
|
+#define mptcp_for_each_bit_unset(b, i) \
|
|
+ mptcp_for_each_bit_set(~b, i)
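A short, purely illustrative use of the bit-iteration macros above (the values are hypothetical, not from the patch):

/* With b = 0x0A (bits 1 and 3 set), the body runs with i == 1 and then i == 3. */
int i;
u8 b = 0x0A;

mptcp_for_each_bit_set(b, i)
	pr_debug("path-index bit %d is set\n", i);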
|
|
+
|
|
+#define MPTCP_INC_STATS(net, field) SNMP_INC_STATS((net)->mptcp.mptcp_statistics, field)
|
|
+#define MPTCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mptcp.mptcp_statistics, field)
|
|
+
|
|
+enum
|
|
+{
|
|
+ MPTCP_MIB_NUM = 0,
|
|
+ MPTCP_MIB_MPCAPABLEPASSIVE, /* Received SYN with MP_CAPABLE */
|
|
+ MPTCP_MIB_MPCAPABLEACTIVE, /* Sent SYN with MP_CAPABLE */
|
|
+ MPTCP_MIB_MPCAPABLEACTIVEACK, /* Received SYN/ACK with MP_CAPABLE */
|
|
+ MPTCP_MIB_MPCAPABLEPASSIVEACK, /* Received third ACK with MP_CAPABLE */
|
|
+ MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */
|
|
+ MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */
|
|
+ MPTCP_MIB_MPCAPABLERETRANSFALLBACK,/* Client-side stopped sending MP_CAPABLE after too many SYN-retransmissions */
|
|
+ MPTCP_MIB_CSUMENABLED, /* Created MPTCP-connection with DSS-checksum enabled */
|
|
+ MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */
|
|
+ MPTCP_MIB_MPFAILRX, /* Received an MP_FAIL */
|
|
+ MPTCP_MIB_CSUMFAIL, /* Received segment with invalid checksum */
|
|
+ MPTCP_MIB_FASTCLOSERX, /* Received a FAST_CLOSE */
|
|
+ MPTCP_MIB_FASTCLOSETX, /* Sent a FAST_CLOSE */
|
|
+ MPTCP_MIB_FBACKSUB, /* Fallback upon ack without data-ack on new subflow */
|
|
+ MPTCP_MIB_FBACKINIT, /* Fallback upon ack without data-ack on initial subflow */
|
|
+ MPTCP_MIB_FBDATASUB, /* Fallback upon data without DSS at the beginning on new subflow */
|
|
+ MPTCP_MIB_FBDATAINIT, /* Fallback upon data without DSS at the beginning on initial subflow */
|
|
+ MPTCP_MIB_REMADDRSUB, /* Remove subflow due to REMOVE_ADDR */
|
|
+ MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */
|
|
+ MPTCP_MIB_JOINFALLBACK, /* Received MP_JOIN on session that has fallen back to reg. TCP */
|
|
+ MPTCP_MIB_JOINSYNTX, /* Sent a SYN + MP_JOIN */
|
|
+ MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */
|
|
+ MPTCP_MIB_JOINSYNACKRX, /* Received a SYN/ACK + MP_JOIN */
|
|
+ MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */
|
|
+ MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */
|
|
+ MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */
|
|
+ MPTCP_MIB_JOINACKFAIL, /* Third ACK on new subflow did not contain an MP_JOIN */
|
|
+ MPTCP_MIB_JOINACKRTO, /* Retransmission timer for third ACK + MP_JOIN timed out */
|
|
+ MPTCP_MIB_JOINACKRXMIT, /* Retransmitted an ACK + MP_JOIN */
|
|
+ MPTCP_MIB_NODSSWINDOW, /* Received too many packets without a DSS-option */
|
|
+ MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */
|
|
+ MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */
|
|
+ MPTCP_MIB_DSSTCPMISMATCH, /* DSS-mapping did not map with TCP's sequence numbers */
|
|
+ MPTCP_MIB_DSSTRIMHEAD, /* Trimmed segment at the head (coalescing middlebox) */
|
|
+ MPTCP_MIB_DSSSPLITTAIL, /* Trimmed segment at the tail (coalescing middlebox) */
|
|
+ MPTCP_MIB_PURGEOLD, /* Removed old skb from the rcv-queue due to missing DSS-mapping */
|
|
+ MPTCP_MIB_ADDADDRRX, /* Received an ADD_ADDR */
|
|
+ MPTCP_MIB_ADDADDRTX, /* Sent an ADD_ADDR */
|
|
+ MPTCP_MIB_REMADDRRX, /* Received a REMOVE_ADDR */
|
|
+ MPTCP_MIB_REMADDRTX, /* Sent a REMOVE_ADDR */
|
|
+ MPTCP_MIB_JOINALTERNATEPORT, /* Established a subflow on a different destination port-number */
|
|
+ MPTCP_MIB_CURRESTAB, /* Current established MPTCP connections */
|
|
+ __MPTCP_MIB_MAX
|
|
+};
|
|
+
|
|
+#define MPTCP_MIB_MAX __MPTCP_MIB_MAX
|
|
+struct mptcp_mib {
|
|
+ unsigned long mibs[MPTCP_MIB_MAX];
|
|
+};
|
|
+
|
|
+extern struct lock_class_key meta_key;
|
|
+extern char *meta_key_name;
|
|
+extern struct lock_class_key meta_slock_key;
|
|
+extern char *meta_slock_key_name;
|
|
+
|
|
+extern siphash_key_t mptcp_secret;
|
|
+
|
|
+/* This is needed to ensure that two subsequent key/nonce generations result in
|
|
+ * different keys/nonces if the IPs and ports are the same.
|
|
+ */
|
|
+extern u32 mptcp_seed;
|
|
+
|
|
+extern struct mptcp_hashtable mptcp_tk_htable;
|
|
+
|
|
+/* Request-sockets can be hashed in the tk_htb for collision-detection or in
|
|
+ * the regular htb for join-connections. We need to define different NULLS
|
|
+ * values so that we can correctly detect a request-socket that has been
|
|
+ * recycled. See also c25eb3bfb9729.
|
|
+ */
|
|
+#define MPTCP_REQSK_NULLS_BASE (1U << 29)
|
|
+
|
|
+
|
|
+void mptcp_data_ready(struct sock *sk);
|
|
+void mptcp_write_space(struct sock *sk);
|
|
+
|
|
+void mptcp_add_meta_ofo_queue(const struct sock *meta_sk, struct sk_buff *skb,
|
|
+ struct sock *sk);
|
|
+void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied);
|
|
+int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id,
|
|
+ gfp_t flags);
|
|
+void mptcp_del_sock(struct sock *sk);
|
|
+void mptcp_update_metasocket(const struct sock *meta_sk);
|
|
+void mptcp_reinject_data(struct sock *orig_sk, int clone_it);
|
|
+void mptcp_update_sndbuf(const struct tcp_sock *tp);
|
|
+void mptcp_send_fin(struct sock *meta_sk);
|
|
+void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority);
|
|
+bool mptcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
|
|
+ int push_one, gfp_t gfp);
|
|
+void tcp_parse_mptcp_options(const struct sk_buff *skb,
|
|
+ struct mptcp_options_received *mopt);
|
|
+void mptcp_parse_options(const uint8_t *ptr, int opsize,
|
|
+ struct mptcp_options_received *mopt,
|
|
+ const struct sk_buff *skb,
|
|
+ struct tcp_sock *tp);
|
|
+void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts,
|
|
+ unsigned *remaining);
|
|
+void mptcp_synack_options(struct request_sock *req,
|
|
+ struct tcp_out_options *opts,
|
|
+ unsigned *remaining);
|
|
+void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
|
|
+ struct tcp_out_options *opts, unsigned *size);
|
|
+void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
|
|
+ const struct tcp_out_options *opts,
|
|
+ struct sk_buff *skb);
|
|
+void mptcp_close(struct sock *meta_sk, long timeout);
|
|
+bool mptcp_doit(struct sock *sk);
|
|
+int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key,
|
|
+ int rem_key_set, __u8 mptcp_ver, u32 window);
|
|
+int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req);
|
|
+int mptcp_check_req_master(struct sock *sk, struct sock *child,
|
|
+ struct request_sock *req, const struct sk_buff *skb,
|
|
+ const struct mptcp_options_received *mopt,
|
|
+ int drop, u32 tsoff);
|
|
+struct sock *mptcp_check_req_child(struct sock *meta_sk,
|
|
+ struct sock *child,
|
|
+ struct request_sock *req,
|
|
+ struct sk_buff *skb,
|
|
+ const struct mptcp_options_received *mopt);
|
|
+u32 __mptcp_select_window(struct sock *sk);
|
|
+void mptcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
|
|
+ __u32 *rcv_wnd, __u32 *window_clamp,
|
|
+ int wscale_ok, __u8 *rcv_wscale,
|
|
+ __u32 init_rcv_wnd);
|
|
+unsigned int mptcp_current_mss(struct sock *meta_sk);
|
|
+void mptcp_hmac(u8 ver, const u8 *key_1, const u8 *key_2, u8 *hash_out,
|
|
+ int arg_num, ...);
|
|
+void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk);
|
|
+void mptcp_fin(struct sock *meta_sk);
|
|
+void mptcp_meta_retransmit_timer(struct sock *meta_sk);
|
|
+void mptcp_sub_retransmit_timer(struct sock *sk);
|
|
+int mptcp_write_wakeup(struct sock *meta_sk, int mib);
|
|
+void mptcp_sub_close_wq(struct work_struct *work);
|
|
+void mptcp_sub_close(struct sock *sk, unsigned long delay);
|
|
+struct sock *mptcp_select_ack_sock(const struct sock *meta_sk);
|
|
+void mptcp_prepare_for_backlog(struct sock *sk, struct sk_buff *skb);
|
|
+void mptcp_initialize_recv_vars(struct tcp_sock *meta_tp, struct mptcp_cb *mpcb,
|
|
+ __u64 remote_key);
|
|
+int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb);
|
|
+void mptcp_ack_handler(struct timer_list *t);
|
|
+bool mptcp_check_rtt(const struct tcp_sock *tp, int time);
|
|
+int mptcp_check_snd_buf(const struct tcp_sock *tp);
|
|
+bool mptcp_handle_options(struct sock *sk, const struct tcphdr *th,
|
|
+ const struct sk_buff *skb);
|
|
+void __init mptcp_init(void);
|
|
+void mptcp_destroy_sock(struct sock *sk);
|
|
+int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
|
|
+ const struct sk_buff *skb,
|
|
+ const struct mptcp_options_received *mopt);
|
|
+unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now,
|
|
+ int large_allowed);
|
|
+int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw);
|
|
+void mptcp_twsk_destructor(struct tcp_timewait_sock *tw);
|
|
+void mptcp_time_wait(struct sock *sk, int state, int timeo);
|
|
+void mptcp_disconnect(struct sock *meta_sk);
|
|
+bool mptcp_should_expand_sndbuf(const struct sock *sk);
|
|
+int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb);
|
|
+void mptcp_tsq_flags(struct sock *sk);
|
|
+void mptcp_tsq_sub_deferred(struct sock *meta_sk);
|
|
+struct mp_join *mptcp_find_join(const struct sk_buff *skb);
|
|
+void mptcp_hash_remove_bh(struct tcp_sock *meta_tp);
|
|
+struct sock *mptcp_hash_find(const struct net *net, const u32 token);
|
|
+int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw);
|
|
+int mptcp_do_join_short(struct sk_buff *skb,
|
|
+ const struct mptcp_options_received *mopt,
|
|
+ struct net *net);
|
|
+void mptcp_reqsk_destructor(struct request_sock *req);
|
|
+void mptcp_connect_init(struct sock *sk);
|
|
+void mptcp_sub_force_close(struct sock *sk);
|
|
+int mptcp_sub_len_remove_addr_align(u16 bitfield);
|
|
+void mptcp_join_reqsk_init(const struct mptcp_cb *mpcb,
|
|
+ const struct request_sock *req,
|
|
+ struct sk_buff *skb);
|
|
+void mptcp_reqsk_init(struct request_sock *req, const struct sock *sk,
|
|
+ const struct sk_buff *skb, bool want_cookie);
|
|
+int mptcp_conn_request(struct sock *sk, struct sk_buff *skb);
|
|
+void mptcp_enable_sock(struct sock *sk);
|
|
+void mptcp_disable_sock(struct sock *sk);
|
|
+void mptcp_disable_static_key(void);
|
|
+void mptcp_cookies_reqsk_init(struct request_sock *req,
|
|
+ struct mptcp_options_received *mopt,
|
|
+ struct sk_buff *skb);
|
|
+void mptcp_mpcb_put(struct mptcp_cb *mpcb);
|
|
+int mptcp_finish_handshake(struct sock *child, struct sk_buff *skb);
|
|
+int mptcp_get_info(const struct sock *meta_sk, char __user *optval, int optlen);
|
|
+void mptcp_clear_sk(struct sock *sk, int size);
|
|
+
|
|
+/* MPTCP-path-manager registration/initialization functions */
|
|
+int mptcp_register_path_manager(struct mptcp_pm_ops *pm);
|
|
+void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm);
|
|
+void mptcp_init_path_manager(struct mptcp_cb *mpcb);
|
|
+void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb);
|
|
+void mptcp_fallback_default(struct mptcp_cb *mpcb);
|
|
+void mptcp_get_default_path_manager(char *name);
|
|
+int mptcp_set_scheduler(struct sock *sk, const char *name);
|
|
+int mptcp_set_path_manager(struct sock *sk, const char *name);
|
|
+int mptcp_set_default_path_manager(const char *name);
|
|
+extern struct mptcp_pm_ops mptcp_pm_default;
|
|
+
|
|
+/* MPTCP-scheduler registration/initialization functions */
|
|
+int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
|
|
+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
|
|
+void mptcp_init_scheduler(struct mptcp_cb *mpcb);
|
|
+void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb);
|
|
+void mptcp_get_default_scheduler(char *name);
|
|
+int mptcp_set_default_scheduler(const char *name);
|
|
+bool mptcp_is_available(struct sock *sk, const struct sk_buff *skb,
|
|
+ bool zero_wnd_test);
|
|
+bool mptcp_is_def_unavailable(struct sock *sk);
|
|
+bool subflow_is_active(const struct tcp_sock *tp);
|
|
+bool subflow_is_backup(const struct tcp_sock *tp);
|
|
+struct sock *get_available_subflow(struct sock *meta_sk, struct sk_buff *skb,
|
|
+ bool zero_wnd_test);
|
|
+struct sk_buff *mptcp_next_segment(struct sock *meta_sk,
|
|
+ int *reinject,
|
|
+ struct sock **subsk,
|
|
+ unsigned int *limit);
|
|
+extern struct mptcp_sched_ops mptcp_sched_default;
|
|
+
|
|
+/* Initializes function-pointers and MPTCP-flags */
|
|
+static inline void mptcp_init_tcp_sock(struct sock *sk)
|
|
+{
|
|
+ if (!mptcp_init_failed && sysctl_mptcp_enabled == MPTCP_ENABLE)
|
|
+ mptcp_enable_sock(sk);
|
|
+}
|
|
+
|
|
+static inline void mptcp_init_listen(struct sock *sk)
|
|
+{
|
|
+ if (!mptcp_init_failed &&
|
|
+ sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP &&
|
|
+#ifdef CONFIG_TCP_MD5SIG
|
|
+ !rcu_access_pointer(tcp_sk(sk)->md5sig_info) &&
|
|
+#endif
|
|
+ sysctl_mptcp_enabled & MPTCP_ENABLE &&
|
|
+ !(sysctl_mptcp_enabled & MPTCP_SERVER_DISABLE))
|
|
+ mptcp_enable_sock(sk);
|
|
+}
|
|
+
|
|
+static inline void mptcp_init_connect(struct sock *sk)
|
|
+{
|
|
+ if (!mptcp_init_failed &&
|
|
+ sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP &&
|
|
+#ifdef CONFIG_TCP_MD5SIG
|
|
+ !rcu_access_pointer(tcp_sk(sk)->md5sig_info) &&
|
|
+#endif
|
|
+ sysctl_mptcp_enabled & MPTCP_ENABLE &&
|
|
+ !(sysctl_mptcp_enabled & MPTCP_CLIENT_DISABLE))
|
|
+ mptcp_enable_sock(sk);
|
|
+}
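Tying the two helpers above back to the sysctl bits defined earlier: for example, setting sysctl_mptcp_enabled to MPTCP_ENABLE | MPTCP_CLIENT_DISABLE (0x1 | 0x4 = 0x5) keeps MPTCP on for listening sockets via mptcp_init_listen(), while mptcp_init_connect() bails out, so outgoing connections stay plain TCP.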
|
|
+
|
|
+static inline int mptcp_pi_to_flag(int pi)
|
|
+{
|
|
+ return 1 << (pi - 1);
|
|
+}
|
|
+
|
|
+static inline
|
|
+struct mptcp_request_sock *mptcp_rsk(const struct request_sock *req)
|
|
+{
|
|
+ return (struct mptcp_request_sock *)req;
|
|
+}
|
|
+
|
|
+static inline
|
|
+struct request_sock *rev_mptcp_rsk(const struct mptcp_request_sock *req)
|
|
+{
|
|
+ return (struct request_sock *)req;
|
|
+}
|
|
+
|
|
+static inline bool mptcp_can_sendpage(struct sock *sk)
|
|
+{
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ if (tcp_sk(sk)->mpcb->dss_csum)
|
|
+ return false;
|
|
+
|
|
+ mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (!(sk_it->sk_route_caps & NETIF_F_SG))
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static inline void mptcp_push_pending_frames(struct sock *meta_sk)
|
|
+{
|
|
+ /* We check packets out and send-head here. TCP only checks the
|
|
+ * send-head. But, MPTCP also checks packets_out, as this is an
|
|
+ * indication that we might want to do opportunistic reinjection.
|
|
+ */
|
|
+ if (tcp_sk(meta_sk)->packets_out || tcp_send_head(meta_sk)) {
|
|
+ struct tcp_sock *tp = tcp_sk(meta_sk);
|
|
+
|
|
+ /* We don't care about the MSS, because it will be set in
|
|
+ * mptcp_write_xmit.
|
|
+ */
|
|
+ __tcp_push_pending_frames(meta_sk, 0, tp->nonagle);
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline void mptcp_send_reset(struct sock *sk)
|
|
+{
|
|
+ if (tcp_need_reset(sk->sk_state))
|
|
+ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
|
|
+ mptcp_sub_force_close(sk);
|
|
+}
|
|
+
|
|
+static inline void mptcp_sub_force_close_all(struct mptcp_cb *mpcb,
|
|
+ struct sock *except)
|
|
+{
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct hlist_node *tmp;
|
|
+
|
|
+ mptcp_for_each_sub_safe(mpcb, mptcp, tmp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (sk_it != except)
|
|
+ mptcp_send_reset(sk_it);
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline bool mptcp_is_data_mpcapable(const struct sk_buff *skb)
|
|
+{
|
|
+ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_MPC_DATA;
|
|
+}
|
|
+
|
|
+static inline bool mptcp_is_data_seq(const struct sk_buff *skb)
|
|
+{
|
|
+ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ;
|
|
+}
|
|
+
|
|
+static inline bool mptcp_is_data_fin(const struct sk_buff *skb)
|
|
+{
|
|
+ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_FIN;
|
|
+}
|
|
+
|
|
+/* Is it a data-fin while in infinite mapping mode?
|
|
+ * In infinite mode, a subflow-fin is in fact a data-fin.
|
|
+ */
|
|
+static inline bool mptcp_is_data_fin2(const struct sk_buff *skb,
|
|
+ const struct tcp_sock *tp)
|
|
+{
|
|
+ return mptcp_is_data_fin(skb) ||
|
|
+ (tp->mpcb->infinite_mapping_rcv &&
|
|
+ (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN));
|
|
+}
|
|
+
|
|
+static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb)
|
|
+{
|
|
+ u64 data_seq_high = (u32)(data_seq >> 32);
|
|
+
|
|
+ if (mpcb->rcv_high_order[0] == data_seq_high)
|
|
+ return 0;
|
|
+ else if (mpcb->rcv_high_order[1] == data_seq_high)
|
|
+ return MPTCPHDR_SEQ64_INDEX;
|
|
+ else
|
|
+ return MPTCPHDR_SEQ64_OFO;
|
|
+}
|
|
+
|
|
+/* Sets the data_seq and returns pointer to the in-skb field of the data_seq.
|
|
+ * If the packet has a 64-bit dseq, the pointer points to the last 32 bits.
|
|
+ */
|
|
+static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb,
|
|
+ u32 *data_seq,
|
|
+ struct mptcp_cb *mpcb)
|
|
+{
|
|
+ __u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off);
|
|
+
|
|
+ if (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET) {
|
|
+ u64 data_seq64 = get_unaligned_be64(ptr);
|
|
+
|
|
+ if (mpcb)
|
|
+ TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(data_seq64, mpcb);
|
|
+
|
|
+ *data_seq = (u32)data_seq64;
|
|
+ ptr++;
|
|
+ } else {
|
|
+ *data_seq = get_unaligned_be32(ptr);
|
|
+ }
|
|
+
|
|
+ return ptr;
|
|
+}
|
|
+
|
|
+static inline struct sock *mptcp_meta_sk(const struct sock *sk)
|
|
+{
|
|
+ return tcp_sk(sk)->meta_sk;
|
|
+}
|
|
+
|
|
+static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
|
|
+{
|
|
+ return tcp_sk(tp->meta_sk);
|
|
+}
|
|
+
|
|
+static inline int is_meta_tp(const struct tcp_sock *tp)
|
|
+{
|
|
+ return tp->mpcb && mptcp_meta_tp(tp) == tp;
|
|
+}
|
|
+
|
|
+static inline int is_meta_sk(const struct sock *sk)
|
|
+{
|
|
+ return sk->sk_state != TCP_NEW_SYN_RECV &&
|
|
+ sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP &&
|
|
+ mptcp(tcp_sk(sk)) && mptcp_meta_sk(sk) == sk;
|
|
+}
|
|
+
|
|
+static inline int is_master_tp(const struct tcp_sock *tp)
|
|
+{
|
|
+ return !mptcp(tp) || (!tp->mptcp->slave_sk && !is_meta_tp(tp));
|
|
+}
|
|
+
|
|
+static inline void mptcp_init_mp_opt(struct mptcp_options_received *mopt)
|
|
+{
|
|
+ mopt->saw_mpc = 0;
|
|
+ mopt->dss_csum = 0;
|
|
+ mopt->drop_me = 0;
|
|
+
|
|
+ mopt->is_mp_join = 0;
|
|
+ mopt->join_ack = 0;
|
|
+
|
|
+ mopt->saw_low_prio = 0;
|
|
+ mopt->low_prio = 0;
|
|
+
|
|
+ mopt->saw_add_addr = 0;
|
|
+ mopt->more_add_addr = 0;
|
|
+
|
|
+ mopt->saw_rem_addr = 0;
|
|
+ mopt->more_rem_addr = 0;
|
|
+
|
|
+ mopt->mp_fail = 0;
|
|
+ mopt->mp_fclose = 0;
|
|
+}
|
|
+
|
|
+static inline void mptcp_reset_mopt(struct tcp_sock *tp)
|
|
+{
|
|
+ struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
|
|
+
|
|
+ mopt->saw_low_prio = 0;
|
|
+ mopt->saw_add_addr = 0;
|
|
+ mopt->more_add_addr = 0;
|
|
+ mopt->saw_rem_addr = 0;
|
|
+ mopt->more_rem_addr = 0;
|
|
+ mopt->join_ack = 0;
|
|
+ mopt->mp_fail = 0;
|
|
+ mopt->mp_fclose = 0;
|
|
+}
|
|
+
|
|
+static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb,
|
|
+ const struct mptcp_cb *mpcb)
|
|
+{
|
|
+ return htonl(mpcb->snd_high_order[(TCP_SKB_CB(skb)->mptcp_flags &
|
|
+ MPTCPHDR_SEQ64_INDEX) ? 1 : 0]);
|
|
+}
|
|
+
|
|
+static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index,
|
|
+ u32 data_seq_32)
|
|
+{
|
|
+ return ((u64)mpcb->rcv_high_order[index] << 32) | data_seq_32;
|
|
+}
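For example, mptcp_get_data_seq_64() with rcv_high_order[index] == 0x00000001 and data_seq_32 == 0x00000010 reconstructs the 64-bit data sequence number 0x0000000100000010.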
|
|
+
|
|
+static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp)
|
|
+{
|
|
+ struct mptcp_cb *mpcb = meta_tp->mpcb;
|
|
+ return mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
|
|
+ meta_tp->rcv_nxt);
|
|
+}
|
|
+
|
|
+static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc)
|
|
+{
|
|
+ if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) {
|
|
+ struct mptcp_cb *mpcb = meta_tp->mpcb;
|
|
+ mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
|
|
+ mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2;
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp,
|
|
+ u32 old_rcv_nxt)
|
|
+{
|
|
+ if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) {
|
|
+ struct mptcp_cb *mpcb = meta_tp->mpcb;
|
|
+ mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2;
|
|
+ mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 0 : 1;
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline int mptcp_sk_can_send(const struct sock *sk)
|
|
+{
|
|
+ return tcp_passive_fastopen(sk) ||
|
|
+ ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
|
|
+ !tcp_sk(sk)->mptcp->pre_established);
|
|
+}
|
|
+
|
|
+static inline int mptcp_sk_can_recv(const struct sock *sk)
|
|
+{
|
|
+ return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2);
|
|
+}
|
|
+
|
|
+static inline int mptcp_sk_can_send_ack(const struct sock *sk)
|
|
+{
|
|
+ return !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV |
|
|
+ TCPF_CLOSE | TCPF_LISTEN)) &&
|
|
+ !tcp_sk(sk)->mptcp->pre_established;
|
|
+}
|
|
+
|
|
+static inline bool mptcp_can_sg(const struct sock *meta_sk)
|
|
+{
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ if (tcp_sk(meta_sk)->mpcb->dss_csum)
|
|
+ return false;
|
|
+
|
|
+ mptcp_for_each_sub(tcp_sk(meta_sk)->mpcb, mptcp) {
|
|
+ struct sock *sk = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (!mptcp_sk_can_send(sk))
|
|
+ continue;
|
|
+ if (!(sk->sk_route_caps & NETIF_F_SG))
|
|
+ return false;
|
|
+ }
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static inline void mptcp_set_rto(struct sock *sk)
|
|
+{
|
|
+ struct inet_connection_sock *micsk = inet_csk(mptcp_meta_sk(sk));
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ __u32 max_rto = 0;
|
|
+
|
|
+ /* We are in recovery-phase on the MPTCP-level. Do not update the
|
|
+ * RTO, because this would kill exponential backoff.
|
|
+ */
|
|
+ if (micsk->icsk_retransmits)
|
|
+ return;
|
|
+
|
|
+ mptcp_for_each_sub(tp->mpcb, mptcp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if ((mptcp_sk_can_send(sk_it) || sk_it->sk_state == TCP_SYN_RECV) &&
|
|
+ inet_csk(sk_it)->icsk_retransmits == 0 &&
|
|
+ inet_csk(sk_it)->icsk_backoff == 0 &&
|
|
+ inet_csk(sk_it)->icsk_rto > max_rto)
|
|
+ max_rto = inet_csk(sk_it)->icsk_rto;
|
|
+ }
|
|
+ if (max_rto) {
|
|
+ micsk->icsk_rto = max_rto << 1;
|
|
+
|
|
+ /* A successful rto-measurement - reset the backoff counter */
|
|
+ micsk->icsk_backoff = 0;
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline void mptcp_sub_close_passive(struct sock *sk)
|
|
+{
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(meta_sk);
|
|
+
|
|
+ /* Only close, if the app did a send-shutdown (passive close), and we
|
|
+ * received the data-ack of the data-fin.
|
|
+ */
|
|
+ if (tp->mpcb->passive_close && meta_tp->snd_una == meta_tp->write_seq)
|
|
+ mptcp_sub_close(sk, 0);
|
|
+}
|
|
+
|
|
+static inline void mptcp_fallback_close(struct mptcp_cb *mpcb,
|
|
+ struct sock *except)
|
|
+{
|
|
+ mptcp_sub_force_close_all(mpcb, except);
|
|
+
|
|
+ if (mpcb->pm_ops->close_session)
|
|
+ mpcb->pm_ops->close_session(mptcp_meta_sk(except));
|
|
+}
|
|
+
|
|
+static inline bool mptcp_fallback_infinite(struct sock *sk, int flag)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct mptcp_cb *mpcb = tp->mpcb;
|
|
+
|
|
+ /* If data has been acknowledged on the meta-level, fully_established
|
|
+ * will have been set before and thus we will not fall back to infinite
|
|
+ * mapping.
|
|
+ */
|
|
+ if (likely(tp->mptcp->fully_established))
|
|
+ return false;
|
|
+
|
|
+ if (!(flag & MPTCP_FLAG_DATA_ACKED))
|
|
+ return false;
|
|
+
|
|
+ /* Don't fallback twice ;) */
|
|
+ if (mpcb->infinite_mapping_snd)
|
|
+ return false;
|
|
+
|
|
+ pr_debug("%s %#x will fallback - pi %d, src %pI4:%u dst %pI4:%u rcv_nxt %u from %pS\n",
|
|
+ __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index,
|
|
+ &inet_sk(sk)->inet_saddr, ntohs(inet_sk(sk)->inet_sport),
|
|
+ &inet_sk(sk)->inet_daddr, ntohs(inet_sk(sk)->inet_dport),
|
|
+ tp->rcv_nxt, __builtin_return_address(0));
|
|
+ if (!is_master_tp(tp)) {
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBACKSUB);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ mpcb->infinite_mapping_snd = 1;
|
|
+ mpcb->infinite_mapping_rcv = 1;
|
|
+ mpcb->infinite_rcv_seq = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp));
|
|
+ tp->mptcp->fully_established = 1;
|
|
+
|
|
+ mptcp_fallback_close(mpcb, sk);
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBACKINIT);
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static inline bool mptcp_v6_is_v4_mapped(const struct sock *sk)
|
|
+{
|
|
+ return sk->sk_family == AF_INET6 &&
|
|
+ ipv6_addr_type(&inet6_sk(sk)->saddr) == IPV6_ADDR_MAPPED;
|
|
+}
|
|
+
|
|
+/* We are in, or are about to enter, infinite mapping mode */
|
|
+static inline bool mptcp_in_infinite_mapping_weak(const struct mptcp_cb *mpcb)
|
|
+{
|
|
+ return mpcb->infinite_mapping_rcv ||
|
|
+ mpcb->infinite_mapping_snd ||
|
|
+ mpcb->send_infinite_mapping;
|
|
+}
|
|
+
|
|
+static inline bool mptcp_can_new_subflow(const struct sock *meta_sk)
|
|
+{
|
|
+ /* Has been removed from the tk-table. Thus, no new subflows.
|
|
+ *
|
|
+ * Check for close-state is necessary, because we may have been closed
|
|
+ * without passing by mptcp_close().
|
|
+ *
|
|
+ * When falling back, no new subflows are allowed either.
|
|
+ */
|
|
+ return meta_sk->sk_state != TCP_CLOSE &&
|
|
+ tcp_sk(meta_sk)->inside_tk_table &&
|
|
+ !tcp_sk(meta_sk)->mpcb->infinite_mapping_rcv &&
|
|
+ !tcp_sk(meta_sk)->mpcb->send_infinite_mapping;
|
|
+}
|
|
+
|
|
+static inline int mptcp_subflow_count(const struct mptcp_cb *mpcb)
|
|
+{
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ int i = 0;
|
|
+
|
|
+ mptcp_for_each_sub(mpcb, mptcp)
|
|
+ i++;
|
|
+
|
|
+ return i;
|
|
+}
|
|
+
|
|
+/* TCP and MPTCP mpc flag-depending functions */
+u16 mptcp_select_window(struct sock *sk);
+void mptcp_tcp_set_rto(struct sock *sk);
+
+#else /* CONFIG_MPTCP */
+#define mptcp_debug(fmt, args...) \
+ do { \
+ } while (0)
+
+static inline struct sock *mptcp_to_sock(const struct mptcp_tcp_sock *mptcp)
+{
+ return NULL;
+}
+
+#define mptcp_for_each_sub(__mpcb, __mptcp) \
+ if (0)
+
+#define MPTCP_INC_STATS(net, field) \
+ do { \
+ } while(0)
+
+#define MPTCP_DEC_STATS(net, field) \
+ do { \
+ } while(0)
+
+static inline bool mptcp_is_data_fin(const struct sk_buff *skb)
+{
+ return false;
+}
+static inline bool mptcp_is_data_seq(const struct sk_buff *skb)
+{
+ return false;
+}
+static inline struct sock *mptcp_meta_sk(const struct sock *sk)
+{
+ return NULL;
+}
+static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
+{
+ return NULL;
+}
+static inline int is_meta_sk(const struct sock *sk)
+{
+ return 0;
+}
+static inline int is_master_tp(const struct tcp_sock *tp)
+{
+ return 0;
+}
+static inline void mptcp_del_sock(const struct sock *sk) {}
+static inline void mptcp_update_metasocket(const struct sock *meta_sk) {}
+static inline void mptcp_reinject_data(struct sock *orig_sk, int clone_it) {}
+static inline void mptcp_update_sndbuf(const struct tcp_sock *tp) {}
+static inline void mptcp_clean_rtx_infinite(const struct sk_buff *skb,
+ const struct sock *sk) {}
+static inline void mptcp_sub_close(struct sock *sk, unsigned long delay) {}
+static inline void mptcp_set_rto(const struct sock *sk) {}
+static inline void mptcp_send_fin(const struct sock *meta_sk) {}
+static inline void mptcp_parse_options(const uint8_t *ptr, const int opsize,
+ struct mptcp_options_received *mopt,
+ const struct sk_buff *skb,
+ const struct tcp_sock *tp) {}
+static inline void mptcp_syn_options(const struct sock *sk,
+ struct tcp_out_options *opts,
+ unsigned *remaining) {}
+static inline void mptcp_synack_options(struct request_sock *req,
+ struct tcp_out_options *opts,
+ unsigned *remaining) {}
+
+static inline void mptcp_established_options(struct sock *sk,
+ struct sk_buff *skb,
+ struct tcp_out_options *opts,
+ unsigned *size) {}
+static inline void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
+ const struct tcp_out_options *opts,
+ struct sk_buff *skb) {}
+static inline void mptcp_close(struct sock *meta_sk, long timeout) {}
+static inline bool mptcp_doit(struct sock *sk)
+{
+ return false;
+}
+static inline int mptcp_check_req_fastopen(struct sock *child,
+ struct request_sock *req)
+{
+ return 1;
+}
+static inline int mptcp_check_req_master(const struct sock *sk,
+ const struct sock *child,
+ const struct request_sock *req,
+ const struct sk_buff *skb,
+ const struct mptcp_options_received *mopt,
+ int drop,
+ u32 tsoff)
+{
+ return 1;
+}
+static inline struct sock *mptcp_check_req_child(const struct sock *meta_sk,
+ const struct sock *child,
+ const struct request_sock *req,
+ struct sk_buff *skb,
+ const struct mptcp_options_received *mopt)
+{
+ return NULL;
+}
+static inline unsigned int mptcp_current_mss(struct sock *meta_sk)
+{
+ return 0;
+}
+static inline void mptcp_sub_close_passive(struct sock *sk) {}
+static inline bool mptcp_fallback_infinite(const struct sock *sk, int flag)
+{
+ return false;
+}
+static inline void mptcp_init_mp_opt(const struct mptcp_options_received *mopt) {}
+static inline void mptcp_prepare_for_backlog(struct sock *sk, struct sk_buff *skb) {}
+static inline bool mptcp_check_rtt(const struct tcp_sock *tp, int time)
+{
+ return false;
+}
+static inline int mptcp_check_snd_buf(const struct tcp_sock *tp)
+{
+ return 0;
+}
+static inline void mptcp_push_pending_frames(struct sock *meta_sk) {}
+static inline void mptcp_send_reset(const struct sock *sk) {}
+static inline void mptcp_sub_force_close_all(struct mptcp_cb *mpcb,
+ struct sock *except) {}
+static inline bool mptcp_handle_options(struct sock *sk,
+ const struct tcphdr *th,
+ struct sk_buff *skb)
+{
+ return false;
+}
+static inline void mptcp_reset_mopt(struct tcp_sock *tp) {}
+static inline void __init mptcp_init(void) {}
+static inline bool mptcp_can_sg(const struct sock *meta_sk)
+{
+ return false;
+}
+static inline unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk,
+ u32 mss_now, int large_allowed)
+{
+ return 0;
+}
+static inline void mptcp_destroy_sock(struct sock *sk) {}
+static inline int mptcp_rcv_synsent_state_process(struct sock *sk,
+ struct sock **skptr,
+ struct sk_buff *skb,
+ const struct mptcp_options_received *mopt)
+{
+ return 0;
+}
+static inline bool mptcp_can_sendpage(struct sock *sk)
+{
+ return false;
+}
+static inline int mptcp_init_tw_sock(struct sock *sk,
+ struct tcp_timewait_sock *tw)
+{
+ return 0;
+}
+static inline void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) {}
+static inline void mptcp_disconnect(struct sock *meta_sk) {}
+static inline void mptcp_tsq_flags(struct sock *sk) {}
+static inline void mptcp_tsq_sub_deferred(struct sock *meta_sk) {}
+static inline void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) {}
+static inline void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
+ const struct sk_buff *skb) {}
+static inline void mptcp_init_tcp_sock(struct sock *sk) {}
+static inline void mptcp_init_listen(struct sock *sk) {}
+static inline void mptcp_init_connect(struct sock *sk) {}
+static inline void mptcp_disable_static_key(void) {}
+static inline void mptcp_cookies_reqsk_init(struct request_sock *req,
+ struct mptcp_options_received *mopt,
+ struct sk_buff *skb) {}
+static inline void mptcp_mpcb_put(struct mptcp_cb *mpcb) {}
+static inline void mptcp_fin(struct sock *meta_sk) {}
+static inline bool mptcp_in_infinite_mapping_weak(const struct mptcp_cb *mpcb)
+{
+ return false;
+}
+static inline bool mptcp_can_new_subflow(const struct sock *meta_sk)
+{
+ return false;
+}
+
+#endif /* CONFIG_MPTCP */
+
+#endif /* _MPTCP_H */
diff --git a/include/net/mptcp_v4.h b/include/net/mptcp_v4.h
|
|
new file mode 100644
|
|
index 000000000000..c58d42b11f6a
|
|
--- /dev/null
|
|
+++ b/include/net/mptcp_v4.h
|
|
@@ -0,0 +1,76 @@
|
|
+/*
|
|
+ * MPTCP implementation
|
|
+ *
|
|
+ * Initial Design & Implementation:
|
|
+ * Sébastien Barré <sebastien.barre@uclouvain.be>
|
|
+ *
|
|
+ * Current Maintainer & Author:
|
|
+ * Christoph Paasch <christoph.paasch@uclouvain.be>
|
|
+ *
|
|
+ * Additional authors:
|
|
+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
|
|
+ * Gregory Detal <gregory.detal@uclouvain.be>
|
|
+ * Fabien Duchêne <fabien.duchene@uclouvain.be>
|
|
+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
|
|
+ * Lavkesh Lahngir <lavkesh51@gmail.com>
|
|
+ * Andreas Ripke <ripke@neclab.eu>
|
|
+ * Vlad Dogaru <vlad.dogaru@intel.com>
|
|
+ * Octavian Purdila <octavian.purdila@intel.com>
|
|
+ * John Ronan <jronan@tssg.org>
|
|
+ * Catalin Nicutar <catalin.nicutar@gmail.com>
|
|
+ * Brandon Heller <brandonh@stanford.edu>
|
|
+ *
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU General Public License
|
|
+ * as published by the Free Software Foundation; either version
|
|
+ * 2 of the License, or (at your option) any later version.
|
|
+ */
|
|
+
|
|
+#ifndef MPTCP_V4_H_
|
|
+#define MPTCP_V4_H_
|
|
+
|
|
+
|
|
+#include <linux/in.h>
|
|
+#include <linux/skbuff.h>
|
|
+#include <net/mptcp.h>
|
|
+#include <net/request_sock.h>
|
|
+#include <net/sock.h>
|
|
+
|
|
+extern struct request_sock_ops mptcp_request_sock_ops;
|
|
+extern const struct inet_connection_sock_af_ops mptcp_v4_specific;
|
|
+extern struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops;
|
|
+extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops;
|
|
+
|
|
+#ifdef CONFIG_MPTCP
|
|
+
|
|
+int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
|
|
+struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
|
|
+ const __be32 laddr, const struct net *net);
|
|
+int __mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
|
|
+ __be16 sport, struct mptcp_rem4 *rem,
|
|
+ struct sock **subsk);
|
|
+int mptcp_pm_v4_init(void);
|
|
+void mptcp_pm_v4_undo(void);
|
|
+u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport);
|
|
+u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
|
|
+ u32 seed);
|
|
+
|
|
+static inline int mptcp_init4_subsockets(struct sock *meta_sk,
|
|
+ const struct mptcp_loc4 *loc,
|
|
+ struct mptcp_rem4 *rem)
|
|
+{
|
|
+ return __mptcp_init4_subsockets(meta_sk, loc, 0, rem, NULL);
|
|
+}
|
|
+
|
|
+#else
|
|
+
|
|
+static inline int mptcp_v4_do_rcv(const struct sock *meta_sk,
|
|
+ const struct sk_buff *skb)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+#endif /* CONFIG_MPTCP */
|
|
+
|
|
+#endif /* MPTCP_V4_H_ */
|
|
diff --git a/include/net/mptcp_v6.h b/include/net/mptcp_v6.h
|
|
new file mode 100644
|
|
index 000000000000..93e8c87c2eb1
|
|
--- /dev/null
|
|
+++ b/include/net/mptcp_v6.h
|
|
@@ -0,0 +1,77 @@
|
|
+/*
|
|
+ * MPTCP implementation
|
|
+ *
|
|
+ * Initial Design & Implementation:
|
|
+ * Sébastien Barré <sebastien.barre@uclouvain.be>
|
|
+ *
|
|
+ * Current Maintainer & Author:
|
|
+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
|
|
+ *
|
|
+ * Additional authors:
|
|
+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
|
|
+ * Gregory Detal <gregory.detal@uclouvain.be>
|
|
+ * Fabien Duchêne <fabien.duchene@uclouvain.be>
|
|
+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
|
|
+ * Lavkesh Lahngir <lavkesh51@gmail.com>
|
|
+ * Andreas Ripke <ripke@neclab.eu>
|
|
+ * Vlad Dogaru <vlad.dogaru@intel.com>
|
|
+ * Octavian Purdila <octavian.purdila@intel.com>
|
|
+ * John Ronan <jronan@tssg.org>
|
|
+ * Catalin Nicutar <catalin.nicutar@gmail.com>
|
|
+ * Brandon Heller <brandonh@stanford.edu>
|
|
+ *
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU General Public License
|
|
+ * as published by the Free Software Foundation; either version
|
|
+ * 2 of the License, or (at your option) any later version.
|
|
+ */
|
|
+
|
|
+#ifndef _MPTCP_V6_H
|
|
+#define _MPTCP_V6_H
|
|
+
|
|
+#include <linux/in6.h>
|
|
+#include <net/if_inet6.h>
|
|
+
|
|
+#include <net/mptcp.h>
|
|
+
|
|
+
|
|
+#ifdef CONFIG_MPTCP
|
|
+extern const struct inet_connection_sock_af_ops mptcp_v6_mapped;
|
|
+extern const struct inet_connection_sock_af_ops mptcp_v6_specific;
|
|
+extern struct request_sock_ops mptcp6_request_sock_ops;
|
|
+extern struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops;
|
|
+extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops;
|
|
+
|
|
+int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
|
|
+struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
|
|
+ const struct in6_addr *laddr, const struct net *net);
|
|
+int __mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
|
|
+ __be16 sport, struct mptcp_rem6 *rem,
|
|
+ struct sock **subsk);
|
|
+int mptcp_pm_v6_init(void);
|
|
+void mptcp_pm_v6_undo(void);
|
|
+__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
|
|
+ __be16 sport, __be16 dport);
|
|
+u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
|
|
+ __be16 sport, __be16 dport, u32 seed);
|
|
+
|
|
+static inline int mptcp_init6_subsockets(struct sock *meta_sk,
|
|
+ const struct mptcp_loc6 *loc,
|
|
+ struct mptcp_rem6 *rem)
|
|
+{
|
|
+ return __mptcp_init6_subsockets(meta_sk, loc, 0, rem, NULL);
|
|
+}
|
|
+
|
|
+#else /* CONFIG_MPTCP */
|
|
+
|
|
+#define mptcp_v6_mapped ipv6_mapped
|
|
+
|
|
+static inline int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+#endif /* CONFIG_MPTCP */
|
|
+
|
|
+#endif /* _MPTCP_V6_H */
|
|
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
|
|
index 167e390ac9d4..7233acfcdb4d 100644
|
|
--- a/include/net/net_namespace.h
|
|
+++ b/include/net/net_namespace.h
|
|
@@ -19,6 +19,7 @@
|
|
#include <net/netns/packet.h>
|
|
#include <net/netns/ipv4.h>
|
|
#include <net/netns/ipv6.h>
|
|
+#include <net/netns/mptcp.h>
|
|
#include <net/netns/nexthop.h>
|
|
#include <net/netns/ieee802154_6lowpan.h>
|
|
#include <net/netns/sctp.h>
|
|
@@ -123,6 +124,9 @@ struct net {
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
struct netns_ipv6 ipv6;
|
|
#endif
|
|
+#if IS_ENABLED(CONFIG_MPTCP)
|
|
+ struct netns_mptcp mptcp;
|
|
+#endif
|
|
#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
|
|
struct netns_ieee802154_lowpan ieee802154_lowpan;
|
|
#endif
|
|
diff --git a/include/net/netns/mptcp.h b/include/net/netns/mptcp.h
|
|
new file mode 100644
|
|
index 000000000000..6680f3bbcfc8
|
|
--- /dev/null
|
|
+++ b/include/net/netns/mptcp.h
|
|
@@ -0,0 +1,52 @@
|
|
+/*
|
|
+ * MPTCP implementation - MPTCP namespace
|
|
+ *
|
|
+ * Initial Design & Implementation:
|
|
+ * Sébastien Barré <sebastien.barre@uclouvain.be>
|
|
+ *
|
|
+ * Current Maintainer:
|
|
+ * Christoph Paasch <christoph.paasch@uclouvain.be>
|
|
+ *
|
|
+ * Additional authors:
|
|
+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
|
|
+ * Gregory Detal <gregory.detal@uclouvain.be>
|
|
+ * Fabien Duchêne <fabien.duchene@uclouvain.be>
|
|
+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
|
|
+ * Lavkesh Lahngir <lavkesh51@gmail.com>
|
|
+ * Andreas Ripke <ripke@neclab.eu>
|
|
+ * Vlad Dogaru <vlad.dogaru@intel.com>
|
|
+ * Octavian Purdila <octavian.purdila@intel.com>
|
|
+ * John Ronan <jronan@tssg.org>
|
|
+ * Catalin Nicutar <catalin.nicutar@gmail.com>
|
|
+ * Brandon Heller <brandonh@stanford.edu>
|
|
+ *
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU General Public License
|
|
+ * as published by the Free Software Foundation; either version
|
|
+ * 2 of the License, or (at your option) any later version.
|
|
+ */
|
|
+
|
|
+#ifndef __NETNS_MPTCP_H__
|
|
+#define __NETNS_MPTCP_H__
|
|
+
|
|
+#include <linux/compiler.h>
|
|
+
|
|
+enum {
|
|
+ MPTCP_PM_FULLMESH = 0,
|
|
+ MPTCP_PM_MAX
|
|
+};
|
|
+
|
|
+struct mptcp_mib;
|
|
+
|
|
+struct netns_mptcp {
|
|
+ DEFINE_SNMP_STAT(struct mptcp_mib, mptcp_statistics);
|
|
+
|
|
+#ifdef CONFIG_PROC_FS
|
|
+ struct proc_dir_entry *proc_net_mptcp;
|
|
+#endif
|
|
+
|
|
+ void *path_managers[MPTCP_PM_MAX];
|
|
+};
|
|
+
|
|
+#endif /* __NETNS_MPTCP_H__ */
|
|
diff --git a/include/net/snmp.h b/include/net/snmp.h
|
|
index cb8ced4380a6..0aa0d10af2ce 100644
|
|
--- a/include/net/snmp.h
|
|
+++ b/include/net/snmp.h
|
|
@@ -86,7 +86,6 @@ struct icmpv6msg_mib_device {
|
|
atomic_long_t mibs[ICMP6MSG_MIB_MAX];
|
|
};
|
|
|
|
-
|
|
/* TCP */
|
|
#define TCP_MIB_MAX __TCP_MIB_MAX
|
|
struct tcp_mib {
|
|
diff --git a/include/net/sock.h b/include/net/sock.h
|
|
index 079b5f6f13d8..8ae33ecd9d0a 100644
|
|
--- a/include/net/sock.h
|
|
+++ b/include/net/sock.h
|
|
@@ -821,6 +821,7 @@ enum sock_flags {
|
|
SOCK_TXTIME,
|
|
SOCK_XDP, /* XDP is attached */
|
|
SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */
|
|
+ SOCK_MPTCP, /* MPTCP set on this socket */
|
|
};
|
|
|
|
#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
|
|
@@ -1133,6 +1134,7 @@ struct proto {
|
|
void (*unhash)(struct sock *sk);
|
|
void (*rehash)(struct sock *sk);
|
|
int (*get_port)(struct sock *sk, unsigned short snum);
|
|
+ void (*clear_sk)(struct sock *sk, int size);
|
|
|
|
/* Keeping track of sockets in use */
|
|
#ifdef CONFIG_PROC_FS
|
|
diff --git a/include/net/tcp.h b/include/net/tcp.h
|
|
index b914959cd2c6..b290be3e510c 100644
|
|
--- a/include/net/tcp.h
|
|
+++ b/include/net/tcp.h
|
|
@@ -182,6 +182,7 @@
|
|
#define TCPOPT_SACK 5 /* SACK Block */
|
|
#define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */
|
|
#define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */
|
|
+#define TCPOPT_MPTCP 30
|
|
#define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */
|
|
#define TCPOPT_EXP 254 /* Experimental */
|
|
/* Magic number to be after the option value for sharing TCP
|
|
@@ -238,6 +239,31 @@
|
|
*/
|
|
#define TFO_SERVER_WO_SOCKOPT1 0x400
|
|
|
|
+/* Flags from tcp_input.c for tcp_ack */
|
|
+#define FLAG_DATA 0x01 /* Incoming frame contained data. */
|
|
+#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
|
|
+#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
|
|
+#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
|
|
+#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
|
|
+#define FLAG_DATA_SACKED 0x20 /* New SACK. */
|
|
+#define FLAG_ECE 0x40 /* ECE in this ACK */
|
|
+#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */
|
|
+#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
|
|
+#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
|
|
+#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
|
|
+#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
|
|
+#define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */
|
|
+#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
|
|
+#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
|
|
+#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */
|
|
+#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */
|
|
+
|
|
+#define MPTCP_FLAG_DATA_ACKED 0x20000
|
|
+
|
|
+#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
|
|
+#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
|
|
+#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
|
|
+#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
|
|
|
|
/* sysctl variables for tcp */
|
|
extern int sysctl_tcp_max_orphans;
|
|
@@ -310,6 +336,98 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
|
|
#define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field)
|
|
#define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
|
|
|
|
+/**** START - Exports needed for MPTCP ****/
|
|
+extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
|
|
+extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops;
|
|
+
|
|
+struct mptcp_options_received;
|
|
+
|
|
+void tcp_cleanup_rbuf(struct sock *sk, int copied);
|
|
+int tcp_close_state(struct sock *sk);
|
|
+void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
|
|
+ const struct sk_buff *skb);
|
|
+int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib);
|
|
+void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb);
|
|
+int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
|
|
+ gfp_t gfp_mask);
|
|
+u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now);
|
|
+unsigned int tcp_mss_split_point(const struct sock *sk,
|
|
+ const struct sk_buff *skb,
|
|
+ unsigned int mss_now,
|
|
+ unsigned int max_segs,
|
|
+ int nonagle);
|
|
+bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
|
|
+ unsigned int cur_mss, int nonagle);
|
|
+bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
|
|
+ unsigned int cur_mss);
|
|
+unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb);
|
|
+int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now);
|
|
+int __pskb_trim_head(struct sk_buff *skb, int len);
|
|
+void tcp_queue_skb(struct sock *sk, struct sk_buff *skb);
|
|
+void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags);
|
|
+void tcp_reset(struct sock *sk);
|
|
+bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
|
|
+ const u32 ack_seq, const u32 nwin);
|
|
+bool tcp_urg_mode(const struct tcp_sock *tp);
|
|
+void tcp_ack_probe(struct sock *sk);
|
|
+void tcp_rearm_rto(struct sock *sk);
|
|
+int tcp_write_timeout(struct sock *sk);
|
|
+bool retransmits_timed_out(struct sock *sk,
|
|
+ unsigned int boundary,
|
|
+ unsigned int timeout);
|
|
+void tcp_write_err(struct sock *sk);
|
|
+void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr);
|
|
+void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
|
|
+ u64 prior_wstamp);
|
|
+void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now);
|
|
+
|
|
+void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
|
|
+ struct request_sock *req);
|
|
+void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb);
|
|
+struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb);
|
|
+void tcp_v4_reqsk_destructor(struct request_sock *req);
|
|
+
|
|
+void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
|
|
+ struct request_sock *req);
|
|
+void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb);
|
|
+struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb);
|
|
+int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
|
|
+int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
|
|
+void tcp_v6_destroy_sock(struct sock *sk);
|
|
+void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
|
|
+void tcp_v6_hash(struct sock *sk);
|
|
+struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb);
|
|
+struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
|
|
+ struct request_sock *req,
|
|
+ struct dst_entry *dst,
|
|
+ struct request_sock *req_unhash,
|
|
+ bool *own_req);
|
|
+void tcp_v6_reqsk_destructor(struct request_sock *req);
|
|
+
|
|
+unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
|
|
+ int large_allowed);
|
|
+u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb);
|
|
+void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, u32 prior_snd_una);
|
|
+
|
|
+void skb_clone_fraglist(struct sk_buff *skb);
|
|
+
|
|
+void inet_twsk_free(struct inet_timewait_sock *tw);
|
|
+int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb);
|
|
+/* These states need RST on ABORT according to RFC793 */
|
|
+static inline bool tcp_need_reset(int state)
|
|
+{
|
|
+ return (1 << state) &
|
|
+ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
|
|
+ TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
|
|
+}
|
|
+
|
|
+int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
|
|
+ bool *fragstolen);
|
|
+void tcp_ofo_queue(struct sock *sk);
|
|
+void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb);
|
|
+int linear_payload_sz(bool first_skb);
|
|
+/**** END - Exports needed for MPTCP ****/
|
|
+
|
|
void tcp_tasklet_init(void);
|
|
|
|
int tcp_v4_err(struct sk_buff *skb, u32);
|
|
@@ -411,7 +529,9 @@ int tcp_mmap(struct file *file, struct socket *sock,
|
|
#endif
|
|
void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
|
|
struct tcp_options_received *opt_rx,
|
|
- int estab, struct tcp_fastopen_cookie *foc);
|
|
+ struct mptcp_options_received *mopt_rx,
|
|
+ int estab, struct tcp_fastopen_cookie *foc,
|
|
+ struct tcp_sock *tp);
|
|
const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
|
|
|
|
/*
|
|
@@ -430,6 +550,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
|
|
|
|
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb);
|
|
void tcp_v4_mtu_reduced(struct sock *sk);
|
|
+void tcp_v6_mtu_reduced(struct sock *sk);
|
|
void tcp_req_err(struct sock *sk, u32 seq, bool abort);
|
|
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
|
|
struct sock *tcp_create_openreq_child(const struct sock *sk,
|
|
@@ -453,6 +574,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
|
|
struct request_sock *req,
|
|
struct tcp_fastopen_cookie *foc,
|
|
enum tcp_synack_type synack_type);
|
|
+void tcp_reset_vars(struct sock *sk);
|
|
int tcp_disconnect(struct sock *sk, int flags);
|
|
|
|
void tcp_finish_connect(struct sock *sk, struct sk_buff *skb);
|
|
@@ -462,6 +584,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
|
|
/* From syncookies.c */
|
|
struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
|
|
struct request_sock *req,
|
|
+ const struct mptcp_options_received *mopt,
|
|
struct dst_entry *dst, u32 tsoff);
|
|
int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
|
|
u32 cookie);
|
|
@@ -547,7 +670,8 @@ static inline u32 tcp_cookie_time(void)
|
|
|
|
u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
|
|
u16 *mssp);
|
|
-__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
|
|
+__u32 cookie_v4_init_sequence(struct request_sock *req, const struct sock *sk,
|
|
+ const struct sk_buff *skb, __u16 *mss);
|
|
u64 cookie_init_timestamp(struct request_sock *req);
|
|
bool cookie_timestamp_decode(const struct net *net,
|
|
struct tcp_options_received *opt);
|
|
@@ -561,7 +685,8 @@ int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th,
|
|
|
|
u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
|
|
const struct tcphdr *th, u16 *mssp);
|
|
-__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
|
|
+__u32 cookie_v6_init_sequence(struct request_sock *req, const struct sock *sk,
|
|
+ const struct sk_buff *skb, __u16 *mss);
|
|
#endif
|
|
/* tcp_output.c */
|
|
|
|
@@ -597,10 +722,16 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
|
|
void tcp_skb_collapse_tstamp(struct sk_buff *skb,
|
|
const struct sk_buff *next_skb);
|
|
|
|
+u16 tcp_select_window(struct sock *sk);
|
|
+bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
|
|
+ int push_one, gfp_t gfp);
|
|
+
|
|
/* tcp_input.c */
|
|
void tcp_rearm_rto(struct sock *sk);
|
|
void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
|
|
void tcp_reset(struct sock *sk);
|
|
+void tcp_set_rto(struct sock *sk);
|
|
+bool tcp_should_expand_sndbuf(const struct sock *sk);
|
|
void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
|
|
void tcp_fin(struct sock *sk);
|
|
|
|
@@ -645,7 +776,7 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
|
|
}
|
|
|
|
/* tcp.c */
|
|
-void tcp_get_info(struct sock *, struct tcp_info *);
|
|
+void tcp_get_info(struct sock *, struct tcp_info *, bool no_lock);
|
|
|
|
/* Read 'sendfile()'-style from a TCP socket */
|
|
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
|
|
@@ -723,7 +854,7 @@ static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
|
|
* Rcv_nxt can be after the window if our peer push more data
|
|
* than the offered window.
|
|
*/
|
|
-static inline u32 tcp_receive_window(const struct tcp_sock *tp)
|
|
+static inline u32 tcp_receive_window_now(const struct tcp_sock *tp)
|
|
{
|
|
s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;
|
|
|
|
@@ -732,6 +863,32 @@ static inline u32 tcp_receive_window(const struct tcp_sock *tp)
|
|
return (u32) win;
|
|
}
|
|
|
|
+/* right edge only moves forward, even if window shrinks due
|
|
+ * to mptcp meta
|
|
+ */
|
|
+static inline void tcp_update_rcv_right_edge(struct tcp_sock *tp)
|
|
+{
|
|
+ if (after(tp->rcv_wup + tp->rcv_wnd, tp->rcv_right_edge))
|
|
+ tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd;
|
|
+}
|
|
+
|
|
+/* Compute receive window which will never shrink. The way MPTCP handles
|
|
+ * the receive window can cause the effective right edge to shrink,
|
|
+ * causing valid segments to become out of window.
|
|
+ * This function should be used when checking if a segment is valid for
|
|
+ * the max right edge announced.
|
|
+ */
|
|
+static inline u32 tcp_receive_window_no_shrink(const struct tcp_sock *tp)
|
|
+{
|
|
+ s32 win = tp->rcv_right_edge - tp->rcv_nxt;
|
|
+
|
|
+ win = max_t(s32, win, tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt);
|
|
+
|
|
+ if (unlikely(win < 0))
|
|
+ win = 0;
|
|
+ return (u32) win;
|
|
+}
|
|
+
|
|
/* Choose a new window, without checks for shrinking, and without
|
|
* scaling applied to the result. The caller does these things
|
|
* if necessary. This is a "raw" window selection.
|
|
@@ -829,6 +986,12 @@ struct tcp_skb_cb {
|
|
u16 tcp_gso_size;
|
|
};
|
|
};
|
|
+
|
|
+#ifdef CONFIG_MPTCP
|
|
+ __u8 mptcp_flags; /* flags for the MPTCP layer */
|
|
+ __u8 dss_off; /* Number of 4-byte words until
|
|
+ * seq-number */
|
|
+#endif
|
|
__u8 tcp_flags; /* TCP header flags. (tcp[13]) */
|
|
|
|
__u8 sacked; /* State flags for SACK. */
|
|
@@ -847,6 +1010,14 @@ struct tcp_skb_cb {
|
|
has_rxtstamp:1, /* SKB has a RX timestamp */
|
|
unused:5;
|
|
__u32 ack_seq; /* Sequence number ACK'd */
|
|
+
|
|
+#ifdef CONFIG_MPTCP
|
|
+ union { /* For MPTCP outgoing frames */
|
|
+ __u32 path_mask; /* paths that tried to send this skb */
|
|
+ __u32 dss[6]; /* DSS options */
|
|
+ };
|
|
+#endif
|
|
+
|
|
union {
|
|
struct {
|
|
/* There is space for up to 24 bytes */
|
|
@@ -1088,6 +1259,8 @@ struct tcp_congestion_ops {
|
|
int tcp_set_allowed_congestion_control(char *allowed);
|
|
int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
|
|
bool reinit, bool cap_net_admin);
|
|
+int __tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
|
|
+ bool reinit, bool cap_net_admin);
|
|
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
|
|
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
|
|
|
|
@@ -1389,6 +1562,19 @@ static inline int tcp_win_from_space(const struct sock *sk, int space)
|
|
space - (space>>tcp_adv_win_scale);
|
|
}
|
|
|
|
+#ifdef CONFIG_MPTCP
|
|
+extern struct static_key mptcp_static_key;
|
|
+static inline bool mptcp(const struct tcp_sock *tp)
|
|
+{
|
|
+ return static_key_false(&mptcp_static_key) && tp->mpc;
|
|
+}
|
|
+#else
|
|
+static inline bool mptcp(const struct tcp_sock *tp)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+#endif
|
|
+
|
|
/* Note: caller must be prepared to deal with negative returns */
|
|
static inline int tcp_space(const struct sock *sk)
|
|
{
|
|
@@ -1981,6 +2167,30 @@ struct tcp_sock_af_ops {
|
|
#endif
|
|
};
|
|
|
|
+/* TCP/MPTCP-specific functions */
|
|
+struct tcp_sock_ops {
|
|
+ u32 (*__select_window)(struct sock *sk);
|
|
+ u16 (*select_window)(struct sock *sk);
|
|
+ void (*select_initial_window)(const struct sock *sk, int __space,
|
|
+ __u32 mss, __u32 *rcv_wnd,
|
|
+ __u32 *window_clamp, int wscale_ok,
|
|
+ __u8 *rcv_wscale, __u32 init_rcv_wnd);
|
|
+ void (*init_buffer_space)(struct sock *sk);
|
|
+ void (*set_rto)(struct sock *sk);
|
|
+ bool (*should_expand_sndbuf)(const struct sock *sk);
|
|
+ void (*send_fin)(struct sock *sk);
|
|
+ bool (*write_xmit)(struct sock *sk, unsigned int mss_now, int nonagle,
|
|
+ int push_one, gfp_t gfp);
|
|
+ void (*send_active_reset)(struct sock *sk, gfp_t priority);
|
|
+ int (*write_wakeup)(struct sock *sk, int mib);
|
|
+ void (*retransmit_timer)(struct sock *sk);
|
|
+ void (*time_wait)(struct sock *sk, int state, int timeo);
|
|
+ void (*cleanup_rbuf)(struct sock *sk, int copied);
|
|
+ int (*set_cong_ctrl)(struct sock *sk, const char *name, bool load,
|
|
+ bool reinit, bool cap_net_admin);
|
|
+};
|
|
+extern const struct tcp_sock_ops tcp_specific;
|
|
+
|
|
struct tcp_request_sock_ops {
|
|
u16 mss_clamp;
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
@@ -1991,12 +2201,13 @@ struct tcp_request_sock_ops {
|
|
const struct sock *sk,
|
|
const struct sk_buff *skb);
|
|
#endif
|
|
- void (*init_req)(struct request_sock *req,
|
|
- const struct sock *sk_listener,
|
|
- struct sk_buff *skb);
|
|
+ int (*init_req)(struct request_sock *req,
|
|
+ const struct sock *sk_listener,
|
|
+ struct sk_buff *skb,
|
|
+ bool want_cookie);
|
|
#ifdef CONFIG_SYN_COOKIES
|
|
- __u32 (*cookie_init_seq)(const struct sk_buff *skb,
|
|
- __u16 *mss);
|
|
+ __u32 (*cookie_init_seq)(struct request_sock *req, const struct sock *sk,
|
|
+ const struct sk_buff *skb, __u16 *mss);
|
|
#endif
|
|
struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl,
|
|
const struct request_sock *req);
|
|
@@ -2010,15 +2221,17 @@ struct tcp_request_sock_ops {
|
|
|
|
#ifdef CONFIG_SYN_COOKIES
|
|
static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
|
|
+ struct request_sock *req,
|
|
const struct sock *sk, struct sk_buff *skb,
|
|
__u16 *mss)
|
|
{
|
|
tcp_synq_overflow(sk);
|
|
__NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
|
|
- return ops->cookie_init_seq(skb, mss);
|
|
+ return ops->cookie_init_seq(req, sk, skb, mss);
|
|
}
|
|
#else
|
|
static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
|
|
+ struct request_sock *req,
|
|
const struct sock *sk, struct sk_buff *skb,
|
|
__u16 *mss)
|
|
{
|
|
diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h
|
|
index cc00118acca1..11084091e798 100644
|
|
--- a/include/net/tcp_states.h
|
|
+++ b/include/net/tcp_states.h
|
|
@@ -22,6 +22,7 @@ enum {
|
|
TCP_LISTEN,
|
|
TCP_CLOSING, /* Now a valid state */
|
|
TCP_NEW_SYN_RECV,
|
|
+ TCP_RST_WAIT,
|
|
|
|
TCP_MAX_STATES /* Leave at the end! */
|
|
};
|
|
@@ -43,6 +44,7 @@ enum {
|
|
TCPF_LISTEN = (1 << TCP_LISTEN),
|
|
TCPF_CLOSING = (1 << TCP_CLOSING),
|
|
TCPF_NEW_SYN_RECV = (1 << TCP_NEW_SYN_RECV),
|
|
+ TCPF_RST_WAIT = (1 << TCP_RST_WAIT),
|
|
};
|
|
|
|
#endif /* _LINUX_TCP_STATES_H */
|
|
diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h
|
|
index a8f6020f1196..5e70b086fdfb 100644
|
|
--- a/include/net/transp_v6.h
|
|
+++ b/include/net/transp_v6.h
|
|
@@ -58,6 +58,8 @@ void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp,
|
|
|
|
/* address family specific functions */
|
|
extern const struct inet_connection_sock_af_ops ipv4_specific;
|
|
+extern const struct inet_connection_sock_af_ops ipv6_mapped;
|
|
+extern const struct inet_connection_sock_af_ops ipv6_specific;
|
|
|
|
void inet6_destroy_sock(struct sock *sk);
|
|
|
|
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
|
|
index cf97f6339acb..cf48dc87a734 100644
|
|
--- a/include/trace/events/tcp.h
|
|
+++ b/include/trace/events/tcp.h
|
|
@@ -10,6 +10,7 @@
|
|
#include <linux/tracepoint.h>
|
|
#include <net/ipv6.h>
|
|
#include <net/tcp.h>
|
|
+#include <net/mptcp.h>
|
|
#include <linux/sock_diag.h>
|
|
|
|
#define TP_STORE_V4MAPPED(__entry, saddr, daddr) \
|
|
@@ -181,6 +182,13 @@
|
|
TP_ARGS(sk)
|
|
);
|
|
|
|
+DEFINE_EVENT(tcp_event_sk_skb, mptcp_retransmit,
|
|
+
|
|
+ TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
|
|
+
|
|
+ TP_ARGS(sk, skb)
|
|
+);
|
|
+
|
|
TRACE_EVENT(tcp_retransmit_synack,
|
|
|
|
TP_PROTO(const struct sock *sk, const struct request_sock *req),
|
|
@@ -248,6 +256,7 @@
|
|
__field(__u32, srtt)
|
|
__field(__u32, rcv_wnd)
|
|
__field(__u64, sock_cookie)
|
|
+ __field(__u8, mptcp)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
@@ -274,13 +283,15 @@
|
|
__entry->ssthresh = tcp_current_ssthresh(sk);
|
|
__entry->srtt = tp->srtt_us >> 3;
|
|
__entry->sock_cookie = sock_gen_cookie(sk);
|
|
+ __entry->mptcp = mptcp(tp) ? tp->mptcp->path_index : 0;
|
|
),
|
|
|
|
- TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx",
|
|
+ TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx mptcp=%d",
|
|
__entry->saddr, __entry->daddr, __entry->mark,
|
|
__entry->data_len, __entry->snd_nxt, __entry->snd_una,
|
|
__entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd,
|
|
- __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie)
|
|
+ __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie,
|
|
+ __entry->mptcp)
|
|
);
|
|
|
|
#endif /* _TRACE_TCP_H */
|
|
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
|
|
index 63038eb23560..7150eb62db86 100644
|
|
--- a/include/uapi/linux/bpf.h
|
|
+++ b/include/uapi/linux/bpf.h
|
|
@@ -3438,6 +3438,7 @@ enum {
|
|
BPF_TCP_LISTEN,
|
|
BPF_TCP_CLOSING, /* Now a valid state */
|
|
BPF_TCP_NEW_SYN_RECV,
|
|
+ BPF_TCP_RST_WAIT,
|
|
|
|
BPF_TCP_MAX_STATES /* Leave at the end! */
|
|
};
|
|
diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h
|
|
index 7fea0fd7d6f5..7255e08393db 100644
|
|
--- a/include/uapi/linux/if.h
|
|
+++ b/include/uapi/linux/if.h
|
|
@@ -132,6 +132,9 @@ enum net_device_flags {
|
|
#define IFF_ECHO IFF_ECHO
|
|
#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO */
|
|
|
|
+#define IFF_NOMULTIPATH 0x80000 /* Disable for MPTCP */
|
|
+#define IFF_MPBACKUP 0x100000 /* Use as backup path for MPTCP */
|
|
+
|
|
#define IFF_VOLATILE (IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\
|
|
IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT)
|
|
|
|
diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h
|
|
index 60e1241d4b77..ff6185b1d79f 100644
|
|
--- a/include/uapi/linux/in.h
|
|
+++ b/include/uapi/linux/in.h
|
|
@@ -76,6 +76,8 @@ enum {
|
|
#define IPPROTO_MPLS IPPROTO_MPLS
|
|
IPPROTO_RAW = 255, /* Raw IP packets */
|
|
#define IPPROTO_RAW IPPROTO_RAW
|
|
+ IPPROTO_MPTCP = 262, /* Multipath TCP connection */
|
|
+#define IPPROTO_MPTCP IPPROTO_MPTCP
|
|
IPPROTO_MAX
|
|
};
|
|
#endif
|
|
diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
|
|
new file mode 100644
|
|
index 000000000000..02078c80c846
|
|
--- /dev/null
|
|
+++ b/include/uapi/linux/mptcp.h
|
|
@@ -0,0 +1,151 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
|
|
+/*
|
|
+ * Netlink API for Multipath TCP
|
|
+ *
|
|
+ * Author: Gregory Detal <gregory.detal@tessares.net>
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU General Public License
|
|
+ * as published by the Free Software Foundation; either version
|
|
+ * 2 of the License, or (at your option) any later version.
|
|
+ */
|
|
+
|
|
+#ifndef _LINUX_MPTCP_H
|
|
+#define _LINUX_MPTCP_H
|
|
+
|
|
+#define MPTCP_GENL_NAME "mptcp"
|
|
+#define MPTCP_GENL_EV_GRP_NAME "mptcp_events"
|
|
+#define MPTCP_GENL_CMD_GRP_NAME "mptcp_commands"
|
|
+#define MPTCP_GENL_VER 0x1
|
|
+
|
|
+/*
|
|
+ * ATTR types defined for MPTCP
|
|
+ */
|
|
+enum {
|
|
+ MPTCP_ATTR_UNSPEC = 0,
|
|
+
|
|
+ MPTCP_ATTR_TOKEN, /* u32 */
|
|
+ MPTCP_ATTR_FAMILY, /* u16 */
|
|
+ MPTCP_ATTR_LOC_ID, /* u8 */
|
|
+ MPTCP_ATTR_REM_ID, /* u8 */
|
|
+ MPTCP_ATTR_SADDR4, /* u32 */
|
|
+ MPTCP_ATTR_SADDR6, /* struct in6_addr */
|
|
+ MPTCP_ATTR_DADDR4, /* u32 */
|
|
+ MPTCP_ATTR_DADDR6, /* struct in6_addr */
|
|
+ MPTCP_ATTR_SPORT, /* u16 */
|
|
+ MPTCP_ATTR_DPORT, /* u16 */
|
|
+ MPTCP_ATTR_BACKUP, /* u8 */
|
|
+ MPTCP_ATTR_ERROR, /* u8 */
|
|
+ MPTCP_ATTR_FLAGS, /* u16 */
|
|
+ MPTCP_ATTR_TIMEOUT, /* u32 */
|
|
+ MPTCP_ATTR_IF_IDX, /* s32 */
|
|
+
|
|
+ __MPTCP_ATTR_AFTER_LAST
|
|
+};
|
|
+
|
|
+#define MPTCP_ATTR_MAX (__MPTCP_ATTR_AFTER_LAST - 1)
|
|
+
|
|
+/*
|
|
+ * Events generated by MPTCP:
|
|
+ * - MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6,
|
|
+ * sport, dport
|
|
+ * A new connection has been created. It is a good time to allocate
|
|
+ * memory and send ADD_ADDR if needed. Depending on the traffic-patterns
|
|
+ * it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent.
|
|
+ *
|
|
+ * - MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6,
|
|
+ * sport, dport
|
|
+ * A connection is established (can start new subflows).
|
|
+ *
|
|
+ * - MPTCP_EVENT_CLOSED: token
|
|
+ * A connection has stopped.
|
|
+ *
|
|
+ * - MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport]
|
|
+ * A new address has been announced by the peer.
|
|
+ *
|
|
+ * - MPTCP_EVENT_REMOVED: token, rem_id
|
|
+ * An address has been lost by the peer.
|
|
+ *
|
|
+ * - MPTCP_EVENT_SUB_ESTABLISHED: token, family, loc_id, rem_id,
|
|
+ * saddr4 | saddr6, daddr4 | daddr6, sport,
|
|
+ * dport, backup, if_idx [, error]
|
|
+ * A new subflow has been established. 'error' should not be set.
|
|
+ *
|
|
+ * - MPTCP_EVENT_SUB_CLOSED: token, family, loc_id, rem_id, saddr4 | saddr6,
|
|
+ * daddr4 | daddr6, sport, dport, backup, if_idx
|
|
+ * [, error]
|
|
+ * A subflow has been closed. An error (copy of sk_err) could be set if an
|
|
+ * error has been detected for this subflow.
|
|
+ *
|
|
+ * - MPTCP_EVENT_SUB_PRIORITY: token, family, loc_id, rem_id, saddr4 | saddr6,
|
|
+ * daddr4 | daddr6, sport, dport, backup, if_idx
|
|
+ * [, error]
|
|
+ * The priority of a subflow has changed. 'error' should not be set.
|
|
+ *
|
|
+ * Commands for MPTCP:
|
|
+ * - MPTCP_CMD_ANNOUNCE: token, loc_id, family, saddr4 | saddr6 [, sport]
|
|
+ * Announce a new address to the peer.
|
|
+ *
|
|
+ * - MPTCP_CMD_REMOVE: token, loc_id
|
|
+ * Announce that an address has been lost to the peer.
|
|
+ *
|
|
+ * - MPTCP_CMD_SUB_CREATE: token, family, loc_id, rem_id, daddr4 | daddr6,
|
|
+ * dport [, saddr4 | saddr6, sport, backup, if_idx]
|
|
+ * Create a new subflow.
|
|
+ *
|
|
+ * - MPTCP_CMD_SUB_DESTROY: token, family, saddr4 | saddr6, daddr4 | daddr6,
|
|
+ * sport, dport
|
|
+ * Close a subflow.
|
|
+ *
|
|
+ * - MPTCP_CMD_SUB_PRIORITY: token, family, saddr4 | saddr6, daddr4 | daddr6,
|
|
+ * sport, dport, backup
|
|
+ * Change the priority of a subflow.
|
|
+ *
|
|
+ * - MPTCP_CMD_SET_FILTER: flags
|
|
+ * Set the filter on events. Set MPTCPF_* flags to only receive specific
|
|
+ * events. Default is to receive all events.
|
|
+ *
|
|
+ * - MPTCP_CMD_EXIST: token
|
|
+ * Check if this token is linked to an existing socket.
|
|
+ */
|
|
+enum {
|
|
+ MPTCP_CMD_UNSPEC = 0,
|
|
+
|
|
+ MPTCP_EVENT_CREATED,
|
|
+ MPTCP_EVENT_ESTABLISHED,
|
|
+ MPTCP_EVENT_CLOSED,
|
|
+
|
|
+ MPTCP_CMD_ANNOUNCE,
|
|
+ MPTCP_CMD_REMOVE,
|
|
+ MPTCP_EVENT_ANNOUNCED,
|
|
+ MPTCP_EVENT_REMOVED,
|
|
+
|
|
+ MPTCP_CMD_SUB_CREATE,
|
|
+ MPTCP_CMD_SUB_DESTROY,
|
|
+ MPTCP_EVENT_SUB_ESTABLISHED,
|
|
+ MPTCP_EVENT_SUB_CLOSED,
|
|
+
|
|
+ MPTCP_CMD_SUB_PRIORITY,
|
|
+ MPTCP_EVENT_SUB_PRIORITY,
|
|
+
|
|
+ MPTCP_CMD_SET_FILTER,
|
|
+
|
|
+ MPTCP_CMD_EXIST,
|
|
+
|
|
+ __MPTCP_CMD_AFTER_LAST
|
|
+};
|
|
+
|
|
+#define MPTCP_CMD_MAX (__MPTCP_CMD_AFTER_LAST - 1)
|
|
+
|
|
+enum {
|
|
+ MPTCPF_EVENT_CREATED = (1 << 1),
|
|
+ MPTCPF_EVENT_ESTABLISHED = (1 << 2),
|
|
+ MPTCPF_EVENT_CLOSED = (1 << 3),
|
|
+ MPTCPF_EVENT_ANNOUNCED = (1 << 4),
|
|
+ MPTCPF_EVENT_REMOVED = (1 << 5),
|
|
+ MPTCPF_EVENT_SUB_ESTABLISHED = (1 << 6),
|
|
+ MPTCPF_EVENT_SUB_CLOSED = (1 << 7),
|
|
+ MPTCPF_EVENT_SUB_PRIORITY = (1 << 8),
|
|
+};
|
|
+
|
|
+#endif /* _LINUX_MPTCP_H */
|
|
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
|
|
index 81e697978e8b..09ef515261d2 100644
|
|
--- a/include/uapi/linux/tcp.h
|
|
+++ b/include/uapi/linux/tcp.h
|
|
@@ -18,9 +18,15 @@
|
|
#ifndef _UAPI_LINUX_TCP_H
|
|
#define _UAPI_LINUX_TCP_H
|
|
|
|
-#include <linux/types.h>
|
|
+#ifndef __KERNEL__
|
|
+#include <sys/socket.h>
|
|
+#endif
|
|
+
|
|
#include <asm/byteorder.h>
|
|
+#include <linux/in.h>
|
|
+#include <linux/in6.h>
|
|
#include <linux/socket.h>
|
|
+#include <linux/types.h>
|
|
|
|
struct tcphdr {
|
|
__be16 source;
|
|
@@ -134,6 +140,13 @@ enum {
|
|
#define TCP_REPAIR_OFF 0
|
|
#define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */
|
|
|
|
+#define MPTCP_ENABLED 42
|
|
+#define MPTCP_SCHEDULER 43
|
|
+#define MPTCP_PATH_MANAGER 44
|
|
+#define MPTCP_INFO 45
|
|
+
|
|
+#define MPTCP_INFO_FLAG_SAVE_MASTER 0x01
|
|
+
|
|
struct tcp_repair_opt {
|
|
__u32 opt_code;
|
|
__u32 opt_val;
|
|
@@ -305,6 +318,53 @@ enum {
|
|
TCP_NLA_SRTT, /* smoothed RTT in usecs */
|
|
};
|
|
|
|
+struct mptcp_meta_info {
|
|
+ __u8 mptcpi_state;
|
|
+ __u8 mptcpi_retransmits;
|
|
+ __u8 mptcpi_probes;
|
|
+ __u8 mptcpi_backoff;
|
|
+
|
|
+ __u32 mptcpi_rto;
|
|
+ __u32 mptcpi_unacked;
|
|
+
|
|
+ /* Times. */
|
|
+ __u32 mptcpi_last_data_sent;
|
|
+ __u32 mptcpi_last_data_recv;
|
|
+ __u32 mptcpi_last_ack_recv;
|
|
+
|
|
+ __u32 mptcpi_total_retrans;
|
|
+
|
|
+ __u64 mptcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
|
|
+ __u64 mptcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
|
|
+};
|
|
+
|
|
+struct mptcp_sub_info {
|
|
+ union {
|
|
+ struct sockaddr src;
|
|
+ struct sockaddr_in src_v4;
|
|
+ struct sockaddr_in6 src_v6;
|
|
+ };
|
|
+
|
|
+ union {
|
|
+ struct sockaddr dst;
|
|
+ struct sockaddr_in dst_v4;
|
|
+ struct sockaddr_in6 dst_v6;
|
|
+ };
|
|
+};
|
|
+
|
|
+struct mptcp_info {
|
|
+ __u32 tcp_info_len; /* Length of each struct tcp_info in subflows pointer */
|
|
+ __u32 sub_len; /* Total length of memory pointed to by subflows pointer */
|
|
+ __u32 meta_len; /* Length of memory pointed to by meta_info */
|
|
+ __u32 sub_info_len; /* Length of each struct mptcp_sub_info in subflow_info pointer */
|
|
+ __u32 total_sub_info_len; /* Total length of memory pointed to by subflow_info */
|
|
+
|
|
+ struct mptcp_meta_info *meta_info;
|
|
+ struct tcp_info *initial;
|
|
+ struct tcp_info *subflows; /* Pointer to array of tcp_info structs */
|
|
+ struct mptcp_sub_info *subflow_info;
|
|
+};
|
|
+
|
|
/* for TCP_MD5SIG socket option */
|
|
#define TCP_MD5SIG_MAXKEYLEN 80
|
|
|
|
diff --git a/net/Kconfig b/net/Kconfig
|
|
index 0b2fecc83452..66f9158a3040 100644
|
|
--- a/net/Kconfig
|
|
+++ b/net/Kconfig
|
|
@@ -94,6 +94,7 @@ if INET
|
|
source "net/ipv4/Kconfig"
|
|
source "net/ipv6/Kconfig"
|
|
source "net/netlabel/Kconfig"
|
|
+source "net/mptcp/Kconfig"
|
|
|
|
endif # if INET
|
|
|
|
diff --git a/net/Makefile b/net/Makefile
|
|
index 449fc0b221f8..08683343642e 100644
|
|
--- a/net/Makefile
|
|
+++ b/net/Makefile
|
|
@@ -20,6 +20,7 @@ obj-$(CONFIG_TLS) += tls/
|
|
obj-$(CONFIG_XFRM) += xfrm/
|
|
obj-$(CONFIG_UNIX_SCM) += unix/
|
|
obj-$(CONFIG_NET) += ipv6/
|
|
+obj-$(CONFIG_MPTCP) += mptcp/
|
|
obj-$(CONFIG_BPFILTER) += bpfilter/
|
|
obj-$(CONFIG_PACKET) += packet/
|
|
obj-$(CONFIG_NET_KEY) += key/
|
|
diff --git a/net/core/dev.c b/net/core/dev.c
|
|
index a03036456221..aebb337662c3 100644
|
|
--- a/net/core/dev.c
|
|
+++ b/net/core/dev.c
|
|
@@ -7892,7 +7892,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags,
|
|
|
|
dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
|
|
IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
|
|
- IFF_AUTOMEDIA)) |
|
|
+ IFF_AUTOMEDIA | IFF_NOMULTIPATH | IFF_MPBACKUP)) |
|
|
(dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
|
|
IFF_ALLMULTI));
|
|
|
|
diff --git a/net/core/filter.c b/net/core/filter.c
|
|
index 5ebc973ed4c5..516fc8689088 100644
|
|
--- a/net/core/filter.c
|
|
+++ b/net/core/filter.c
|
|
@@ -73,6 +73,7 @@
|
|
#include <net/lwtunnel.h>
|
|
#include <net/ipv6_stubs.h>
|
|
#include <net/bpf_sk_storage.h>
|
|
+#include <net/mptcp.h>
|
|
|
|
/**
|
|
* sk_filter_trim_cap - run a packet through a socket filter
|
|
@@ -4280,6 +4281,19 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
|
|
if (sk->sk_mark != val) {
|
|
sk->sk_mark = val;
|
|
sk_dst_reset(sk);
|
|
+
|
|
+ if (is_meta_sk(sk)) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (val != sk_it->sk_mark) {
|
|
+ sk_it->sk_mark = val;
|
|
+ sk_dst_reset(sk_it);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
}
|
|
break;
|
|
default:
|
|
@@ -4302,6 +4316,14 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
|
|
if (val == -1)
|
|
val = 0;
|
|
inet->tos = val;
|
|
+
|
|
+ /* Update TOS on mptcp subflow */
|
|
+ if (is_meta_sk(sk)) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp)
|
|
+ inet_sk(mptcp_to_sock(mptcp))->tos = val;
|
|
+ }
|
|
}
|
|
break;
|
|
default:
|
|
@@ -4324,6 +4346,17 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
|
|
if (val == -1)
|
|
val = 0;
|
|
np->tclass = val;
|
|
+
|
|
+ if (is_meta_sk(sk)) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (sk_it->sk_family == AF_INET6)
|
|
+ inet6_sk(sk_it)->tclass = val;
|
|
+ }
|
|
+ }
|
|
}
|
|
break;
|
|
default:
|
|
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
|
|
index 283ddb2dbc7d..8f526a0d1912 100644
|
|
--- a/net/core/net-traces.c
|
|
+++ b/net/core/net-traces.c
|
|
@@ -60,3 +60,5 @@
|
|
EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
|
|
|
|
EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset);
|
|
+
|
|
+EXPORT_TRACEPOINT_SYMBOL_GPL(mptcp_retransmit);
|
|
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
|
|
index ac083685214e..62bf97b4d5de 100644
|
|
--- a/net/core/skbuff.c
|
|
+++ b/net/core/skbuff.c
|
|
@@ -582,7 +582,7 @@ static inline void skb_drop_fraglist(struct sk_buff *skb)
|
|
skb_drop_list(&skb_shinfo(skb)->frag_list);
|
|
}
|
|
|
|
-static void skb_clone_fraglist(struct sk_buff *skb)
|
|
+void skb_clone_fraglist(struct sk_buff *skb)
|
|
{
|
|
struct sk_buff *list;
|
|
|
|
diff --git a/net/core/sock.c b/net/core/sock.c
|
|
index 57b7a10703c3..8d716113e273 100644
|
|
--- a/net/core/sock.c
|
|
+++ b/net/core/sock.c
|
|
@@ -135,6 +135,11 @@
|
|
|
|
#include <trace/events/sock.h>
|
|
|
|
+#ifdef CONFIG_MPTCP
|
|
+#include <net/mptcp.h>
|
|
+#include <net/inet_common.h>
|
|
+#endif
|
|
+
|
|
#include <net/tcp.h>
|
|
#include <net/busy_poll.h>
|
|
|
|
@@ -1063,6 +1068,19 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
|
|
} else if (val != sk->sk_mark) {
|
|
sk->sk_mark = val;
|
|
sk_dst_reset(sk);
|
|
+
|
|
+ if (is_meta_sk(sk)) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (val != sk_it->sk_mark) {
|
|
+ sk_it->sk_mark = val;
|
|
+ sk_dst_reset(sk_it);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
}
|
|
break;
|
|
|
|
@@ -1563,6 +1581,23 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
|
|
*/
|
|
static inline void sock_lock_init(struct sock *sk)
|
|
{
|
|
+#ifdef CONFIG_MPTCP
|
|
+ /* Reclassify the lock-class for subflows */
|
|
+ if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP)
|
|
+ if (mptcp(tcp_sk(sk)) || tcp_sk(sk)->is_master_sk) {
|
|
+ sock_lock_init_class_and_name(sk, meta_slock_key_name,
|
|
+ &meta_slock_key,
|
|
+ meta_key_name,
|
|
+ &meta_key);
|
|
+
|
|
+ /* We don't yet have the mptcp-point.
|
|
+ * Thus we still need inet_sock_destruct
|
|
+ */
|
|
+ sk->sk_destruct = inet_sock_destruct;
|
|
+ return;
|
|
+ }
|
|
+#endif
|
|
+
|
|
if (sk->sk_kern_sock)
|
|
sock_lock_init_class_and_name(
|
|
sk,
|
|
@@ -1611,8 +1646,12 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
|
|
sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
|
|
if (!sk)
|
|
return sk;
|
|
- if (want_init_on_alloc(priority))
|
|
- sk_prot_clear_nulls(sk, prot->obj_size);
|
|
+ if (want_init_on_alloc(priority)) {
|
|
+ if (prot->clear_sk)
|
|
+ prot->clear_sk(sk, prot->obj_size);
|
|
+ else
|
|
+ sk_prot_clear_nulls(sk, prot->obj_size);
|
|
+ }
|
|
} else
|
|
sk = kmalloc(prot->obj_size, priority);
|
|
|
|
@@ -1846,6 +1885,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
|
|
atomic_set(&newsk->sk_zckey, 0);
|
|
|
|
sock_reset_flag(newsk, SOCK_DONE);
|
|
+ sock_reset_flag(newsk, SOCK_MPTCP);
|
|
|
|
/* sk->sk_memcg will be populated at accept() time */
|
|
newsk->sk_memcg = NULL;
|
|
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
|
|
index a926de2e42b5..6d73dc6e2586 100644
|
|
--- a/net/ipv4/Kconfig
|
|
+++ b/net/ipv4/Kconfig
|
|
@@ -655,6 +655,51 @@ config TCP_CONG_BBR
|
|
bufferbloat, policers, or AQM schemes that do not provide a delay
|
|
signal. It requires the fq ("Fair Queue") pacing packet scheduler.
|
|
|
|
+config TCP_CONG_LIA
|
|
+ tristate "MPTCP Linked Increase"
|
|
+ depends on MPTCP
|
|
+ default n
|
|
+ ---help---
|
|
+ MultiPath TCP Linked Increase Congestion Control
|
|
+ To enable it, just put 'lia' in tcp_congestion_control
|
|
+
|
|
+config TCP_CONG_OLIA
|
|
+ tristate "MPTCP Opportunistic Linked Increase"
|
|
+ depends on MPTCP
|
|
+ default n
|
|
+ ---help---
|
|
+ MultiPath TCP Opportunistic Linked Increase Congestion Control
|
|
+ To enable it, just put 'olia' in tcp_congestion_control
|
|
+
|
|
+config TCP_CONG_WVEGAS
|
|
+ tristate "MPTCP WVEGAS CONGESTION CONTROL"
|
|
+ depends on MPTCP
|
|
+ default n
|
|
+ ---help---
|
|
+ wVegas congestion control for MPTCP
|
|
+ To enable it, just put 'wvegas' in tcp_congestion_control
|
|
+
|
|
+config TCP_CONG_BALIA
|
|
+ tristate "MPTCP BALIA CONGESTION CONTROL"
|
|
+ depends on MPTCP
|
|
+ default n
|
|
+ ---help---
|
|
+ Multipath TCP Balanced Linked Adaptation Congestion Control
|
|
+ To enable it, just put 'balia' in tcp_congestion_control
|
|
+
|
|
+config TCP_CONG_MCTCPDESYNC
|
|
+ tristate "DESYNCHRONIZED MCTCP CONGESTION CONTROL (EXPERIMENTAL)"
|
|
+ depends on MPTCP
|
|
+ default n
|
|
+ ---help---
|
|
+ Desynchronized MultiChannel TCP Congestion Control. This is experimental
|
|
+ code that only supports single path and must have set mptcp_ndiffports
|
|
+ larger than one.
|
|
+ To enable it, just put 'mctcpdesync' in tcp_congestion_control
|
|
+ For further details see:
|
|
+ http://ieeexplore.ieee.org/abstract/document/6911722/
|
|
+ https://doi.org/10.1016/j.comcom.2015.07.010
|
|
+
|
|
choice
|
|
prompt "Default TCP congestion control"
|
|
default DEFAULT_CUBIC
|
|
@@ -692,6 +737,21 @@ choice
|
|
config DEFAULT_BBR
|
|
bool "BBR" if TCP_CONG_BBR=y
|
|
|
|
+ config DEFAULT_LIA
|
|
+ bool "Lia" if TCP_CONG_LIA=y
|
|
+
|
|
+ config DEFAULT_OLIA
|
|
+ bool "Olia" if TCP_CONG_OLIA=y
|
|
+
|
|
+ config DEFAULT_WVEGAS
|
|
+ bool "Wvegas" if TCP_CONG_WVEGAS=y
|
|
+
|
|
+ config DEFAULT_BALIA
|
|
+ bool "Balia" if TCP_CONG_BALIA=y
|
|
+
|
|
+ config DEFAULT_MCTCPDESYNC
|
|
+ bool "Mctcpdesync (EXPERIMENTAL)" if TCP_CONG_MCTCPDESYNC=y
|
|
+
|
|
config DEFAULT_RENO
|
|
bool "Reno"
|
|
endchoice
|
|
@@ -712,6 +772,10 @@ config DEFAULT_TCP_CONG
|
|
default "vegas" if DEFAULT_VEGAS
|
|
default "westwood" if DEFAULT_WESTWOOD
|
|
default "veno" if DEFAULT_VENO
|
|
+ default "lia" if DEFAULT_LIA
|
|
+ default "olia" if DEFAULT_OLIA
|
|
+ default "wvegas" if DEFAULT_WVEGAS
|
|
+ default "balia" if DEFAULT_BALIA
|
|
default "reno" if DEFAULT_RENO
|
|
default "dctcp" if DEFAULT_DCTCP
|
|
default "cdg" if DEFAULT_CDG
|
|
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
|
|
index c800220c404d..b8f10024780a 100644
|
|
--- a/net/ipv4/af_inet.c
|
|
+++ b/net/ipv4/af_inet.c
|
|
@@ -100,6 +100,7 @@
|
|
#include <net/ip_fib.h>
|
|
#include <net/inet_connection_sock.h>
|
|
#include <net/tcp.h>
|
|
+#include <net/mptcp.h>
|
|
#include <net/udp.h>
|
|
#include <net/udplite.h>
|
|
#include <net/ping.h>
|
|
@@ -150,6 +151,9 @@ void inet_sock_destruct(struct sock *sk)
|
|
return;
|
|
}
|
|
|
|
+ if (sock_flag(sk, SOCK_MPTCP))
|
|
+ mptcp_disable_static_key();
|
|
+
|
|
WARN_ON(atomic_read(&sk->sk_rmem_alloc));
|
|
WARN_ON(refcount_read(&sk->sk_wmem_alloc));
|
|
WARN_ON(sk->sk_wmem_queued);
|
|
@@ -227,6 +231,8 @@ int inet_listen(struct socket *sock, int backlog)
|
|
tcp_fastopen_init_key_once(sock_net(sk));
|
|
}
|
|
|
|
+ mptcp_init_listen(sk);
|
|
+
|
|
err = inet_csk_listen_start(sk, backlog);
|
|
if (err)
|
|
goto out;
|
|
@@ -244,8 +250,7 @@ int inet_listen(struct socket *sock, int backlog)
|
|
* Create an inet socket.
|
|
*/
|
|
|
|
-static int inet_create(struct net *net, struct socket *sock, int protocol,
|
|
- int kern)
|
|
+int inet_create(struct net *net, struct socket *sock, int protocol, int kern)
|
|
{
|
|
struct sock *sk;
|
|
struct inet_protosw *answer;
|
|
@@ -739,6 +744,24 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags,
|
|
lock_sock(sk2);
|
|
|
|
sock_rps_record_flow(sk2);
|
|
+
|
|
+ if (sk2->sk_protocol == IPPROTO_TCP && mptcp(tcp_sk(sk2))) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ mptcp_for_each_sub(tcp_sk(sk2)->mpcb, mptcp) {
|
|
+ sock_rps_record_flow(mptcp_to_sock(mptcp));
|
|
+ }
|
|
+
|
|
+ if (tcp_sk(sk2)->mpcb->master_sk) {
|
|
+ struct sock *sk_it = tcp_sk(sk2)->mpcb->master_sk;
|
|
+
|
|
+ write_lock_bh(&sk_it->sk_callback_lock);
|
|
+ rcu_assign_pointer(sk_it->sk_wq, &newsock->wq);
|
|
+ sk_it->sk_socket = newsock;
|
|
+ write_unlock_bh(&sk_it->sk_callback_lock);
|
|
+ }
|
|
+ }
|
|
+
|
|
WARN_ON(!((1 << sk2->sk_state) &
|
|
(TCPF_ESTABLISHED | TCPF_SYN_RECV |
|
|
TCPF_CLOSE_WAIT | TCPF_CLOSE)));
|
|
@@ -1978,6 +2001,9 @@ static int __init inet_init(void)
|
|
if (init_ipv4_mibs())
|
|
panic("%s: Cannot init ipv4 mibs\n", __func__);
|
|
|
|
+ /* We must initialize MPTCP before TCP. */
|
|
+ mptcp_init();
|
|
+
|
|
/* Setup TCP slab cache for open requests. */
|
|
tcp_init();
|
|
|
|
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
|
|
index 85a88425edc4..f3de2d6eb1a4 100644
|
|
--- a/net/ipv4/inet_connection_sock.c
|
|
+++ b/net/ipv4/inet_connection_sock.c
|
|
@@ -19,6 +19,7 @@
|
|
#include <net/route.h>
|
|
#include <net/tcp_states.h>
|
|
#include <net/xfrm.h>
|
|
+#include <net/mptcp.h>
|
|
#include <net/tcp.h>
|
|
#include <net/sock_reuseport.h>
|
|
#include <net/addrconf.h>
|
|
@@ -730,7 +731,10 @@ static void reqsk_timer_handler(struct timer_list *t)
|
|
int max_retries, thresh;
|
|
u8 defer_accept;
|
|
|
|
- if (inet_sk_state_load(sk_listener) != TCP_LISTEN)
|
|
+ if (!is_meta_sk(sk_listener) && inet_sk_state_load(sk_listener) != TCP_LISTEN)
|
|
+ goto drop;
|
|
+
|
|
+ if (is_meta_sk(sk_listener) && !mptcp_can_new_subflow(sk_listener))
|
|
goto drop;
|
|
|
|
max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
|
|
@@ -819,7 +823,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
|
|
const struct request_sock *req,
|
|
const gfp_t priority)
|
|
{
|
|
- struct sock *newsk = sk_clone_lock(sk, priority);
|
|
+ struct sock *newsk;
|
|
+
|
|
+ newsk = sk_clone_lock(sk, priority);
|
|
|
|
if (newsk) {
|
|
struct inet_connection_sock *newicsk = inet_csk(newsk);
|
|
@@ -1019,7 +1025,14 @@ void inet_csk_listen_stop(struct sock *sk)
|
|
*/
|
|
while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
|
|
struct sock *child = req->sk;
|
|
+ bool mutex_taken = false;
|
|
+ struct mptcp_cb *mpcb = tcp_sk(child)->mpcb;
|
|
|
|
+ if (is_meta_sk(child)) {
|
|
+ WARN_ON(refcount_inc_not_zero(&mpcb->mpcb_refcnt) == 0);
|
|
+ mutex_lock(&mpcb->mpcb_mutex);
|
|
+ mutex_taken = true;
|
|
+ }
|
|
local_bh_disable();
|
|
bh_lock_sock(child);
|
|
WARN_ON(sock_owned_by_user(child));
|
|
@@ -1029,6 +1042,10 @@ void inet_csk_listen_stop(struct sock *sk)
|
|
reqsk_put(req);
|
|
bh_unlock_sock(child);
|
|
local_bh_enable();
|
|
+ if (mutex_taken) {
|
|
+ mutex_unlock(&mpcb->mpcb_mutex);
|
|
+ mptcp_mpcb_put(mpcb);
|
|
+ }
|
|
sock_put(child);
|
|
|
|
cond_resched();
|
|
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
|
|
index aa3fd61818c4..8b3e955ec165 100644
|
|
--- a/net/ipv4/ip_sockglue.c
|
|
+++ b/net/ipv4/ip_sockglue.c
|
|
@@ -44,6 +44,8 @@
|
|
#endif
|
|
#include <net/ip_fib.h>
|
|
|
|
+#include <net/mptcp.h>
|
|
+
|
|
#include <linux/errqueue.h>
|
|
#include <linux/uaccess.h>
|
|
|
|
@@ -657,7 +659,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
|
|
break;
|
|
old = rcu_dereference_protected(inet->inet_opt,
|
|
lockdep_sock_is_held(sk));
|
|
- if (inet->is_icsk) {
|
|
+ if (inet->is_icsk && !is_meta_sk(sk)) {
|
|
struct inet_connection_sock *icsk = inet_csk(sk);
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
if (sk->sk_family == PF_INET ||
|
|
@@ -751,6 +753,20 @@ static int do_ip_setsockopt(struct sock *sk, int level,
|
|
inet->tos = val;
|
|
sk->sk_priority = rt_tos2priority(val);
|
|
sk_dst_reset(sk);
|
|
+ /* Update TOS on MPTCP subflows */
|
|
+ if (is_meta_sk(sk)) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (inet_sk(sk_it)->tos != inet_sk(sk)->tos) {
|
|
+ inet_sk(sk_it)->tos = inet_sk(sk)->tos;
|
|
+ sk_it->sk_priority = sk->sk_priority;
|
|
+ sk_dst_reset(sk_it);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
}
|
|
break;
|
|
case IP_TTL:
|
|
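
With the ip_sockglue.c hunk above, changing IP_TOS on an MPTCP meta-socket is mirrored onto every established subflow (TOS, sk_priority and a route reset), so the application side stays plain TCP. A short sketch of that unchanged userspace call; the TOS value is only an example:

    #include <netinet/in.h>
    #include <netinet/ip.h>
    #include <stdio.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int tos = 0x10;   /* example value (IPTOS_LOWDELAY); any TOS/DSCP byte works */

        if (fd < 0) {
            perror("socket");
            return 1;
        }

        /* On a patched kernel the new TOS also reaches all MPTCP subflows,
         * exactly as the loop in the diff above does. */
        if (setsockopt(fd, IPPROTO_IP, IP_TOS, &tos, sizeof(tos)) < 0)
            perror("setsockopt(IP_TOS)");

        close(fd);
        return 0;
    }
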
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
|
|
index 2b45d1455592..f988be944eda 100644
|
|
--- a/net/ipv4/syncookies.c
|
|
+++ b/net/ipv4/syncookies.c
|
|
@@ -12,6 +12,8 @@
|
|
#include <linux/siphash.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/export.h>
|
|
+#include <net/mptcp.h>
|
|
+#include <net/mptcp_v4.h>
|
|
#include <net/secure_seq.h>
|
|
#include <net/tcp.h>
|
|
#include <net/route.h>
|
|
@@ -175,7 +177,8 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
|
|
}
|
|
EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
|
|
|
|
-__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp)
|
|
+__u32 cookie_v4_init_sequence(struct request_sock *req, const struct sock *sk,
|
|
+ const struct sk_buff *skb, __u16 *mssp)
|
|
{
|
|
const struct iphdr *iph = ip_hdr(skb);
|
|
const struct tcphdr *th = tcp_hdr(skb);
|
|
@@ -200,14 +203,33 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
|
|
|
|
struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
|
|
struct request_sock *req,
|
|
+ const struct mptcp_options_received *mopt,
|
|
struct dst_entry *dst, u32 tsoff)
|
|
{
|
|
struct inet_connection_sock *icsk = inet_csk(sk);
|
|
struct sock *child;
|
|
bool own_req;
|
|
+#ifdef CONFIG_MPTCP
|
|
+ int ret;
|
|
+#endif
|
|
|
|
child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
|
|
NULL, &own_req);
|
|
+
|
|
+#ifdef CONFIG_MPTCP
|
|
+ if (!child)
|
|
+ goto listen_overflow;
|
|
+
|
|
+ ret = mptcp_check_req_master(sk, child, req, skb, mopt, 0, tsoff);
|
|
+ if (ret < 0)
|
|
+ return NULL;
|
|
+
|
|
+ if (!ret)
|
|
+ return tcp_sk(child)->mpcb->master_sk;
|
|
+
|
|
+listen_overflow:
|
|
+#endif
|
|
+
|
|
if (child) {
|
|
refcount_set(&req->rsk_refcnt, 1);
|
|
tcp_sk(child)->tsoffset = tsoff;
|
|
@@ -284,6 +306,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
|
|
struct tcp_options_received tcp_opt;
|
|
+ struct mptcp_options_received mopt;
|
|
struct inet_request_sock *ireq;
|
|
struct tcp_request_sock *treq;
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
@@ -313,7 +336,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
|
|
|
|
/* check for timestamp cookie support */
|
|
memset(&tcp_opt, 0, sizeof(tcp_opt));
|
|
- tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
|
|
+ mptcp_init_mp_opt(&mopt);
|
|
+ tcp_parse_options(sock_net(sk), skb, &tcp_opt, &mopt, 0, NULL, NULL);
|
|
|
|
if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
|
|
tsoff = secure_tcp_ts_off(sock_net(sk),
|
|
@@ -326,7 +350,12 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
|
|
goto out;
|
|
|
|
ret = NULL;
|
|
- req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */
|
|
+#ifdef CONFIG_MPTCP
|
|
+ if (mopt.saw_mpc)
|
|
+ req = inet_reqsk_alloc(&mptcp_request_sock_ops, sk, false); /* for safety */
|
|
+ else
|
|
+#endif
|
|
+ req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */
|
|
if (!req)
|
|
goto out;
|
|
|
|
@@ -346,6 +375,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
|
|
ireq->sack_ok = tcp_opt.sack_ok;
|
|
ireq->wscale_ok = tcp_opt.wscale_ok;
|
|
ireq->tstamp_ok = tcp_opt.saw_tstamp;
|
|
+ ireq->mptcp_rqsk = 0;
|
|
+ ireq->saw_mpc = 0;
|
|
req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
|
|
treq->snt_synack = 0;
|
|
treq->tfo_listener = false;
|
|
@@ -354,6 +385,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
|
|
|
|
ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
|
|
|
|
+ if (mopt.saw_mpc)
|
|
+ mptcp_cookies_reqsk_init(req, &mopt, skb);
|
|
+
|
|
/* We throwed the options of the initial SYN away, so we hope
|
|
* the ACK carries the same options again (see RFC1122 4.2.3.8)
|
|
*/
|
|
@@ -392,15 +426,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
|
|
(req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
|
|
req->rsk_window_clamp = full_space;
|
|
|
|
- tcp_select_initial_window(sk, full_space, req->mss,
|
|
- &req->rsk_rcv_wnd, &req->rsk_window_clamp,
|
|
- ireq->wscale_ok, &rcv_wscale,
|
|
- dst_metric(&rt->dst, RTAX_INITRWND));
|
|
+ tp->ops->select_initial_window(sk, full_space, req->mss,
|
|
+ &req->rsk_rcv_wnd, &req->rsk_window_clamp,
|
|
+ ireq->wscale_ok, &rcv_wscale,
|
|
+ dst_metric(&rt->dst, RTAX_INITRWND));
|
|
|
|
ireq->rcv_wscale = rcv_wscale;
|
|
ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst);
|
|
|
|
- ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst, tsoff);
|
|
+ ret = tcp_get_cookie_sock(sk, skb, req, &mopt, &rt->dst, tsoff);
|
|
/* ip_queue_xmit() depends on our flow being setup
|
|
* Normal sockets get it right from inet_csk_route_child_sock()
|
|
*/
|
|
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
|
|
index 9f53d25e047e..ae9ba8f2ced1 100644
|
|
--- a/net/ipv4/tcp.c
|
|
+++ b/net/ipv4/tcp.c
|
|
@@ -270,6 +270,7 @@
|
|
|
|
#include <net/icmp.h>
|
|
#include <net/inet_common.h>
|
|
+#include <net/mptcp.h>
|
|
#include <net/tcp.h>
|
|
#include <net/xfrm.h>
|
|
#include <net/ip.h>
|
|
@@ -400,6 +401,23 @@ static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
|
|
return rate64;
|
|
}
|
|
|
|
+const struct tcp_sock_ops tcp_specific = {
|
|
+ .__select_window = __tcp_select_window,
|
|
+ .select_window = tcp_select_window,
|
|
+ .select_initial_window = tcp_select_initial_window,
|
|
+ .init_buffer_space = tcp_init_buffer_space,
|
|
+ .set_rto = tcp_set_rto,
|
|
+ .should_expand_sndbuf = tcp_should_expand_sndbuf,
|
|
+ .send_fin = tcp_send_fin,
|
|
+ .write_xmit = tcp_write_xmit,
|
|
+ .send_active_reset = tcp_send_active_reset,
|
|
+ .write_wakeup = tcp_write_wakeup,
|
|
+ .retransmit_timer = tcp_retransmit_timer,
|
|
+ .time_wait = tcp_time_wait,
|
|
+ .cleanup_rbuf = tcp_cleanup_rbuf,
|
|
+ .set_cong_ctrl = __tcp_set_congestion_control,
|
|
+};
|
|
+
|
|
/* Address-family independent initialization for a tcp_sock.
|
|
*
|
|
* NOTE: A lot of things set to zero explicitly by call to
|
|
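
The new tcp_specific table is what lets MPTCP swap in its own window, buffer-space, FIN and reset handlers without touching every call site: callers go through tp->ops->... and the meta-socket simply installs a different table. The standalone sketch below illustrates that dispatch pattern only; the names are hypothetical, not the kernel's:

    #include <stdio.h>

    struct conn;

    struct conn_ops {
        void (*send_fin)(struct conn *c);
        void (*cleanup_rbuf)(struct conn *c, int copied);
    };

    struct conn {
        const struct conn_ops *ops;   /* plain-TCP or MPTCP-meta behaviour */
    };

    static void tcp_like_send_fin(struct conn *c)
    {
        (void)c;
        printf("single-path FIN\n");
    }

    static void mptcp_like_send_fin(struct conn *c)
    {
        (void)c;
        printf("DATA_FIN handling on the meta-socket\n");
    }

    static void noop_cleanup_rbuf(struct conn *c, int copied)
    {
        (void)c;
        (void)copied;
    }

    static const struct conn_ops tcp_like_ops = {
        .send_fin     = tcp_like_send_fin,
        .cleanup_rbuf = noop_cleanup_rbuf,
    };

    static const struct conn_ops mptcp_like_ops = {
        .send_fin     = mptcp_like_send_fin,
        .cleanup_rbuf = noop_cleanup_rbuf,
    };

    int main(void)
    {
        struct conn plain = { .ops = &tcp_like_ops };
        struct conn meta  = { .ops = &mptcp_like_ops };

        /* Callers never change; behaviour follows the installed table,
         * mirroring calls like tp->ops->send_fin(sk) in the patch. */
        plain.ops->send_fin(&plain);
        meta.ops->send_fin(&meta);
        return 0;
    }
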
@@ -453,6 +471,11 @@ void tcp_init_sock(struct sock *sk)
|
|
WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);
|
|
WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
|
|
|
|
+ tp->ops = &tcp_specific;
|
|
+
|
|
+ /* Initialize MPTCP-specific stuff and function-pointers */
|
|
+ mptcp_init_tcp_sock(sk);
|
|
+
|
|
sk_sockets_allocated_inc(sk);
|
|
sk->sk_route_forced_caps = NETIF_F_GSO;
|
|
}
|
|
@@ -484,7 +507,7 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
|
|
return true;
|
|
if (tcp_rmem_pressure(sk))
|
|
return true;
|
|
- if (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss)
|
|
+ if (tcp_receive_window_now(tp) <= inet_csk(sk)->icsk_ack.rcv_mss)
|
|
return true;
|
|
}
|
|
if (sk->sk_prot->stream_memory_read)
|
|
@@ -787,6 +810,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
|
|
int ret;
|
|
|
|
sock_rps_record_flow(sk);
|
|
+
|
|
/*
|
|
* We can't seek on a socket input
|
|
*/
|
|
@@ -797,6 +821,16 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
|
|
|
|
lock_sock(sk);
|
|
|
|
+#ifdef CONFIG_MPTCP
|
|
+ if (mptcp(tcp_sk(sk))) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) {
|
|
+ sock_rps_record_flow(mptcp_to_sock(mptcp));
|
|
+ }
|
|
+ }
|
|
+#endif
|
|
+
|
|
timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
|
|
while (tss.len) {
|
|
ret = __tcp_splice_read(sk, &tss);
|
|
@@ -912,8 +946,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
|
|
return NULL;
|
|
}
|
|
|
|
-static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
|
|
- int large_allowed)
|
|
+unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
u32 new_size_goal, size_goal;
|
|
@@ -941,8 +974,13 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
|
|
{
|
|
int mss_now;
|
|
|
|
- mss_now = tcp_current_mss(sk);
|
|
- *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
|
|
+ if (mptcp(tcp_sk(sk))) {
|
|
+ mss_now = mptcp_current_mss(sk);
|
|
+ *size_goal = mptcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
|
|
+ } else {
|
|
+ mss_now = tcp_current_mss(sk);
|
|
+ *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
|
|
+ }
|
|
|
|
return mss_now;
|
|
}
|
|
@@ -982,12 +1020,34 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
|
|
* is fully established.
|
|
*/
|
|
if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
|
|
- !tcp_passive_fastopen(sk)) {
|
|
+ !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ?
|
|
+ tp->mpcb->master_sk : sk)) {
|
|
err = sk_stream_wait_connect(sk, &timeo);
|
|
if (err != 0)
|
|
goto out_err;
|
|
}
|
|
|
|
+ if (mptcp(tp)) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ /* We must check this with the socket lock held because we iterate
|
|
+ * over the subflows.
|
|
+ */
|
|
+ if (!mptcp_can_sendpage(sk)) {
|
|
+ ssize_t ret;
|
|
+
|
|
+ release_sock(sk);
|
|
+ ret = sock_no_sendpage(sk->sk_socket, page, offset,
|
|
+ size, flags);
|
|
+ lock_sock(sk);
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ mptcp_for_each_sub(tp->mpcb, mptcp) {
|
|
+ sock_rps_record_flow(mptcp_to_sock(mptcp));
|
|
+ }
|
|
+ }
|
|
+
|
|
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
|
|
|
|
mss_now = tcp_send_mss(sk, &size_goal, flags);
|
|
@@ -1109,7 +1169,8 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
|
|
int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
|
|
size_t size, int flags)
|
|
{
|
|
- if (!(sk->sk_route_caps & NETIF_F_SG))
|
|
+ /* If MPTCP is enabled, we check it later after establishment */
|
|
+ if (!mptcp(tcp_sk(sk)) && !(sk->sk_route_caps & NETIF_F_SG))
|
|
return sock_no_sendpage_locked(sk, page, offset, size, flags);
|
|
|
|
tcp_rate_check_app_limited(sk); /* is sending application-limited? */
|
|
@@ -1231,12 +1292,21 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
|
|
* is fully established.
|
|
*/
|
|
if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
|
|
- !tcp_passive_fastopen(sk)) {
|
|
+ !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ?
|
|
+ tp->mpcb->master_sk : sk)) {
|
|
err = sk_stream_wait_connect(sk, &timeo);
|
|
if (err != 0)
|
|
goto do_error;
|
|
}
|
|
|
|
+ if (mptcp(tp)) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ mptcp_for_each_sub(tp->mpcb, mptcp) {
|
|
+ sock_rps_record_flow(mptcp_to_sock(mptcp));
|
|
+ }
|
|
+ }
|
|
+
|
|
if (unlikely(tp->repair)) {
|
|
if (tp->repair_queue == TCP_RECV_QUEUE) {
|
|
copied = tcp_send_rcvq(sk, msg, size);
|
|
@@ -1529,7 +1599,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
|
|
* calculation of whether or not we must ACK for the sake of
|
|
* a window update.
|
|
*/
|
|
-static void tcp_cleanup_rbuf(struct sock *sk, int copied)
|
|
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
bool time_to_ack = false;
|
|
@@ -1568,11 +1638,11 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied)
|
|
* in states, where we will not receive more. It is useless.
|
|
*/
|
|
if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
|
|
- __u32 rcv_window_now = tcp_receive_window(tp);
|
|
+ __u32 rcv_window_now = tcp_receive_window_now(tp);
|
|
|
|
/* Optimize, __tcp_select_window() is not cheap. */
|
|
if (2*rcv_window_now <= tp->window_clamp) {
|
|
- __u32 new_window = __tcp_select_window(sk);
|
|
+ __u32 new_window = tp->ops->__select_window(sk);
|
|
|
|
/* Send ACK now, if this read freed lots of space
|
|
* in our buffer. Certainly, new_window is new window.
|
|
@@ -1688,7 +1758,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
|
|
/* Clean up data we have read: This will do ACK frames. */
|
|
if (copied > 0) {
|
|
tcp_recv_skb(sk, seq, &offset);
|
|
- tcp_cleanup_rbuf(sk, copied);
|
|
+ tp->ops->cleanup_rbuf(sk, copied);
|
|
}
|
|
return copied;
|
|
}
|
|
@@ -1979,6 +2049,16 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
|
|
|
|
lock_sock(sk);
|
|
|
|
+#ifdef CONFIG_MPTCP
|
|
+ if (mptcp(tp)) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ mptcp_for_each_sub(tp->mpcb, mptcp) {
|
|
+ sock_rps_record_flow(mptcp_to_sock(mptcp));
|
|
+ }
|
|
+ }
|
|
+#endif
|
|
+
|
|
err = -ENOTCONN;
|
|
if (sk->sk_state == TCP_LISTEN)
|
|
goto out;
|
|
@@ -2097,7 +2177,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
|
|
}
|
|
}
|
|
|
|
- tcp_cleanup_rbuf(sk, copied);
|
|
+ tp->ops->cleanup_rbuf(sk, copied);
|
|
|
|
if (copied >= target) {
|
|
/* Do not sleep, just process backlog. */
|
|
@@ -2189,7 +2269,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
|
|
*/
|
|
|
|
/* Clean up data we have read: This will do ACK frames. */
|
|
- tcp_cleanup_rbuf(sk, copied);
|
|
+ tp->ops->cleanup_rbuf(sk, copied);
|
|
|
|
release_sock(sk);
|
|
|
|
@@ -2248,8 +2328,11 @@ void tcp_set_state(struct sock *sk, int state)
|
|
|
|
switch (state) {
|
|
case TCP_ESTABLISHED:
|
|
- if (oldstate != TCP_ESTABLISHED)
|
|
+ if (oldstate != TCP_ESTABLISHED) {
|
|
TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
|
|
+ if (is_meta_sk(sk))
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB);
|
|
+ }
|
|
break;
|
|
|
|
case TCP_CLOSE:
|
|
@@ -2262,8 +2345,11 @@ void tcp_set_state(struct sock *sk, int state)
|
|
inet_put_port(sk);
|
|
/* fall through */
|
|
default:
|
|
- if (oldstate == TCP_ESTABLISHED)
|
|
+ if (oldstate == TCP_ESTABLISHED) {
|
|
TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
|
|
+ if (is_meta_sk(sk))
|
|
+ MPTCP_DEC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB);
|
|
+ }
|
|
}
|
|
|
|
/* Change state AFTER socket is unhashed to avoid closed
|
|
@@ -2297,7 +2383,7 @@ void tcp_set_state(struct sock *sk, int state)
|
|
[TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */
|
|
};
|
|
|
|
-static int tcp_close_state(struct sock *sk)
|
|
+int tcp_close_state(struct sock *sk)
|
|
{
|
|
int next = (int)new_state[sk->sk_state];
|
|
int ns = next & TCP_STATE_MASK;
|
|
@@ -2327,7 +2413,7 @@ void tcp_shutdown(struct sock *sk, int how)
|
|
TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
|
|
/* Clear out any half completed packets. FIN if needed. */
|
|
if (tcp_close_state(sk))
|
|
- tcp_send_fin(sk);
|
|
+ tcp_sk(sk)->ops->send_fin(sk);
|
|
}
|
|
}
|
|
EXPORT_SYMBOL(tcp_shutdown);
|
|
@@ -2352,6 +2438,17 @@ void tcp_close(struct sock *sk, long timeout)
|
|
int data_was_unread = 0;
|
|
int state;
|
|
|
|
+ if (is_meta_sk(sk)) {
|
|
+ /* TODO: Currently forcing timeout to 0 because
|
|
+ * sk_stream_wait_close will complain under lockdep because
|
|
+ * of the mpcb_mutex (circular lock dependency through
|
|
+ * inet_csk_listen_stop()).
|
|
+ * We should find a way to get rid of the mpcb_mutex.
|
|
+ */
|
|
+ mptcp_close(sk, 0);
|
|
+ return;
|
|
+ }
|
|
+
|
|
lock_sock(sk);
|
|
sk->sk_shutdown = SHUTDOWN_MASK;
|
|
|
|
@@ -2396,7 +2493,7 @@ void tcp_close(struct sock *sk, long timeout)
|
|
/* Unread data was tossed, zap the connection. */
|
|
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
|
|
tcp_set_state(sk, TCP_CLOSE);
|
|
- tcp_send_active_reset(sk, sk->sk_allocation);
|
|
+ tcp_sk(sk)->ops->send_active_reset(sk, sk->sk_allocation);
|
|
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
|
|
/* Check zero linger _after_ checking for unread data. */
|
|
sk->sk_prot->disconnect(sk, 0);
|
|
@@ -2470,7 +2567,7 @@ void tcp_close(struct sock *sk, long timeout)
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
if (tp->linger2 < 0) {
|
|
tcp_set_state(sk, TCP_CLOSE);
|
|
- tcp_send_active_reset(sk, GFP_ATOMIC);
|
|
+ tp->ops->send_active_reset(sk, GFP_ATOMIC);
|
|
__NET_INC_STATS(sock_net(sk),
|
|
LINUX_MIB_TCPABORTONLINGER);
|
|
} else {
|
|
@@ -2480,7 +2577,8 @@ void tcp_close(struct sock *sk, long timeout)
|
|
inet_csk_reset_keepalive_timer(sk,
|
|
tmo - TCP_TIMEWAIT_LEN);
|
|
} else {
|
|
- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
|
|
+ tcp_sk(sk)->ops->time_wait(sk, TCP_FIN_WAIT2,
|
|
+ tmo);
|
|
goto out;
|
|
}
|
|
}
|
|
@@ -2489,7 +2587,7 @@ void tcp_close(struct sock *sk, long timeout)
|
|
sk_mem_reclaim(sk);
|
|
if (tcp_check_oom(sk, 0)) {
|
|
tcp_set_state(sk, TCP_CLOSE);
|
|
- tcp_send_active_reset(sk, GFP_ATOMIC);
|
|
+ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
|
|
__NET_INC_STATS(sock_net(sk),
|
|
LINUX_MIB_TCPABORTONMEMORY);
|
|
} else if (!check_net(sock_net(sk))) {
|
|
@@ -2521,15 +2619,6 @@ void tcp_close(struct sock *sk, long timeout)
|
|
}
|
|
EXPORT_SYMBOL(tcp_close);
|
|
|
|
-/* These states need RST on ABORT according to RFC793 */
|
|
-
|
|
-static inline bool tcp_need_reset(int state)
|
|
-{
|
|
- return (1 << state) &
|
|
- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
|
|
- TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
|
|
-}
|
|
-
|
|
static void tcp_rtx_queue_purge(struct sock *sk)
|
|
{
|
|
struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
|
|
@@ -2551,6 +2640,10 @@ void tcp_write_queue_purge(struct sock *sk)
|
|
{
|
|
struct sk_buff *skb;
|
|
|
|
+ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk) &&
|
|
+ !tcp_rtx_and_write_queues_empty(sk))
|
|
+ mptcp_reinject_data(sk, 0);
|
|
+
|
|
tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
|
|
while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
|
|
tcp_skb_tsorted_anchor_cleanup(skb);
|
|
@@ -2569,6 +2662,36 @@ void tcp_write_queue_purge(struct sock *sk)
|
|
inet_csk(sk)->icsk_backoff = 0;
|
|
}
|
|
|
|
+void tcp_reset_vars(struct sock *sk)
|
|
+{
|
|
+ struct inet_connection_sock *icsk = inet_csk(sk);
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+
|
|
+ tp->srtt_us = 0;
|
|
+ tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
|
|
+ tp->rcv_rtt_last_tsecr = 0;
|
|
+ icsk->icsk_probes_tstamp = 0;
|
|
+ icsk->icsk_rto = TCP_TIMEOUT_INIT;
|
|
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
|
|
+ tp->snd_cwnd = TCP_INIT_CWND;
|
|
+ tp->snd_cwnd_cnt = 0;
|
|
+ tp->delivered = 0;
|
|
+ tp->delivered_ce = 0;
|
|
+ tp->is_sack_reneg = 0;
|
|
+ tcp_clear_retrans(tp);
|
|
+ tp->segs_in = 0;
|
|
+ tp->segs_out = 0;
|
|
+ tp->bytes_sent = 0;
|
|
+ tp->bytes_acked = 0;
|
|
+ tp->bytes_received = 0;
|
|
+ tp->bytes_retrans = 0;
|
|
+ tp->total_retrans = 0;
|
|
+ tp->data_segs_in = 0;
|
|
+ tp->data_segs_out = 0;
|
|
+ /* There's a bubble in the pipe until at least the first ACK. */
|
|
+ tp->app_limited = ~0U;
|
|
+}
|
|
+
|
|
int tcp_disconnect(struct sock *sk, int flags)
|
|
{
|
|
struct inet_sock *inet = inet_sk(sk);
|
|
@@ -2591,7 +2714,7 @@ int tcp_disconnect(struct sock *sk, int flags)
|
|
/* The last check adjusts for discrepancy of Linux wrt. RFC
|
|
* states
|
|
*/
|
|
- tcp_send_active_reset(sk, gfp_any());
|
|
+ tp->ops->send_active_reset(sk, gfp_any());
|
|
sk->sk_err = ECONNRESET;
|
|
} else if (old_state == TCP_SYN_SENT)
|
|
sk->sk_err = ECONNRESET;
|
|
@@ -2613,11 +2736,15 @@ int tcp_disconnect(struct sock *sk, int flags)
|
|
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
|
|
inet_reset_saddr(sk);
|
|
|
|
+ if (is_meta_sk(sk)) {
|
|
+ mptcp_disconnect(sk);
|
|
+ } else {
|
|
+ if (tp->inside_tk_table)
|
|
+ mptcp_hash_remove_bh(tp);
|
|
+ }
|
|
+
|
|
sk->sk_shutdown = 0;
|
|
sock_reset_flag(sk, SOCK_DONE);
|
|
- tp->srtt_us = 0;
|
|
- tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
|
|
- tp->rcv_rtt_last_tsecr = 0;
|
|
|
|
seq = tp->write_seq + tp->max_window + 2;
|
|
if (!seq)
|
|
@@ -2627,21 +2754,14 @@ int tcp_disconnect(struct sock *sk, int flags)
|
|
icsk->icsk_backoff = 0;
|
|
tp->snd_cwnd = 2;
|
|
icsk->icsk_probes_out = 0;
|
|
- icsk->icsk_probes_tstamp = 0;
|
|
- icsk->icsk_rto = TCP_TIMEOUT_INIT;
|
|
- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
|
|
- tp->snd_cwnd = TCP_INIT_CWND;
|
|
- tp->snd_cwnd_cnt = 0;
|
|
tp->window_clamp = 0;
|
|
- tp->delivered = 0;
|
|
- tp->delivered_ce = 0;
|
|
+
|
|
+ tcp_reset_vars(sk);
|
|
+
|
|
if (icsk->icsk_ca_ops->release)
|
|
icsk->icsk_ca_ops->release(sk);
|
|
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
|
|
tcp_set_ca_state(sk, TCP_CA_Open);
|
|
- tp->is_sack_reneg = 0;
|
|
- tcp_clear_retrans(tp);
|
|
- tp->total_retrans = 0;
|
|
inet_csk_delack_init(sk);
|
|
/* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
|
|
* issue in __tcp_select_window()
|
|
@@ -2653,14 +2773,6 @@ int tcp_disconnect(struct sock *sk, int flags)
|
|
sk->sk_rx_dst = NULL;
|
|
tcp_saved_syn_free(tp);
|
|
tp->compressed_ack = 0;
|
|
- tp->segs_in = 0;
|
|
- tp->segs_out = 0;
|
|
- tp->bytes_sent = 0;
|
|
- tp->bytes_acked = 0;
|
|
- tp->bytes_received = 0;
|
|
- tp->bytes_retrans = 0;
|
|
- tp->data_segs_in = 0;
|
|
- tp->data_segs_out = 0;
|
|
tp->duplicate_sack[0].start_seq = 0;
|
|
tp->duplicate_sack[0].end_seq = 0;
|
|
tp->dsack_dups = 0;
|
|
@@ -2669,8 +2781,6 @@ int tcp_disconnect(struct sock *sk, int flags)
|
|
tp->sacked_out = 0;
|
|
tp->tlp_high_seq = 0;
|
|
tp->last_oow_ack_time = 0;
|
|
- /* There's a bubble in the pipe until at least the first ACK. */
|
|
- tp->app_limited = ~0U;
|
|
tp->rack.mstamp = 0;
|
|
tp->rack.advanced = 0;
|
|
tp->rack.reo_wnd_steps = 1;
|
|
@@ -2704,7 +2814,7 @@ int tcp_disconnect(struct sock *sk, int flags)
|
|
static inline bool tcp_can_repair_sock(const struct sock *sk)
|
|
{
|
|
return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
|
|
- (sk->sk_state != TCP_LISTEN);
|
|
+ (sk->sk_state != TCP_LISTEN) && !sock_flag(sk, SOCK_MPTCP);
|
|
}
|
|
|
|
static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
|
|
@@ -2735,6 +2845,7 @@ static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int l
|
|
|
|
tp->rcv_wnd = opt.rcv_wnd;
|
|
tp->rcv_wup = opt.rcv_wup;
|
|
+ tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd;
|
|
|
|
return 0;
|
|
}
|
|
@@ -2873,6 +2984,61 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
|
|
|
|
return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
|
|
}
|
|
+#ifdef CONFIG_MPTCP
|
|
+ case MPTCP_SCHEDULER: {
|
|
+ char name[MPTCP_SCHED_NAME_MAX];
|
|
+
|
|
+ if (optlen < 1)
|
|
+ return -EINVAL;
|
|
+
|
|
+ /* Cannot be used if MPTCP is not used or we already have
|
|
+ * established an MPTCP-connection.
|
|
+ */
|
|
+ if (mptcp_init_failed || !sysctl_mptcp_enabled ||
|
|
+ sk->sk_state != TCP_CLOSE)
|
|
+ return -EPERM;
|
|
+
|
|
+ val = strncpy_from_user(name, optval,
|
|
+ min_t(long, MPTCP_SCHED_NAME_MAX - 1,
|
|
+ optlen));
|
|
+
|
|
+ if (val < 0)
|
|
+ return -EFAULT;
|
|
+ name[val] = 0;
|
|
+
|
|
+ lock_sock(sk);
|
|
+ err = mptcp_set_scheduler(sk, name);
|
|
+ release_sock(sk);
|
|
+ return err;
|
|
+ }
|
|
+
|
|
+ case MPTCP_PATH_MANAGER: {
|
|
+ char name[MPTCP_PM_NAME_MAX];
|
|
+
|
|
+ if (optlen < 1)
|
|
+ return -EINVAL;
|
|
+
|
|
+ /* Cannot be used if MPTCP is not used or we already have
|
|
+ * established an MPTCP-connection.
|
|
+ */
|
|
+ if (mptcp_init_failed || !sysctl_mptcp_enabled ||
|
|
+ sk->sk_state != TCP_CLOSE)
|
|
+ return -EPERM;
|
|
+
|
|
+ val = strncpy_from_user(name, optval,
|
|
+ min_t(long, MPTCP_PM_NAME_MAX - 1,
|
|
+ optlen));
|
|
+
|
|
+ if (val < 0)
|
|
+ return -EFAULT;
|
|
+ name[val] = 0;
|
|
+
|
|
+ lock_sock(sk);
|
|
+ err = mptcp_set_path_manager(sk, name);
|
|
+ release_sock(sk);
|
|
+ return err;
|
|
+ }
|
|
+#endif
|
|
default:
|
|
/* fallthru */
|
|
break;
|
|
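
These two setsockopt cases expose the scheduler and path-manager choice per socket: the name string is copied in with strncpy_from_user and is only accepted while the socket is still in TCP_CLOSE. From userspace that looks roughly like the sketch below; the MPTCP_SCHEDULER/MPTCP_PATH_MANAGER option numbers normally come from the patched include/uapi/linux/tcp.h, so the fallback values and the module names here are assumptions that must match the running kernel:

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    #ifndef MPTCP_SCHEDULER
    #define MPTCP_SCHEDULER    43   /* assumed value, see the patched uapi tcp.h */
    #endif
    #ifndef MPTCP_PATH_MANAGER
    #define MPTCP_PATH_MANAGER 44   /* assumed value, see the patched uapi tcp.h */
    #endif

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        const char sched[] = "redundant";   /* example scheduler name */
        const char pm[]    = "fullmesh";    /* example path-manager name */

        if (fd < 0) {
            perror("socket");
            return 1;
        }

        /* Both options are only accepted while the socket is still in
         * TCP_CLOSE, i.e. before connect()/listen(), as enforced above. */
        if (setsockopt(fd, IPPROTO_TCP, MPTCP_SCHEDULER, sched, strlen(sched)) < 0)
            perror("setsockopt(MPTCP_SCHEDULER)");
        if (setsockopt(fd, IPPROTO_TCP, MPTCP_PATH_MANAGER, pm, strlen(pm)) < 0)
            perror("setsockopt(MPTCP_PATH_MANAGER)");

        close(fd);
        return 0;
    }
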
@@ -3062,6 +3228,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
|
|
break;
|
|
|
|
case TCP_DEFER_ACCEPT:
|
|
+ /* An established MPTCP-connection (mptcp(tp) only returns true
|
|
+ * if the socket is established) should not use DEFER on new
|
|
+ * subflows.
|
|
+ */
|
|
+ if (mptcp(tp))
|
|
+ break;
|
|
/* Translate value in seconds to number of retransmits */
|
|
icsk->icsk_accept_queue.rskq_defer_accept =
|
|
secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
|
|
@@ -3089,7 +3261,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
|
|
(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
|
|
inet_csk_ack_scheduled(sk)) {
|
|
icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
|
|
- tcp_cleanup_rbuf(sk, 1);
|
|
+ tp->ops->cleanup_rbuf(sk, 1);
|
|
if (!(val & 1))
|
|
inet_csk_enter_pingpong_mode(sk);
|
|
}
|
|
@@ -3099,7 +3271,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
case TCP_MD5SIG:
|
|
case TCP_MD5SIG_EXT:
|
|
- err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
|
|
+ if (!sock_flag(sk, SOCK_MPTCP))
|
|
+ err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
|
|
+ else
|
|
+ err = -EINVAL;
|
|
break;
|
|
#endif
|
|
case TCP_USER_TIMEOUT:
|
|
@@ -3155,6 +3330,32 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
|
|
tp->notsent_lowat = val;
|
|
sk->sk_write_space(sk);
|
|
break;
|
|
+#ifdef CONFIG_MPTCP
|
|
+ case MPTCP_ENABLED:
|
|
+ if (mptcp_init_failed || !sysctl_mptcp_enabled ||
|
|
+ sk->sk_state != TCP_CLOSE
|
|
+#ifdef CONFIG_TCP_MD5SIG
|
|
+ || rcu_access_pointer(tp->md5sig_info)
|
|
+#endif
|
|
+ ) {
|
|
+ err = -EPERM;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (val)
|
|
+ mptcp_enable_sock(sk);
|
|
+ else
|
|
+ mptcp_disable_sock(sk);
|
|
+ break;
|
|
+ case MPTCP_INFO:
|
|
+ if (mptcp_init_failed || !sysctl_mptcp_enabled) {
|
|
+ err = -EPERM;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ tp->record_master_info = !!(val & MPTCP_INFO_FLAG_SAVE_MASTER);
|
|
+ break;
|
|
+#endif
|
|
case TCP_INQ:
|
|
if (val > 1 || val < 0)
|
|
err = -EINVAL;
|
|
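
MPTCP_ENABLED is the per-socket request for Multipath TCP on an otherwise ordinary TCP socket; as the hunk above shows, the kernel refuses it once the socket has left TCP_CLOSE or when an MD5 key is already configured. A hedged userspace sketch, with the option number assumed to match the patched uapi header:

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <stdio.h>
    #include <sys/socket.h>
    #include <unistd.h>

    #ifndef MPTCP_ENABLED
    #define MPTCP_ENABLED 42   /* assumed value, see the patched uapi tcp.h */
    #endif

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int one = 1;

        if (fd < 0) {
            perror("socket");
            return 1;
        }

        /* Must happen before connect()/listen(): the kernel returns EPERM
         * once the socket is no longer in TCP_CLOSE. */
        if (setsockopt(fd, IPPROTO_TCP, MPTCP_ENABLED, &one, sizeof(one)) < 0)
            perror("setsockopt(MPTCP_ENABLED)");

        /* ... connect() as usual; the SYN will then carry MP_CAPABLE. */
        close(fd);
        return 0;
    }
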
@@ -3219,7 +3420,7 @@ static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
|
|
}
|
|
|
|
/* Return information about state of tcp endpoint in API format. */
|
|
-void tcp_get_info(struct sock *sk, struct tcp_info *info)
|
|
+void tcp_get_info(struct sock *sk, struct tcp_info *info, bool no_lock)
|
|
{
|
|
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
|
|
const struct inet_connection_sock *icsk = inet_csk(sk);
|
|
@@ -3256,7 +3457,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
|
|
return;
|
|
}
|
|
|
|
- slow = lock_sock_fast(sk);
|
|
+ if (!no_lock)
|
|
+ slow = lock_sock_fast(sk);
|
|
|
|
info->tcpi_ca_state = icsk->icsk_ca_state;
|
|
info->tcpi_retransmits = icsk->icsk_retransmits;
|
|
@@ -3332,7 +3534,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
|
|
info->tcpi_reord_seen = tp->reord_seen;
|
|
info->tcpi_rcv_ooopack = tp->rcv_ooopack;
|
|
info->tcpi_snd_wnd = tp->snd_wnd;
|
|
- unlock_sock_fast(sk, slow);
|
|
+
|
|
+ if (!no_lock)
|
|
+ unlock_sock_fast(sk, slow);
|
|
}
|
|
EXPORT_SYMBOL_GPL(tcp_get_info);
|
|
|
|
@@ -3479,7 +3683,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
|
|
if (get_user(len, optlen))
|
|
return -EFAULT;
|
|
|
|
- tcp_get_info(sk, &info);
|
|
+ tcp_get_info(sk, &info, false);
|
|
|
|
len = min_t(unsigned int, len, sizeof(info));
|
|
if (put_user(len, optlen))
|
|
@@ -3668,6 +3872,87 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
|
|
}
|
|
return 0;
|
|
}
|
|
+#ifdef CONFIG_MPTCP
|
|
+ case MPTCP_SCHEDULER:
|
|
+ if (get_user(len, optlen))
|
|
+ return -EFAULT;
|
|
+ len = min_t(unsigned int, len, MPTCP_SCHED_NAME_MAX);
|
|
+ if (put_user(len, optlen))
|
|
+ return -EFAULT;
|
|
+
|
|
+ lock_sock(sk);
|
|
+ if (mptcp(tcp_sk(sk))) {
|
|
+ struct mptcp_cb *mpcb = tcp_sk(mptcp_meta_sk(sk))->mpcb;
|
|
+
|
|
+ if (copy_to_user(optval, mpcb->sched_ops->name, len)) {
|
|
+ release_sock(sk);
|
|
+ return -EFAULT;
|
|
+ }
|
|
+ } else {
|
|
+ if (copy_to_user(optval, tcp_sk(sk)->mptcp_sched_name,
|
|
+ len)) {
|
|
+ release_sock(sk);
|
|
+ return -EFAULT;
|
|
+ }
|
|
+ }
|
|
+ release_sock(sk);
|
|
+ return 0;
|
|
+
|
|
+ case MPTCP_PATH_MANAGER:
|
|
+ if (get_user(len, optlen))
|
|
+ return -EFAULT;
|
|
+ len = min_t(unsigned int, len, MPTCP_PM_NAME_MAX);
|
|
+ if (put_user(len, optlen))
|
|
+ return -EFAULT;
|
|
+
|
|
+ lock_sock(sk);
|
|
+ if (mptcp(tcp_sk(sk))) {
|
|
+ struct mptcp_cb *mpcb = tcp_sk(mptcp_meta_sk(sk))->mpcb;
|
|
+
|
|
+ if (copy_to_user(optval, mpcb->pm_ops->name, len)) {
|
|
+ release_sock(sk);
|
|
+ return -EFAULT;
|
|
+ }
|
|
+ } else {
|
|
+ if (copy_to_user(optval, tcp_sk(sk)->mptcp_pm_name,
|
|
+ len)) {
|
|
+ release_sock(sk);
|
|
+ return -EFAULT;
|
|
+ }
|
|
+ }
|
|
+ release_sock(sk);
|
|
+ return 0;
|
|
+
|
|
+ case MPTCP_ENABLED:
|
|
+ if (sk->sk_state != TCP_SYN_SENT)
|
|
+ val = mptcp(tp) ? 1 : 0;
|
|
+ else
|
|
+ val = sock_flag(sk, SOCK_MPTCP) ? 1 : 0;
|
|
+ break;
|
|
+ case MPTCP_INFO:
|
|
+ {
|
|
+ int ret;
|
|
+
|
|
+ if (!mptcp(tp))
|
|
+ return -EINVAL;
|
|
+
|
|
+ if (get_user(len, optlen))
|
|
+ return -EFAULT;
|
|
+
|
|
+ len = min_t(unsigned int, len, sizeof(struct mptcp_info));
|
|
+
|
|
+ lock_sock(sk);
|
|
+ ret = mptcp_get_info(sk, optval, len);
|
|
+ release_sock(sk);
|
|
+
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if (put_user(len, optlen))
|
|
+ return -EFAULT;
|
|
+ return 0;
|
|
+ }
|
|
+#endif
|
|
#ifdef CONFIG_MMU
|
|
case TCP_ZEROCOPY_RECEIVE: {
|
|
struct tcp_zerocopy_receive zc;
|
|
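
On the read side, getsockopt(MPTCP_ENABLED) reports whether the connection actually ended up using MPTCP (the peer may have refused MP_CAPABLE), which is useful after connect(). A small helper sketch, again assuming the patched option number:

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    #ifndef MPTCP_ENABLED
    #define MPTCP_ENABLED 42   /* assumed value, see the patched uapi tcp.h */
    #endif

    /* Returns 1 if the established socket is using MPTCP, 0 if it fell back
     * to plain TCP, -1 on error.  Call after connect() has completed. */
    int sock_is_mptcp(int fd)
    {
        int val = 0;
        socklen_t len = sizeof(val);

        if (getsockopt(fd, IPPROTO_TCP, MPTCP_ENABLED, &val, &len) < 0)
            return -1;
        return val ? 1 : 0;
    }
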
@@ -3873,7 +4158,9 @@ void tcp_done(struct sock *sk)
|
|
if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
|
|
TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
|
|
|
|
+ WARN_ON(sk->sk_state == TCP_CLOSE);
|
|
tcp_set_state(sk, TCP_CLOSE);
|
|
+
|
|
tcp_clear_xmit_timers(sk);
|
|
if (req)
|
|
reqsk_fastopen_remove(sk, req, false);
|
|
@@ -3889,6 +4176,8 @@ void tcp_done(struct sock *sk)
|
|
|
|
int tcp_abort(struct sock *sk, int err)
|
|
{
|
|
+ struct sock *meta_sk = mptcp(tcp_sk(sk)) ? mptcp_meta_sk(sk) : sk;
|
|
+
|
|
if (!sk_fullsock(sk)) {
|
|
if (sk->sk_state == TCP_NEW_SYN_RECV) {
|
|
struct request_sock *req = inet_reqsk(sk);
|
|
@@ -3902,7 +4191,7 @@ int tcp_abort(struct sock *sk, int err)
|
|
}
|
|
|
|
/* Don't race with userspace socket closes such as tcp_close. */
|
|
- lock_sock(sk);
|
|
+ lock_sock(meta_sk);
|
|
|
|
if (sk->sk_state == TCP_LISTEN) {
|
|
tcp_set_state(sk, TCP_CLOSE);
|
|
@@ -3911,7 +4200,7 @@ int tcp_abort(struct sock *sk, int err)
|
|
|
|
/* Don't race with BH socket closes such as inet_csk_listen_stop. */
|
|
local_bh_disable();
|
|
- bh_lock_sock(sk);
|
|
+ bh_lock_sock(meta_sk);
|
|
|
|
if (!sock_flag(sk, SOCK_DEAD)) {
|
|
sk->sk_err = err;
|
|
@@ -3919,14 +4208,14 @@ int tcp_abort(struct sock *sk, int err)
|
|
smp_wmb();
|
|
sk->sk_error_report(sk);
|
|
if (tcp_need_reset(sk->sk_state))
|
|
- tcp_send_active_reset(sk, GFP_ATOMIC);
|
|
+ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
|
|
tcp_done(sk);
|
|
}
|
|
|
|
- bh_unlock_sock(sk);
|
|
+ bh_unlock_sock(meta_sk);
|
|
local_bh_enable();
|
|
tcp_write_queue_purge(sk);
|
|
- release_sock(sk);
|
|
+ release_sock(meta_sk);
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL_GPL(tcp_abort);
|
|
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
|
|
index 6d5600889dcf..247c1168b6a5 100644
|
|
--- a/net/ipv4/tcp_cong.c
|
|
+++ b/net/ipv4/tcp_cong.c
|
|
@@ -337,13 +337,19 @@ int tcp_set_allowed_congestion_control(char *val)
|
|
return ret;
|
|
}
|
|
|
|
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
|
|
+ bool reinit, bool cap_net_admin)
|
|
+{
|
|
+ return tcp_sk(sk)->ops->set_cong_ctrl(sk, name, load, reinit, cap_net_admin);
|
|
+}
|
|
+
|
|
/* Change congestion control for socket. If load is false, then it is the
|
|
* responsibility of the caller to call tcp_init_congestion_control or
|
|
* tcp_reinit_congestion_control (if the current congestion control was
|
|
* already initialized.
|
|
*/
|
|
-int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
|
|
- bool reinit, bool cap_net_admin)
|
|
+int __tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
|
|
+ bool reinit, bool cap_net_admin)
|
|
{
|
|
struct inet_connection_sock *icsk = inet_csk(sk);
|
|
const struct tcp_congestion_ops *ca;
|
|
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
|
|
index 549506162dde..e5a530e0b1c5 100644
|
|
--- a/net/ipv4/tcp_diag.c
|
|
+++ b/net/ipv4/tcp_diag.c
|
|
@@ -31,7 +31,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
|
|
r->idiag_wqueue = READ_ONCE(tp->write_seq) - tp->snd_una;
|
|
}
|
|
if (info)
|
|
- tcp_get_info(sk, info);
|
|
+ tcp_get_info(sk, info, false);
|
|
}
|
|
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
|
|
index a5ec77a5ad6f..f9fb4a268b9b 100644
|
|
--- a/net/ipv4/tcp_fastopen.c
|
|
+++ b/net/ipv4/tcp_fastopen.c
|
|
@@ -9,6 +9,7 @@
|
|
#include <linux/rculist.h>
|
|
#include <net/inetpeer.h>
|
|
#include <net/tcp.h>
|
|
+#include <net/mptcp.h>
|
|
|
|
void tcp_fastopen_init_key_once(struct net *net)
|
|
{
|
|
@@ -136,8 +137,6 @@ static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req,
|
|
const siphash_key_t *key,
|
|
struct tcp_fastopen_cookie *foc)
|
|
{
|
|
- BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64));
|
|
-
|
|
if (req->rsk_ops->family == AF_INET) {
|
|
const struct iphdr *iph = ip_hdr(syn);
|
|
|
|
@@ -258,8 +257,9 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
|
|
{
|
|
struct tcp_sock *tp;
|
|
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
|
|
- struct sock *child;
|
|
+ struct sock *child, *meta_sk;
|
|
bool own_req;
|
|
+ int ret;
|
|
|
|
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
|
|
NULL, &own_req);
|
|
@@ -294,15 +294,27 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
|
|
|
|
refcount_set(&req->rsk_refcnt, 2);
|
|
|
|
- /* Now finish processing the fastopen child socket. */
|
|
- tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
|
|
-
|
|
tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
|
|
|
|
tcp_fastopen_add_skb(child, skb);
|
|
|
|
tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
|
|
tp->rcv_wup = tp->rcv_nxt;
|
|
+ tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd;
|
|
+
|
|
+ meta_sk = child;
|
|
+ ret = mptcp_check_req_fastopen(meta_sk, req);
|
|
+ if (ret < 0)
|
|
+ return NULL;
|
|
+
|
|
+ if (ret == 0) {
|
|
+ child = tcp_sk(meta_sk)->mpcb->master_sk;
|
|
+ tp = tcp_sk(child);
|
|
+ }
|
|
+
|
|
+ /* Now finish processing the fastopen child socket. */
|
|
+ tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
|
|
+
|
|
/* tcp_conn_request() is sending the SYNACK,
|
|
* and queues the child into listener accept queue.
|
|
*/
|
|
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
|
|
index c0fcfa296468..dae2ce9656b8 100644
|
|
--- a/net/ipv4/tcp_input.c
|
|
+++ b/net/ipv4/tcp_input.c
|
|
@@ -76,35 +76,15 @@
|
|
#include <linux/ipsec.h>
|
|
#include <asm/unaligned.h>
|
|
#include <linux/errqueue.h>
|
|
+#include <net/mptcp.h>
|
|
+#include <net/mptcp_v4.h>
|
|
+#include <net/mptcp_v6.h>
|
|
#include <trace/events/tcp.h>
|
|
#include <linux/jump_label_ratelimit.h>
|
|
#include <net/busy_poll.h>
|
|
|
|
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
|
|
|
|
-#define FLAG_DATA 0x01 /* Incoming frame contained data. */
|
|
-#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
|
|
-#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
|
|
-#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
|
|
-#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
|
|
-#define FLAG_DATA_SACKED 0x20 /* New SACK. */
|
|
-#define FLAG_ECE 0x40 /* ECE in this ACK */
|
|
-#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */
|
|
-#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
|
|
-#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
|
|
-#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
|
|
-#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
|
|
-#define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */
|
|
-#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
|
|
-#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
|
|
-#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */
|
|
-#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */
|
|
-
|
|
-#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
|
|
-#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
|
|
-#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
|
|
-#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
|
|
-
|
|
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
|
|
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
|
|
|
|
@@ -349,8 +329,12 @@ static void tcp_sndbuf_expand(struct sock *sk)
|
|
per_mss = roundup_pow_of_two(per_mss) +
|
|
SKB_DATA_ALIGN(sizeof(struct sk_buff));
|
|
|
|
- nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
|
|
- nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
|
|
+ if (mptcp(tp)) {
|
|
+ nr_segs = mptcp_check_snd_buf(tp);
|
|
+ } else {
|
|
+ nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
|
|
+ nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
|
|
+ }
|
|
|
|
/* Fast Recovery (RFC 5681 3.2) :
|
|
* Cubic needs 1.7 factor, rounded to 2 to include
|
|
@@ -359,9 +343,17 @@ static void tcp_sndbuf_expand(struct sock *sk)
|
|
sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
|
|
sndmem *= nr_segs * per_mss;
|
|
|
|
- if (sk->sk_sndbuf < sndmem)
|
|
+ /* MPTCP: after this sndmem is the new contribution of the
|
|
+ * current subflow to the aggregated sndbuf */
|
|
+ if (sk->sk_sndbuf < sndmem) {
|
|
+ int old_sndbuf = sk->sk_sndbuf;
|
|
WRITE_ONCE(sk->sk_sndbuf,
|
|
min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]));
|
|
+ /* MPTCP: ok, the subflow sndbuf has grown, reflect
|
|
+ * this in the aggregate buffer. */
|
|
+ if (mptcp(tp) && old_sndbuf != sk->sk_sndbuf)
|
|
+ mptcp_update_sndbuf(tp);
|
|
+ }
|
|
}
|
|
|
|
/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
|
|
@@ -410,9 +402,14 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
|
|
static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
int room;
|
|
|
|
- room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;
|
|
+ if (is_meta_sk(sk))
|
|
+ return;
|
|
+
|
|
+ room = min_t(int, meta_tp->window_clamp, tcp_space(meta_sk)) - meta_tp->rcv_ssthresh;
|
|
|
|
/* Check #1 */
|
|
if (room > 0 && !tcp_under_memory_pressure(sk)) {
|
|
@@ -422,13 +419,13 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
|
|
* will fit to rcvbuf in future.
|
|
*/
|
|
if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
|
|
- incr = 2 * tp->advmss;
|
|
+ incr = 2 * meta_tp->advmss;
|
|
else
|
|
- incr = __tcp_grow_window(sk, skb);
|
|
+ incr = __tcp_grow_window(meta_sk, skb);
|
|
|
|
if (incr) {
|
|
incr = max_t(int, incr, 2 * skb->len);
|
|
- tp->rcv_ssthresh += min(room, incr);
|
|
+ meta_tp->rcv_ssthresh += min(room, incr);
|
|
inet_csk(sk)->icsk_ack.quick |= 1;
|
|
}
|
|
}
|
|
@@ -612,7 +609,10 @@ void tcp_rcv_space_adjust(struct sock *sk)
|
|
|
|
tcp_mstamp_refresh(tp);
|
|
time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
|
|
- if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
|
|
+ if (mptcp(tp)) {
|
|
+ if (mptcp_check_rtt(tp, time))
|
|
+ return;
|
|
+ } else if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
|
|
return;
|
|
|
|
/* Number of bytes copied to user in last RTT */
|
|
@@ -835,7 +835,7 @@ static void tcp_update_pacing_rate(struct sock *sk)
|
|
/* Calculate rto without backoff. This is the second half of Van Jacobson's
|
|
* routine referred to above.
|
|
*/
|
|
-static void tcp_set_rto(struct sock *sk)
|
|
+void tcp_set_rto(struct sock *sk)
|
|
{
|
|
const struct tcp_sock *tp = tcp_sk(sk);
|
|
/* Old crap is replaced with new one. 8)
|
|
@@ -1407,6 +1407,13 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
|
|
int len;
|
|
int in_sack;
|
|
|
|
+ /* For MPTCP we cannot shift skb-data and remove one skb from the
|
|
+ * send-queue, because this will make us lose the DSS-option (which
|
|
+ * is stored in TCP_SKB_CB(skb)->dss) of the skb we are removing.
|
|
+ */
|
|
+ if (mptcp(tp))
|
|
+ goto fallback;
|
|
+
|
|
/* Normally R but no L won't result in plain S */
|
|
if (!dup_sack &&
|
|
(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
|
|
@@ -2962,7 +2969,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
|
|
*/
|
|
tcp_update_rtt_min(sk, ca_rtt_us, flag);
|
|
tcp_rtt_estimator(sk, seq_rtt_us);
|
|
- tcp_set_rto(sk);
|
|
+ tp->ops->set_rto(sk);
|
|
|
|
/* RFC6298: only reset backoff on valid RTT measurement. */
|
|
inet_csk(sk)->icsk_backoff = 0;
|
|
@@ -3030,7 +3037,7 @@ static void tcp_set_xmit_timer(struct sock *sk)
|
|
}
|
|
|
|
/* If we get here, the whole TSO packet has not been acked. */
|
|
-static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
|
|
+u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
u32 packets_acked;
|
|
@@ -3050,8 +3057,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
|
|
return packets_acked;
|
|
}
|
|
|
|
-static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
|
|
- u32 prior_snd_una)
|
|
+void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, u32 prior_snd_una)
|
|
{
|
|
const struct skb_shared_info *shinfo;
|
|
|
|
@@ -3156,6 +3162,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
|
|
*/
|
|
if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
|
|
flag |= FLAG_DATA_ACKED;
|
|
+ if (mptcp(tp) && mptcp_is_data_seq(skb))
|
|
+ flag |= MPTCP_FLAG_DATA_ACKED;
|
|
} else {
|
|
flag |= FLAG_SYN_ACKED;
|
|
tp->retrans_stamp = 0;
|
|
@@ -3276,7 +3284,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
|
|
return flag;
|
|
}
|
|
|
|
-static void tcp_ack_probe(struct sock *sk)
|
|
+void tcp_ack_probe(struct sock *sk)
|
|
{
|
|
struct inet_connection_sock *icsk = inet_csk(sk);
|
|
struct sk_buff *head = tcp_send_head(sk);
|
|
@@ -3350,9 +3358,8 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
|
|
/* Check that window update is acceptable.
|
|
* The function assumes that snd_una<=ack<=snd_next.
|
|
*/
|
|
-static inline bool tcp_may_update_window(const struct tcp_sock *tp,
|
|
- const u32 ack, const u32 ack_seq,
|
|
- const u32 nwin)
|
|
+bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
|
|
+ const u32 ack_seq, const u32 nwin)
|
|
{
|
|
return after(ack, tp->snd_una) ||
|
|
after(ack_seq, tp->snd_wl1) ||
|
|
@@ -3590,7 +3597,7 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
|
|
}
|
|
|
|
/* This routine deals with incoming acks, but not outgoing ones. */
|
|
-static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
|
|
+static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
|
|
{
|
|
struct inet_connection_sock *icsk = inet_csk(sk);
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
@@ -3713,6 +3720,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
|
|
|
|
tcp_rack_update_reo_wnd(sk, &rs);
|
|
|
|
+ if (mptcp(tp)) {
|
|
+ if (mptcp_fallback_infinite(sk, flag)) {
|
|
+ pr_debug("%s resetting flow\n", __func__);
|
|
+ mptcp_send_reset(sk);
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ mptcp_clean_rtx_infinite(skb, sk);
|
|
+ }
|
|
+
|
|
if (tp->tlp_high_seq)
|
|
tcp_process_tlp_ack(sk, ack, flag);
|
|
|
|
@@ -3856,8 +3873,10 @@ static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
|
|
*/
|
|
void tcp_parse_options(const struct net *net,
|
|
const struct sk_buff *skb,
|
|
- struct tcp_options_received *opt_rx, int estab,
|
|
- struct tcp_fastopen_cookie *foc)
|
|
+ struct tcp_options_received *opt_rx,
|
|
+ struct mptcp_options_received *mopt,
|
|
+ int estab, struct tcp_fastopen_cookie *foc,
|
|
+ struct tcp_sock *tp)
|
|
{
|
|
const unsigned char *ptr;
|
|
const struct tcphdr *th = tcp_hdr(skb);
|
|
@@ -3943,6 +3962,10 @@ void tcp_parse_options(const struct net *net,
|
|
*/
|
|
break;
|
|
#endif
|
|
+ case TCPOPT_MPTCP:
|
|
+ mptcp_parse_options(ptr - 2, opsize, mopt, skb, tp);
|
|
+ break;
|
|
+
|
|
case TCPOPT_FASTOPEN:
|
|
tcp_parse_fastopen_option(
|
|
opsize - TCPOLEN_FASTOPEN_BASE,
|
|
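
tcp_parse_options() walks the TCP option block as a kind/length/value list and, with the hunk above, hands every option of kind 30 (the IANA-assigned kind behind TCPOPT_MPTCP) to mptcp_parse_options() together with the per-subflow mopt state. The standalone sketch below shows only that walking pattern; it is an illustration, not the kernel parser:

    #include <stdio.h>

    static void parse_tcp_options(const unsigned char *ptr, int length)
    {
        while (length > 0) {
            int opcode = *ptr++;
            int opsize;

            if (opcode == 0)            /* TCPOPT_EOL */
                return;
            if (opcode == 1) {          /* TCPOPT_NOP */
                length--;
                continue;
            }
            if (length < 2)
                return;
            opsize = *ptr++;
            if (opsize < 2 || opsize > length)
                return;                 /* malformed option list, stop */

            if (opcode == 30)           /* IANA option kind used by MPTCP */
                printf("MPTCP option with %d payload bytes\n", opsize - 2);

            ptr += opsize - 2;
            length -= opsize;
        }
    }

    int main(void)
    {
        /* Two NOPs followed by a fabricated 4-byte option of kind 30. */
        const unsigned char opts[] = { 1, 1, 30, 4, 0x00, 0x00 };

        parse_tcp_options(opts, sizeof(opts));
        return 0;
    }
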
@@ -4010,7 +4033,9 @@ static bool tcp_fast_parse_options(const struct net *net,
|
|
return true;
|
|
}
|
|
|
|
- tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
|
|
+ tcp_parse_options(net, skb, &tp->rx_opt,
|
|
+ mptcp(tp) ? &tp->mptcp->rx_opt : NULL, 1, NULL, tp);
|
|
+
|
|
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
|
|
tp->rx_opt.rcv_tsecr -= tp->tsoffset;
|
|
|
|
@@ -4120,7 +4145,7 @@ static inline bool tcp_paws_discard(const struct sock *sk,
|
|
static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
|
|
{
|
|
return !before(end_seq, tp->rcv_wup) &&
|
|
- !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
|
|
+ !after(seq, tp->rcv_nxt + tcp_receive_window_no_shrink(tp));
|
|
}
|
|
|
|
/* When we get a reset we do this. */
|
|
@@ -4169,6 +4194,11 @@ void tcp_fin(struct sock *sk)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
|
|
+ if (is_meta_sk(sk)) {
|
|
+ mptcp_fin(sk);
|
|
+ return;
|
|
+ }
|
|
+
|
|
inet_csk_schedule_ack(sk);
|
|
|
|
sk->sk_shutdown |= RCV_SHUTDOWN;
|
|
@@ -4179,6 +4209,10 @@ void tcp_fin(struct sock *sk)
|
|
case TCP_ESTABLISHED:
|
|
/* Move to CLOSE_WAIT */
|
|
tcp_set_state(sk, TCP_CLOSE_WAIT);
|
|
+
|
|
+ if (mptcp(tp))
|
|
+ mptcp_sub_close_passive(sk);
|
|
+
|
|
inet_csk_enter_pingpong_mode(sk);
|
|
break;
|
|
|
|
@@ -4201,9 +4235,16 @@ void tcp_fin(struct sock *sk)
|
|
tcp_set_state(sk, TCP_CLOSING);
|
|
break;
|
|
case TCP_FIN_WAIT2:
|
|
+ if (mptcp(tp)) {
|
|
+ /* The socket will get closed by mptcp_data_ready.
|
|
+ * We first have to process all data-sequences.
|
|
+ */
|
|
+ tp->close_it = 1;
|
|
+ break;
|
|
+ }
|
|
/* Received a FIN -- send ACK and enter TIME_WAIT. */
|
|
tcp_send_ack(sk);
|
|
- tcp_time_wait(sk, TCP_TIME_WAIT, 0);
|
|
+ tp->ops->time_wait(sk, TCP_TIME_WAIT, 0);
|
|
break;
|
|
default:
|
|
/* Only TCP_LISTEN and TCP_CLOSE are left, in these
|
|
@@ -4225,6 +4266,10 @@ void tcp_fin(struct sock *sk)
|
|
if (!sock_flag(sk, SOCK_DEAD)) {
|
|
sk->sk_state_change(sk);
|
|
|
|
+ /* Don't wake up MPTCP-subflows */
|
|
+ if (mptcp(tp))
|
|
+ return;
|
|
+
|
|
/* Do not send POLL_HUP for half duplex close. */
|
|
if (sk->sk_shutdown == SHUTDOWN_MASK ||
|
|
sk->sk_state == TCP_CLOSE)
|
|
@@ -4439,6 +4484,9 @@ static bool tcp_try_coalesce(struct sock *sk,
|
|
|
|
*fragstolen = false;
|
|
|
|
+ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk))
|
|
+ return false;
|
|
+
|
|
/* Its possible this segment overlaps with prior segment in queue */
|
|
if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
|
|
return false;
|
|
@@ -4493,7 +4541,7 @@ static void tcp_drop(struct sock *sk, struct sk_buff *skb)
|
|
/* This one checks to see if we can put data from the
|
|
* out_of_order queue into the receive_queue.
|
|
*/
|
|
-static void tcp_ofo_queue(struct sock *sk)
|
|
+void tcp_ofo_queue(struct sock *sk)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
__u32 dsack_high = tp->rcv_nxt;
|
|
@@ -4516,7 +4564,14 @@ static void tcp_ofo_queue(struct sock *sk)
|
|
p = rb_next(p);
|
|
rb_erase(&skb->rbnode, &tp->out_of_order_queue);
|
|
|
|
- if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
|
|
+ /* In case of MPTCP, the segment may be empty if it's a
|
|
+ * non-data DATA_FIN. (see beginning of tcp_data_queue)
|
|
+ *
|
|
+ * But this only holds true for subflows, not for the
|
|
+ * meta-socket.
|
|
+ */
|
|
+ if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt) &&
|
|
+ (is_meta_sk(sk) || !mptcp(tp) || TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq))) {
|
|
tcp_drop(sk, skb);
|
|
continue;
|
|
}
|
|
@@ -4546,6 +4601,9 @@ static void tcp_ofo_queue(struct sock *sk)
|
|
static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
|
|
unsigned int size)
|
|
{
|
|
+ if (mptcp(tcp_sk(sk)))
|
|
+ sk = mptcp_meta_sk(sk);
|
|
+
|
|
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
|
|
!sk_rmem_schedule(sk, skb, size)) {
|
|
|
|
@@ -4560,7 +4618,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
|
|
return 0;
|
|
}
|
|
|
|
-static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
|
|
+void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
struct rb_node **p, *parent;
|
|
@@ -4632,7 +4690,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
|
|
continue;
|
|
}
|
|
if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
|
|
- if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
|
|
+ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq) &&
|
|
+ (is_meta_sk(sk) || !mptcp(tp) || end_seq != seq)) {
|
|
/* All the bits are present. Drop. */
|
|
NET_INC_STATS(sock_net(sk),
|
|
LINUX_MIB_TCPOFOMERGE);
|
|
@@ -4679,6 +4738,11 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
|
|
end_seq);
|
|
break;
|
|
}
|
|
+ /* MPTCP allows non-data data-fin to be in the ofo-queue */
|
|
+ if (mptcp(tp) && !is_meta_sk(sk) && TCP_SKB_CB(skb1)->seq == TCP_SKB_CB(skb1)->end_seq) {
|
|
+ skb = skb1;
|
|
+ continue;
|
|
+ }
|
|
rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
|
|
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
|
|
TCP_SKB_CB(skb1)->end_seq);
|
|
@@ -4690,7 +4754,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
|
|
tp->ooo_last_skb = skb;
|
|
|
|
add_sack:
|
|
- if (tcp_is_sack(tp))
|
|
+ if (tcp_is_sack(tp) && seq != end_seq)
|
|
tcp_sack_new_ofo_skb(sk, seq, end_seq);
|
|
end:
|
|
if (skb) {
|
|
@@ -4704,8 +4768,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
|
|
}
|
|
}
|
|
|
|
-static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
|
|
- bool *fragstolen)
|
|
+int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
|
|
+ bool *fragstolen)
|
|
{
|
|
int eaten;
|
|
struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
|
|
@@ -4780,7 +4844,8 @@ void tcp_data_ready(struct sock *sk)
|
|
|
|
if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) &&
|
|
!sock_flag(sk, SOCK_DONE) &&
|
|
- tcp_receive_window(tp) > inet_csk(sk)->icsk_ack.rcv_mss)
|
|
+ tcp_receive_window_now(tp) > inet_csk(sk)->icsk_ack.rcv_mss &&
|
|
+ !mptcp(tp))
|
|
return;
|
|
|
|
sk->sk_data_ready(sk);
|
|
@@ -4792,10 +4857,14 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
|
|
bool fragstolen;
|
|
int eaten;
|
|
|
|
- if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
|
|
+ /* If no data is present, but a data_fin is in the options, we still
|
|
+ * have to call mptcp_queue_skb later on. */
|
|
+ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
|
|
+ !(mptcp(tp) && mptcp_is_data_fin(skb))) {
|
|
__kfree_skb(skb);
|
|
return;
|
|
}
|
|
+
|
|
skb_dst_drop(skb);
|
|
__skb_pull(skb, tcp_hdr(skb)->doff * 4);
|
|
|
|
@@ -4806,7 +4875,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
|
|
* Out of sequence packets to the out_of_order_queue.
|
|
*/
|
|
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
|
|
- if (tcp_receive_window(tp) == 0) {
|
|
+ if (tcp_receive_window_no_shrink(tp) == 0) {
|
|
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
|
|
goto out_of_window;
|
|
}
|
|
@@ -4822,7 +4891,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
|
|
}
|
|
|
|
eaten = tcp_queue_rcv(sk, skb, &fragstolen);
|
|
- if (skb->len)
|
|
+ if (skb->len || mptcp_is_data_fin(skb))
|
|
tcp_event_data_recv(sk, skb);
|
|
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
|
|
tcp_fin(sk);
|
|
@@ -4844,7 +4913,11 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
|
|
|
|
if (eaten > 0)
|
|
kfree_skb_partial(skb, fragstolen);
|
|
- if (!sock_flag(sk, SOCK_DEAD))
|
|
+ if (!sock_flag(sk, SOCK_DEAD) || mptcp(tp))
|
|
+ /* MPTCP: we always have to call data_ready, because
|
|
+ * we may be about to receive a data-fin, which still
|
|
+ * must get queued.
|
|
+ */
|
|
tcp_data_ready(sk);
|
|
return;
|
|
}
|
|
@@ -4864,7 +4937,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
|
|
}
|
|
|
|
/* Out of window. F.e. zero window probe. */
|
|
- if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
|
|
+ if (!before(TCP_SKB_CB(skb)->seq,
|
|
+ tp->rcv_nxt + tcp_receive_window_no_shrink(tp)))
|
|
goto out_of_window;
|
|
|
|
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
|
|
@@ -4874,7 +4948,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
|
|
/* If window is closed, drop tail of packet. But after
|
|
* remembering D-SACK for its head made in previous line.
|
|
*/
|
|
- if (!tcp_receive_window(tp)) {
|
|
+ if (!tcp_receive_window_no_shrink(tp)) {
|
|
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
|
|
goto out_of_window;
|
|
}
|
|
@@ -5187,7 +5261,7 @@ static int tcp_prune_queue(struct sock *sk)
|
|
return -1;
|
|
}
|
|
|
|
-static bool tcp_should_expand_sndbuf(const struct sock *sk)
|
|
+bool tcp_should_expand_sndbuf(const struct sock *sk)
|
|
{
|
|
const struct tcp_sock *tp = tcp_sk(sk);
|
|
|
|
@@ -5222,7 +5296,7 @@ static void tcp_new_space(struct sock *sk)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
|
|
- if (tcp_should_expand_sndbuf(sk)) {
|
|
+ if (tp->ops->should_expand_sndbuf(sk)) {
|
|
tcp_sndbuf_expand(sk);
|
|
tp->snd_cwnd_stamp = tcp_jiffies32;
|
|
}
|
|
@@ -5236,10 +5310,11 @@ static void tcp_check_space(struct sock *sk)
|
|
sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
|
|
/* pairs with tcp_poll() */
|
|
smp_mb();
|
|
- if (sk->sk_socket &&
|
|
- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
|
|
+ if (mptcp(tcp_sk(sk)) ||
|
|
+ (sk->sk_socket &&
|
|
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))) {
|
|
tcp_new_space(sk);
|
|
- if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
|
|
+ if (sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
|
|
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
|
|
}
|
|
}
|
|
@@ -5258,6 +5333,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
unsigned long rtt, delay;
|
|
+ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
|
|
/* More than one full frame received... */
|
|
if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
|
|
@@ -5266,8 +5343,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
|
|
* If application uses SO_RCVLOWAT, we want send ack now if
|
|
* we have not received enough bytes to satisfy the condition.
|
|
*/
|
|
- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
|
|
- __tcp_select_window(sk) >= tp->rcv_wnd)) ||
|
|
+ (meta_tp->rcv_nxt - meta_tp->copied_seq < meta_sk->sk_rcvlowat ||
|
|
+ tp->ops->__select_window(sk) >= tp->rcv_wnd)) ||
|
|
/* We ACK each frame or... */
|
|
tcp_in_quickack_mode(sk) ||
|
|
/* Protocol state mandates a one-time immediate ACK */
|
|
@@ -5402,6 +5479,10 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
|
|
+ /* MPTCP urgent data is not yet supported */
|
|
+ if (mptcp(tp))
|
|
+ return;
|
|
+
|
|
/* Check if we get a new urgent pointer - normally not. */
|
|
if (th->urg)
|
|
tcp_check_urg(sk, th);
|
|
@@ -5544,9 +5625,15 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
|
|
goto discard;
|
|
}
|
|
|
|
+ /* If valid: post process the received MPTCP options. */
|
|
+ if (mptcp(tp) && mptcp_handle_options(sk, th, skb))
|
|
+ goto discard;
|
|
+
|
|
return true;
|
|
|
|
discard:
|
|
+ if (mptcp(tp))
|
|
+ mptcp_reset_mopt(tp);
|
|
tcp_drop(sk, skb);
|
|
return false;
|
|
}
|
|
@@ -5603,6 +5690,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
|
|
|
|
tp->rx_opt.saw_tstamp = 0;
|
|
|
|
+ /* MPTCP: force slowpath. */
|
|
+ if (mptcp(tp))
|
|
+ goto slow_path;
|
|
+
|
|
/* pred_flags is 0xS?10 << 16 + snd_wnd
|
|
* if header_prediction is to be made
|
|
* 'S' will always be tp->tcp_header_len >> 2
|
|
@@ -5777,7 +5868,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
|
|
|
|
tcp_call_bpf(sk, bpf_op, 0, NULL);
|
|
tcp_init_congestion_control(sk);
|
|
- tcp_init_buffer_space(sk);
|
|
+ tcp_sk(sk)->ops->init_buffer_space(sk);
|
|
}
|
|
|
|
void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
|
|
@@ -5814,17 +5905,24 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
|
|
struct tcp_fastopen_cookie *cookie)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
- struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
|
|
+ struct sk_buff *data = NULL;
|
|
u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
|
|
bool syn_drop = false;
|
|
|
|
+ if (tp->syn_data) {
|
|
+ if (mptcp(tp))
|
|
+ data = tcp_write_queue_head(mptcp_meta_sk(sk));
|
|
+ else
|
|
+ data = tcp_rtx_queue_head(sk);
|
|
+ }
|
|
+
|
|
if (mss == tp->rx_opt.user_mss) {
|
|
struct tcp_options_received opt;
|
|
|
|
/* Get original SYNACK MSS value if user MSS sets mss_clamp */
|
|
tcp_clear_options(&opt);
|
|
opt.user_mss = opt.mss_clamp = 0;
|
|
- tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
|
|
+ tcp_parse_options(sock_net(sk), synack, &opt, NULL, 0, NULL, NULL);
|
|
mss = opt.mss_clamp;
|
|
}
|
|
|
|
@@ -5848,7 +5946,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
|
|
|
|
tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
|
|
|
|
- if (data) { /* Retransmit unacked data in SYN */
|
|
+ /* In the MPTCP case, we do not rely on "retransmit", but instead on
|
|
+ * "transmit", because if fastopen data is not acked, the retransmission
|
|
+ * becomes the first MPTCP data (see mptcp_rcv_synsent_fastopen).
|
|
+ */
|
|
+ if (data && !mptcp(tp)) { /* Retransmit unacked data in SYN */
|
|
skb_rbtree_walk_from(data) {
|
|
if (__tcp_retransmit_skb(sk, data, 1))
|
|
break;
|
|
@@ -5903,9 +6005,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
struct tcp_fastopen_cookie foc = { .len = -1 };
|
|
int saved_clamp = tp->rx_opt.mss_clamp;
|
|
+ struct mptcp_options_received mopt;
|
|
bool fastopen_fail;
|
|
|
|
- tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
|
|
+ mptcp_init_mp_opt(&mopt);
|
|
+
|
|
+ tcp_parse_options(sock_net(sk), skb, &tp->rx_opt,
|
|
+ mptcp(tp) ? &tp->mptcp->rx_opt : &mopt, 0, &foc, tp);
|
|
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
|
|
tp->rx_opt.rcv_tsecr -= tp->tsoffset;
|
|
|
|
@@ -5966,11 +6072,41 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
|
|
tcp_try_undo_spurious_syn(sk);
|
|
tcp_ack(sk, skb, FLAG_SLOWPATH);
|
|
|
|
+ if (tp->request_mptcp || mptcp(tp)) {
|
|
+ int ret;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ local_bh_disable();
|
|
+ ret = mptcp_rcv_synsent_state_process(sk, &sk,
|
|
+ skb, &mopt);
|
|
+ local_bh_enable();
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ /* May have changed if MPTCP was negotiated */
|
|
+ tp = tcp_sk(sk);
|
|
+ icsk = inet_csk(sk);
|
|
+
|
|
+ if (ret == 1)
|
|
+ goto reset_and_undo;
|
|
+ if (ret == 2)
|
|
+ goto discard;
|
|
+ }
|
|
+
|
|
+ if (mptcp(tp) && !is_master_tp(tp)) {
|
|
+ /* Timer for repeating the ACK until an answer
|
|
+ * arrives. Used only when establishing an additional
|
|
+ * subflow inside of an MPTCP connection.
|
|
+ */
|
|
+ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
|
|
+ jiffies + icsk->icsk_rto);
|
|
+ }
|
|
+
|
|
/* Ok.. it's good. Set up sequence numbers and
|
|
* move to established.
|
|
*/
|
|
WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
|
|
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
|
|
+ tcp_update_rcv_right_edge(tp);
|
|
|
|
/* RFC1323: The window in SYN & SYN/ACK segments is
|
|
* never scaled.
|
|
@@ -5992,6 +6128,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
|
|
tp->tcp_header_len = sizeof(struct tcphdr);
|
|
}
|
|
|
|
+ if (mptcp(tp)) {
|
|
+ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
|
|
+ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
|
|
+ }
|
|
+
|
|
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
|
|
tcp_initialize_rcv_mss(sk);
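The two adjustments added in this hunk reserve room for a DSS mapping in every segment: the TCP header length grows by MPTCP_SUB_LEN_DSM_ALIGN and the advertised MSS shrinks by the same amount, just as the timestamp option does elsewhere in this file. The following back-of-the-envelope C sketch only illustrates the resulting payload budget; the 20-byte figure is an assumed stand-in for MPTCP_SUB_LEN_DSM_ALIGN, not a value taken from a kernel header.

#include <stdio.h>

#define TCPOLEN_TSTAMP_ALIGNED	12
#define DSM_ALIGN		20	/* illustrative stand-in for MPTCP_SUB_LEN_DSM_ALIGN */

int main(void)
{
	int mtu = 1500, ip_hdr = 20, tcp_hdr = 20;
	int advmss = mtu - ip_hdr - tcp_hdr;		/* 1460 */

	advmss -= TCPOLEN_TSTAMP_ALIGNED;		/* timestamps enabled */
	advmss -= DSM_ALIGN;				/* room kept for the DSS mapping */

	printf("payload per MPTCP segment: %d bytes\n", advmss);
	return 0;
}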
|
|
|
|
@@ -6015,9 +6156,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
|
|
}
|
|
if (fastopen_fail)
|
|
return -1;
|
|
- if (sk->sk_write_pending ||
|
|
+ /* With MPTCP we cannot send data on the third ack due to the
|
|
+ * lack of option-space to combine with an MP_CAPABLE.
|
|
+ */
|
|
+ if (!mptcp(tp) && (sk->sk_write_pending ||
|
|
icsk->icsk_accept_queue.rskq_defer_accept ||
|
|
- inet_csk_in_pingpong_mode(sk)) {
|
|
+ inet_csk_in_pingpong_mode(sk))) {
|
|
/* Save one ACK. Data will be ready after
|
|
* several ticks, if write_pending is set.
|
|
*
|
|
@@ -6056,6 +6200,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
|
|
tcp_paws_reject(&tp->rx_opt, 0))
|
|
goto discard_and_undo;
|
|
|
|
+ /* TODO - check this here for MPTCP */
|
|
if (th->syn) {
|
|
/* We see SYN without ACK. It is attempt of
|
|
* simultaneous connect with crossed SYNs.
|
|
@@ -6072,9 +6217,15 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
|
|
tp->tcp_header_len = sizeof(struct tcphdr);
|
|
}
|
|
|
|
+ if (mptcp(tp)) {
|
|
+ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
|
|
+ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
|
|
+ }
|
|
+
|
|
WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
|
|
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
|
|
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
|
|
+ tcp_update_rcv_right_edge(tp);
|
|
|
|
/* RFC1323: The window in SYN & SYN/ACK segments is
|
|
* never scaled.
|
|
@@ -6162,6 +6313,7 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
|
|
*/
|
|
|
|
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
|
|
+ __releases(&sk->sk_lock.slock)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
struct inet_connection_sock *icsk = inet_csk(sk);
|
|
@@ -6204,6 +6356,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
|
|
tp->rx_opt.saw_tstamp = 0;
|
|
tcp_mstamp_refresh(tp);
|
|
queued = tcp_rcv_synsent_state_process(sk, skb, th);
|
|
+ if (is_meta_sk(sk)) {
|
|
+ sk = tcp_sk(sk)->mpcb->master_sk;
|
|
+ tp = tcp_sk(sk);
|
|
+
|
|
+ /* Need to call it here, because it will announce new
|
|
+ * addresses, which can only be done after the third ack
|
|
+ * of the 3-way handshake.
|
|
+ */
|
|
+ mptcp_update_metasocket(tp->meta_sk);
|
|
+ }
|
|
if (queued >= 0)
|
|
return queued;
|
|
|
|
@@ -6276,6 +6438,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
|
|
|
|
if (tp->rx_opt.tstamp_ok)
|
|
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
|
|
+ if (mptcp(tp))
|
|
+ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
|
|
|
|
if (!inet_csk(sk)->icsk_ca_ops->cong_control)
|
|
tcp_update_pacing_rate(sk);
|
|
@@ -6285,6 +6449,30 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
|
|
|
|
tcp_initialize_rcv_mss(sk);
|
|
tcp_fast_path_on(tp);
|
|
+
|
|
+ /* Send an ACK when establishing a new MPTCP subflow, i.e.
|
|
+ * using an MP_JOIN subtype.
|
|
+ */
|
|
+ if (mptcp(tp)) {
|
|
+ if (is_master_tp(tp)) {
|
|
+ mptcp_update_metasocket(mptcp_meta_sk(sk));
|
|
+ } else {
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+
|
|
+ tcp_send_ack(sk);
|
|
+
|
|
+ /* Update RTO as it might be worse/better */
|
|
+ mptcp_set_rto(sk);
|
|
+
|
|
+ /* If the new RTO would fire earlier, pull it in! */
|
|
+ if (tcp_sk(meta_sk)->packets_out &&
|
|
+ icsk->icsk_timeout > inet_csk(meta_sk)->icsk_rto + jiffies) {
|
|
+ tcp_rearm_rto(meta_sk);
|
|
+ }
|
|
+
|
|
+ mptcp_push_pending_frames(mptcp_meta_sk(sk));
|
|
+ }
|
|
+ }
|
|
break;
|
|
|
|
case TCP_FIN_WAIT1: {
|
|
@@ -6325,7 +6513,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
|
|
tmo = tcp_fin_time(sk);
|
|
if (tmo > TCP_TIMEWAIT_LEN) {
|
|
inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
|
|
- } else if (th->fin || sock_owned_by_user(sk)) {
|
|
+ } else if (th->fin || mptcp_is_data_fin(skb) ||
|
|
+ sock_owned_by_user(sk)) {
|
|
/* Bad case. We could lose such FIN otherwise.
|
|
* It is not a big problem, but it looks confusing
|
|
* and not so rare event. We still can lose it now,
|
|
@@ -6334,7 +6523,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
|
|
*/
|
|
inet_csk_reset_keepalive_timer(sk, tmo);
|
|
} else {
|
|
- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
|
|
+ tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo);
|
|
goto discard;
|
|
}
|
|
break;
|
|
@@ -6342,7 +6531,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
|
|
|
|
case TCP_CLOSING:
|
|
if (tp->snd_una == tp->write_seq) {
|
|
- tcp_time_wait(sk, TCP_TIME_WAIT, 0);
|
|
+ tp->ops->time_wait(sk, TCP_TIME_WAIT, 0);
|
|
goto discard;
|
|
}
|
|
break;
|
|
@@ -6354,6 +6543,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
|
|
goto discard;
|
|
}
|
|
break;
|
|
+ case TCP_CLOSE:
|
|
+ if (tp->mp_killed)
|
|
+ goto discard;
|
|
}
|
|
|
|
/* step 6: check the URG bit */
|
|
@@ -6375,7 +6567,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
|
|
*/
|
|
if (sk->sk_shutdown & RCV_SHUTDOWN) {
|
|
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
|
|
- after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
|
|
+ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
|
|
+ !mptcp(tp)) {
|
|
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
|
|
tcp_reset(sk);
|
|
return 1;
|
|
@@ -6477,6 +6670,8 @@ static void tcp_openreq_init(struct request_sock *req,
|
|
ireq->wscale_ok = rx_opt->wscale_ok;
|
|
ireq->acked = 0;
|
|
ireq->ecn_ok = 0;
|
|
+ ireq->mptcp_rqsk = 0;
|
|
+ ireq->saw_mpc = 0;
|
|
ireq->ir_rmt_port = tcp_hdr(skb)->source;
|
|
ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
|
|
ireq->ir_mark = inet_request_mark(sk, skb);
|
|
@@ -6602,12 +6797,17 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
|
|
/* TW buckets are converted to open requests without
|
|
* limitations, they conserve resources and peer is
|
|
* evidently real one.
|
|
+ *
|
|
+ * MPTCP: new subflows cannot be established in a stateless manner.
|
|
*/
|
|
- if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
|
|
+ if (((!is_meta_sk(sk) && net->ipv4.sysctl_tcp_syncookies == 2) ||
|
|
inet_csk_reqsk_queue_is_full(sk)) && !isn) {
|
|
want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
|
|
if (!want_cookie)
|
|
goto drop;
|
|
+
|
|
+ if (is_meta_sk(sk))
|
|
+ goto drop;
|
|
}
|
|
|
|
if (sk_acceptq_is_full(sk)) {
|
|
@@ -6625,8 +6825,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
|
|
tcp_clear_options(&tmp_opt);
|
|
tmp_opt.mss_clamp = af_ops->mss_clamp;
|
|
tmp_opt.user_mss = tp->rx_opt.user_mss;
|
|
- tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
|
|
- want_cookie ? NULL : &foc);
|
|
+ tcp_parse_options(sock_net(sk), skb, &tmp_opt, NULL, 0,
|
|
+ want_cookie ? NULL : &foc, NULL);
|
|
|
|
if (want_cookie && !tmp_opt.saw_tstamp)
|
|
tcp_clear_options(&tmp_opt);
|
|
@@ -6641,7 +6841,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
|
|
/* Note: tcp_v6_init_req() might override ir_iif for link locals */
|
|
inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
|
|
|
|
- af_ops->init_req(req, sk, skb);
|
|
+ if (af_ops->init_req(req, sk, skb, want_cookie))
|
|
+ goto drop_and_free;
|
|
|
|
if (security_inet_conn_request(sk, skb, req))
|
|
goto drop_and_free;
|
|
@@ -6677,7 +6878,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
|
|
tcp_ecn_create_request(req, skb, sk, dst);
|
|
|
|
if (want_cookie) {
|
|
- isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
|
|
+ isn = cookie_init_sequence(af_ops, req, sk, skb, &req->mss);
|
|
req->cookie_ts = tmp_opt.tstamp_ok;
|
|
if (!tmp_opt.tstamp_ok)
|
|
inet_rsk(req)->ecn_ok = 0;
|
|
@@ -6692,17 +6893,25 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
|
|
fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
|
|
}
|
|
if (fastopen_sk) {
|
|
+ struct sock *meta_sk = fastopen_sk;
|
|
+
|
|
+ if (mptcp(tcp_sk(fastopen_sk)))
|
|
+ meta_sk = mptcp_meta_sk(fastopen_sk);
|
|
af_ops->send_synack(fastopen_sk, dst, &fl, req,
|
|
&foc, TCP_SYNACK_FASTOPEN);
|
|
/* Add the child socket directly into the accept queue */
|
|
- if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
|
|
+ if (!inet_csk_reqsk_queue_add(sk, req, meta_sk)) {
|
|
reqsk_fastopen_remove(fastopen_sk, req, false);
|
|
bh_unlock_sock(fastopen_sk);
|
|
+ if (meta_sk != fastopen_sk)
|
|
+ bh_unlock_sock(meta_sk);
|
|
sock_put(fastopen_sk);
|
|
goto drop_and_free;
|
|
}
|
|
sk->sk_data_ready(sk);
|
|
bh_unlock_sock(fastopen_sk);
|
|
+ if (meta_sk != fastopen_sk)
|
|
+ bh_unlock_sock(meta_sk);
|
|
sock_put(fastopen_sk);
|
|
} else {
|
|
tcp_rsk(req)->tfo_listener = false;
|
|
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
|
|
index 2ce85e52aea7..2e76c006ad16 100644
|
|
--- a/net/ipv4/tcp_ipv4.c
|
|
+++ b/net/ipv4/tcp_ipv4.c
|
|
@@ -62,6 +62,8 @@
|
|
#include <net/icmp.h>
|
|
#include <net/inet_hashtables.h>
|
|
#include <net/tcp.h>
|
|
+#include <net/mptcp.h>
|
|
+#include <net/mptcp_v4.h>
|
|
#include <net/transp_v6.h>
|
|
#include <net/ipv6.h>
|
|
#include <net/inet_common.h>
|
|
@@ -209,6 +211,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
|
|
struct ip_options_rcu *inet_opt;
|
|
struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
|
|
|
|
+ mptcp_init_connect(sk);
|
|
+
|
|
if (addr_len < sizeof(struct sockaddr_in))
|
|
return -EINVAL;
|
|
|
|
@@ -430,7 +434,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
|
|
struct inet_sock *inet;
|
|
const int type = icmp_hdr(icmp_skb)->type;
|
|
const int code = icmp_hdr(icmp_skb)->code;
|
|
- struct sock *sk;
|
|
+ struct sock *sk, *meta_sk;
|
|
struct sk_buff *skb;
|
|
struct request_sock *fastopen;
|
|
u32 seq, snd_una;
|
|
@@ -460,13 +464,19 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
|
|
return 0;
|
|
}
|
|
|
|
- bh_lock_sock(sk);
|
|
+ tp = tcp_sk(sk);
|
|
+ if (mptcp(tp))
|
|
+ meta_sk = mptcp_meta_sk(sk);
|
|
+ else
|
|
+ meta_sk = sk;
|
|
+
|
|
+ bh_lock_sock(meta_sk);
|
|
/* If too many ICMPs get dropped on busy
|
|
* servers this needs to be solved differently.
|
|
* We do take care of PMTU discovery (RFC1191) special case :
|
|
* we can receive locally generated ICMP messages while socket is held.
|
|
*/
|
|
- if (sock_owned_by_user(sk)) {
|
|
+ if (sock_owned_by_user(meta_sk)) {
|
|
if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
|
|
__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
|
|
}
|
|
@@ -479,7 +489,6 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
|
|
}
|
|
|
|
icsk = inet_csk(sk);
|
|
- tp = tcp_sk(sk);
|
|
/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
|
|
fastopen = rcu_dereference(tp->fastopen_rsk);
|
|
snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
|
|
@@ -513,11 +522,13 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
|
|
goto out;
|
|
|
|
tp->mtu_info = info;
|
|
- if (!sock_owned_by_user(sk)) {
|
|
+ if (!sock_owned_by_user(meta_sk)) {
|
|
tcp_v4_mtu_reduced(sk);
|
|
} else {
|
|
if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
|
|
sock_hold(sk);
|
|
+ if (mptcp(tp))
|
|
+ mptcp_tsq_flags(sk);
|
|
}
|
|
goto out;
|
|
}
|
|
@@ -531,7 +542,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
|
|
!icsk->icsk_backoff || fastopen)
|
|
break;
|
|
|
|
- if (sock_owned_by_user(sk))
|
|
+ if (sock_owned_by_user(meta_sk))
|
|
break;
|
|
|
|
skb = tcp_rtx_queue_head(sk);
|
|
@@ -555,7 +566,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
|
|
} else {
|
|
/* RTO revert clocked out retransmission.
|
|
* Will retransmit now */
|
|
- tcp_retransmit_timer(sk);
|
|
+ tcp_sk(sk)->ops->retransmit_timer(sk);
|
|
}
|
|
|
|
break;
|
|
@@ -575,7 +586,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
|
|
if (fastopen && !fastopen->sk)
|
|
break;
|
|
|
|
- if (!sock_owned_by_user(sk)) {
|
|
+ if (!sock_owned_by_user(meta_sk)) {
|
|
sk->sk_err = err;
|
|
|
|
sk->sk_error_report(sk);
|
|
@@ -604,7 +615,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
|
|
*/
|
|
|
|
inet = inet_sk(sk);
|
|
- if (!sock_owned_by_user(sk) && inet->recverr) {
|
|
+ if (!sock_owned_by_user(meta_sk) && inet->recverr) {
|
|
sk->sk_err = err;
|
|
sk->sk_error_report(sk);
|
|
} else { /* Only an error on timeout */
|
|
@@ -612,7 +623,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
|
|
}
|
|
|
|
out:
|
|
- bh_unlock_sock(sk);
|
|
+ bh_unlock_sock(meta_sk);
|
|
sock_put(sk);
|
|
return 0;
|
|
}
|
|
@@ -648,7 +659,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
|
|
* Exception: precedence violation. We do not implement it in any case.
|
|
*/
|
|
|
|
-static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
|
|
+void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
const struct tcphdr *th = tcp_hdr(skb);
|
|
struct {
|
|
@@ -800,10 +811,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
|
|
*/
|
|
|
|
static void tcp_v4_send_ack(const struct sock *sk,
|
|
- struct sk_buff *skb, u32 seq, u32 ack,
|
|
+ struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
|
|
u32 win, u32 tsval, u32 tsecr, int oif,
|
|
struct tcp_md5sig_key *key,
|
|
- int reply_flags, u8 tos)
|
|
+ int reply_flags, u8 tos, int mptcp)
|
|
{
|
|
const struct tcphdr *th = tcp_hdr(skb);
|
|
struct {
|
|
@@ -812,6 +823,10 @@ static void tcp_v4_send_ack(const struct sock *sk,
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
+ (TCPOLEN_MD5SIG_ALIGNED >> 2)
|
|
#endif
|
|
+#ifdef CONFIG_MPTCP
|
|
+ + ((MPTCP_SUB_LEN_DSS >> 2) +
|
|
+ (MPTCP_SUB_LEN_ACK >> 2))
|
|
+#endif
|
|
];
|
|
} rep;
|
|
struct net *net = sock_net(sk);
|
|
@@ -858,6 +873,21 @@ static void tcp_v4_send_ack(const struct sock *sk,
|
|
ip_hdr(skb)->daddr, &rep.th);
|
|
}
|
|
#endif
|
|
+#ifdef CONFIG_MPTCP
|
|
+ if (mptcp) {
|
|
+ int offset = (tsecr) ? 3 : 0;
|
|
+ /* Construction of 32-bit data_ack */
|
|
+ rep.opt[offset++] = htonl((TCPOPT_MPTCP << 24) |
|
|
+ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
|
|
+ (0x20 << 8) |
|
|
+ (0x01));
|
|
+ rep.opt[offset] = htonl(data_ack);
|
|
+
|
|
+ arg.iov[0].iov_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
|
|
+ rep.th.doff = arg.iov[0].iov_len / 4;
|
|
+ }
|
|
+#endif /* CONFIG_MPTCP */
|
|
+
|
|
arg.flags = reply_flags;
|
|
arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
|
|
ip_hdr(skb)->saddr, /* XXX */
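The hunk above open-codes a bare DSS option carrying only a 32-bit DATA_ACK into rep.opt[]. As a reading aid, here is a small user-space sketch of the same word packing; TCPOPT_MPTCP (30) is the MPTCP option kind, and the 4-byte values assumed for MPTCP_SUB_LEN_DSS and MPTCP_SUB_LEN_ACK mirror the arithmetic in the patch rather than being copied from a kernel header.

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

#define TCPOPT_MPTCP       30	/* TCP option kind used for MPTCP */
#define MPTCP_SUB_LEN_DSS   4	/* assumed: DSS sub-option header bytes */
#define MPTCP_SUB_LEN_ACK   4	/* assumed: 32-bit data_ack bytes */

int main(void)
{
	uint32_t data_ack = 0x12345678;
	uint32_t opt[2];

	/* kind | total length | subtype DSS (0x2 in the high nibble) | flag 0x01: data_ack present */
	opt[0] = htonl((TCPOPT_MPTCP << 24) |
		       ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
		       (0x20 << 8) |
		       (0x01));
	opt[1] = htonl(data_ack);

	printf("kind=%u len=%u subtype=0x%x flags=0x%x data_ack=0x%x\n",
	       (unsigned)(ntohl(opt[0]) >> 24),
	       (unsigned)((ntohl(opt[0]) >> 16) & 0xff),
	       (unsigned)((ntohl(opt[0]) >> 12) & 0xf),
	       (unsigned)(ntohl(opt[0]) & 0xff),
	       (unsigned)ntohl(opt[1]));
	return 0;
}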
|
|
@@ -889,28 +919,36 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
struct inet_timewait_sock *tw = inet_twsk(sk);
|
|
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
|
|
+ u32 data_ack = 0;
|
|
+ int mptcp = 0;
|
|
+
|
|
+ if (tcptw->mptcp_tw) {
|
|
+ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
|
|
+ mptcp = 1;
|
|
+ }
|
|
|
|
tcp_v4_send_ack(sk, skb,
|
|
- tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
|
|
+ tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, data_ack,
|
|
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
|
|
tcp_time_stamp_raw() + tcptw->tw_ts_offset,
|
|
tcptw->tw_ts_recent,
|
|
tw->tw_bound_dev_if,
|
|
tcp_twsk_md5_key(tcptw),
|
|
tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
|
|
- tw->tw_tos
|
|
+ tw->tw_tos, mptcp
|
|
);
|
|
|
|
inet_twsk_put(tw);
|
|
}
|
|
|
|
-static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
|
|
- struct request_sock *req)
|
|
+void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
|
|
+ struct request_sock *req)
|
|
{
|
|
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
|
|
* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
|
|
*/
|
|
- u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
|
|
+ u32 seq = (sk->sk_state == TCP_LISTEN || is_meta_sk(sk)) ?
|
|
+ tcp_rsk(req)->snt_isn + 1 :
|
|
tcp_sk(sk)->snd_nxt;
|
|
|
|
/* RFC 7323 2.3
|
|
@@ -919,7 +957,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
|
|
* Rcv.Wind.Shift bits:
|
|
*/
|
|
tcp_v4_send_ack(sk, skb, seq,
|
|
- tcp_rsk(req)->rcv_nxt,
|
|
+ tcp_rsk(req)->rcv_nxt, 0,
|
|
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
|
|
tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
|
|
req->ts_recent,
|
|
@@ -927,7 +965,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
|
|
tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
|
|
AF_INET),
|
|
inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
|
|
- ip_hdr(skb)->tos);
|
|
+ ip_hdr(skb)->tos, 0);
|
|
}
|
|
|
|
/*
|
|
@@ -935,11 +973,11 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
|
|
* This still operates on a request_sock only, not on a big
|
|
* socket.
|
|
*/
|
|
-static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
|
|
- struct flowi *fl,
|
|
- struct request_sock *req,
|
|
- struct tcp_fastopen_cookie *foc,
|
|
- enum tcp_synack_type synack_type)
|
|
+int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
|
|
+ struct flowi *fl,
|
|
+ struct request_sock *req,
|
|
+ struct tcp_fastopen_cookie *foc,
|
|
+ enum tcp_synack_type synack_type)
|
|
{
|
|
const struct inet_request_sock *ireq = inet_rsk(req);
|
|
struct flowi4 fl4;
|
|
@@ -969,7 +1007,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
|
|
/*
|
|
* IPv4 request_sock destructor.
|
|
*/
|
|
-static void tcp_v4_reqsk_destructor(struct request_sock *req)
|
|
+void tcp_v4_reqsk_destructor(struct request_sock *req)
|
|
{
|
|
kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
|
|
}
|
|
@@ -1354,9 +1392,10 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
|
|
return false;
|
|
}
|
|
|
|
-static void tcp_v4_init_req(struct request_sock *req,
|
|
- const struct sock *sk_listener,
|
|
- struct sk_buff *skb)
|
|
+static int tcp_v4_init_req(struct request_sock *req,
|
|
+ const struct sock *sk_listener,
|
|
+ struct sk_buff *skb,
|
|
+ bool want_cookie)
|
|
{
|
|
struct inet_request_sock *ireq = inet_rsk(req);
|
|
struct net *net = sock_net(sk_listener);
|
|
@@ -1364,6 +1403,8 @@ static void tcp_v4_init_req(struct request_sock *req,
|
|
sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
|
|
sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
|
|
RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
|
|
+
|
|
+ return 0;
|
|
}
|
|
|
|
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
|
|
@@ -1383,7 +1424,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = {
|
|
.syn_ack_timeout = tcp_syn_ack_timeout,
|
|
};
|
|
|
|
-static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
|
|
+const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
|
|
.mss_clamp = TCP_MSS_DEFAULT,
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
.req_md5_lookup = tcp_v4_md5_lookup,
|
|
@@ -1520,7 +1561,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
|
|
}
|
|
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
|
|
|
|
-static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
|
|
+struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
#ifdef CONFIG_SYN_COOKIES
|
|
const struct tcphdr *th = tcp_hdr(skb);
|
|
@@ -1558,6 +1599,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
struct sock *rsk;
|
|
|
|
+ if (is_meta_sk(sk))
|
|
+ return mptcp_v4_do_rcv(sk, skb);
|
|
+
|
|
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
|
|
struct dst_entry *dst = sk->sk_rx_dst;
|
|
|
|
@@ -1803,6 +1847,10 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
|
|
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
|
|
skb->len - th->doff * 4);
|
|
TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
|
|
+#ifdef CONFIG_MPTCP
|
|
+ TCP_SKB_CB(skb)->mptcp_flags = 0;
|
|
+ TCP_SKB_CB(skb)->dss_off = 0;
|
|
+#endif
|
|
TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
|
|
TCP_SKB_CB(skb)->tcp_tw_isn = 0;
|
|
TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
|
|
@@ -1822,8 +1870,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
|
|
int sdif = inet_sdif(skb);
|
|
const struct iphdr *iph;
|
|
const struct tcphdr *th;
|
|
+ struct sock *sk, *meta_sk = NULL;
|
|
bool refcounted;
|
|
- struct sock *sk;
|
|
int ret;
|
|
|
|
if (skb->pkt_type != PACKET_HOST)
|
|
@@ -1877,7 +1925,11 @@ int tcp_v4_rcv(struct sk_buff *skb)
|
|
reqsk_put(req);
|
|
goto csum_error;
|
|
}
|
|
- if (unlikely(sk->sk_state != TCP_LISTEN)) {
|
|
+ if (unlikely(sk->sk_state != TCP_LISTEN && !is_meta_sk(sk))) {
|
|
+ inet_csk_reqsk_queue_drop_and_put(sk, req);
|
|
+ goto lookup;
|
|
+ }
|
|
+ if (unlikely(is_meta_sk(sk) && !mptcp_can_new_subflow(sk))) {
|
|
inet_csk_reqsk_queue_drop_and_put(sk, req);
|
|
goto lookup;
|
|
}
|
|
@@ -1886,6 +1938,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
|
|
*/
|
|
sock_hold(sk);
|
|
refcounted = true;
|
|
+
|
|
nsk = NULL;
|
|
if (!tcp_filter(sk, skb)) {
|
|
th = (const struct tcphdr *)skb->data;
|
|
@@ -1946,19 +1999,28 @@ int tcp_v4_rcv(struct sk_buff *skb)
|
|
|
|
sk_incoming_cpu_update(sk);
|
|
|
|
- bh_lock_sock_nested(sk);
|
|
+ if (mptcp(tcp_sk(sk))) {
|
|
+ meta_sk = mptcp_meta_sk(sk);
|
|
+
|
|
+ bh_lock_sock_nested(meta_sk);
|
|
+ if (sock_owned_by_user(meta_sk))
|
|
+ mptcp_prepare_for_backlog(sk, skb);
|
|
+ } else {
|
|
+ meta_sk = sk;
|
|
+ bh_lock_sock_nested(sk);
|
|
+ }
|
|
tcp_segs_in(tcp_sk(sk), skb);
|
|
ret = 0;
|
|
- if (!sock_owned_by_user(sk)) {
|
|
+ if (!sock_owned_by_user(meta_sk)) {
|
|
skb_to_free = sk->sk_rx_skb_cache;
|
|
sk->sk_rx_skb_cache = NULL;
|
|
ret = tcp_v4_do_rcv(sk, skb);
|
|
} else {
|
|
- if (tcp_add_backlog(sk, skb))
|
|
+ if (tcp_add_backlog(meta_sk, skb))
|
|
goto discard_and_relse;
|
|
skb_to_free = NULL;
|
|
}
|
|
- bh_unlock_sock(sk);
|
|
+ bh_unlock_sock(meta_sk);
|
|
if (skb_to_free)
|
|
__kfree_skb(skb_to_free);
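For an MPTCP subflow the meta-socket is the serialization point in the receive path: the hunk above locks the meta-socket and, when it is owned by user context, backlogs the segment there instead of on the subflow. The rough user-space model below shows only that decision; the struct and helpers are stand-ins, not the kernel types.

#include <stdbool.h>
#include <stdio.h>

struct sock {
	bool owned_by_user;	/* models sock_owned_by_user() */
	struct sock *meta;	/* NULL for plain TCP, meta-socket for a subflow */
};

static void rcv_one(struct sock *sk, const char *pkt)
{
	struct sock *meta_sk = sk->meta ? sk->meta : sk;	/* mptcp_meta_sk() */

	/* bh_lock_sock_nested(meta_sk); */
	if (!meta_sk->owned_by_user)
		printf("process %s on the subflow right away\n", pkt);
	else
		printf("backlog %s on the meta-socket\n", pkt);
	/* bh_unlock_sock(meta_sk); */
}

int main(void)
{
	struct sock meta = { .owned_by_user = true, .meta = NULL };
	struct sock sub  = { .owned_by_user = false, .meta = &meta };

	rcv_one(&sub, "an incoming DSS segment");
	return 0;
}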
|
|
|
|
@@ -1974,6 +2036,19 @@ int tcp_v4_rcv(struct sk_buff *skb)
|
|
|
|
tcp_v4_fill_cb(skb, iph, th);
|
|
|
|
+#ifdef CONFIG_MPTCP
|
|
+ if (!sk && th->syn && !th->ack) {
|
|
+ int ret = mptcp_lookup_join(skb, NULL);
|
|
+
|
|
+ if (ret < 0) {
|
|
+ tcp_v4_send_reset(NULL, skb);
|
|
+ goto discard_it;
|
|
+ } else if (ret > 0) {
|
|
+ return 0;
|
|
+ }
|
|
+ }
|
|
+#endif
|
|
+
|
|
if (tcp_checksum_complete(skb)) {
|
|
csum_error:
|
|
__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
|
|
@@ -2022,6 +2097,18 @@ int tcp_v4_rcv(struct sk_buff *skb)
|
|
refcounted = false;
|
|
goto process;
|
|
}
|
|
+#ifdef CONFIG_MPTCP
|
|
+ if (th->syn && !th->ack) {
|
|
+ int ret = mptcp_lookup_join(skb, inet_twsk(sk));
|
|
+
|
|
+ if (ret < 0) {
|
|
+ tcp_v4_send_reset(NULL, skb);
|
|
+ goto discard_it;
|
|
+ } else if (ret > 0) {
|
|
+ return 0;
|
|
+ }
|
|
+ }
|
|
+#endif
|
|
}
|
|
/* to ACK */
|
|
/* fall through */
|
|
@@ -2091,7 +2178,12 @@ static int tcp_v4_init_sock(struct sock *sk)
|
|
|
|
tcp_init_sock(sk);
|
|
|
|
- icsk->icsk_af_ops = &ipv4_specific;
|
|
+#ifdef CONFIG_MPTCP
|
|
+ if (sock_flag(sk, SOCK_MPTCP))
|
|
+ icsk->icsk_af_ops = &mptcp_v4_specific;
|
|
+ else
|
|
+#endif
|
|
+ icsk->icsk_af_ops = &ipv4_specific;
|
|
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
|
|
@@ -2110,6 +2202,11 @@ void tcp_v4_destroy_sock(struct sock *sk)
|
|
|
|
tcp_cleanup_congestion_control(sk);
|
|
|
|
+ if (mptcp(tp))
|
|
+ mptcp_destroy_sock(sk);
|
|
+ if (tp->inside_tk_table)
|
|
+ mptcp_hash_remove_bh(tp);
|
|
+
|
|
tcp_cleanup_ulp(sk);
|
|
|
|
/* Cleanup up the write buffer. */
|
|
@@ -2615,6 +2712,11 @@ struct proto tcp_prot = {
|
|
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
|
|
.max_header = MAX_TCP_HEADER,
|
|
.obj_size = sizeof(struct tcp_sock),
|
|
+#ifdef CONFIG_MPTCP
|
|
+ .useroffset = offsetof(struct tcp_sock, mptcp_sched_name),
|
|
+ .usersize = sizeof_field(struct tcp_sock, mptcp_sched_name) +
|
|
+ sizeof_field(struct tcp_sock, mptcp_pm_name),
|
|
+#endif
|
|
.slab_flags = SLAB_TYPESAFE_BY_RCU,
|
|
.twsk_prot = &tcp_timewait_sock_ops,
|
|
.rsk_prot = &tcp_request_sock_ops,
|
|
@@ -2625,6 +2727,9 @@ struct proto tcp_prot = {
|
|
.compat_getsockopt = compat_tcp_getsockopt,
|
|
#endif
|
|
.diag_destroy = tcp_abort,
|
|
+#ifdef CONFIG_MPTCP
|
|
+ .clear_sk = mptcp_clear_sk,
|
|
+#endif
|
|
};
|
|
EXPORT_SYMBOL(tcp_prot);
|
|
|
|
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
|
|
index 194743bd3fc1..b35942faf7df 100644
|
|
--- a/net/ipv4/tcp_minisocks.c
|
|
+++ b/net/ipv4/tcp_minisocks.c
|
|
@@ -19,11 +19,13 @@
|
|
* Jorge Cwik, <jorge@laser.satlink.net>
|
|
*/
|
|
|
|
+#include <linux/kconfig.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/module.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/sysctl.h>
|
|
#include <linux/workqueue.h>
|
|
+#include <net/mptcp.h>
|
|
#include <linux/static_key.h>
|
|
#include <net/tcp.h>
|
|
#include <net/inet_common.h>
|
|
@@ -95,10 +97,14 @@ enum tcp_tw_status
|
|
struct tcp_options_received tmp_opt;
|
|
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
|
|
bool paws_reject = false;
|
|
+ struct mptcp_options_received mopt;
|
|
|
|
tmp_opt.saw_tstamp = 0;
|
|
- if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
|
|
- tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);
|
|
+ if (th->doff > (sizeof(*th) >> 2) &&
|
|
+ (tcptw->tw_ts_recent_stamp || tcptw->mptcp_tw)) {
|
|
+ mptcp_init_mp_opt(&mopt);
|
|
+
|
|
+ tcp_parse_options(twsk_net(tw), skb, &tmp_opt, &mopt, 0, NULL, NULL);
|
|
|
|
if (tmp_opt.saw_tstamp) {
|
|
if (tmp_opt.rcv_tsecr)
|
|
@@ -107,6 +113,11 @@ enum tcp_tw_status
|
|
tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
|
|
paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
|
|
}
|
|
+
|
|
+ if (unlikely(mopt.mp_fclose) && tcptw->mptcp_tw) {
|
|
+ if (mopt.mptcp_sender_key == tcptw->mptcp_tw->loc_key)
|
|
+ return TCP_TW_RST;
|
|
+ }
|
|
}
|
|
|
|
if (tw->tw_substate == TCP_FIN_WAIT2) {
|
|
@@ -130,6 +141,16 @@ enum tcp_tw_status
|
|
if (!th->ack ||
|
|
!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
|
|
TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
|
|
+ /* If mptcp_is_data_fin() returns true, we are sure that
|
|
+ * mopt has been initialized - otherwise it would not
|
|
+ * be a DATA_FIN.
|
|
+ */
|
|
+ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw &&
|
|
+ mptcp_is_data_fin(skb) &&
|
|
+ TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
|
|
+ mopt.data_seq + 1 == (u32)tcptw->mptcp_tw->rcv_nxt)
|
|
+ return TCP_TW_ACK;
|
|
+
|
|
inet_twsk_put(tw);
|
|
return TCP_TW_SUCCESS;
|
|
}
|
|
@@ -270,11 +291,25 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
|
|
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
|
|
tcptw->tw_rcv_nxt = tp->rcv_nxt;
|
|
tcptw->tw_snd_nxt = tp->snd_nxt;
|
|
- tcptw->tw_rcv_wnd = tcp_receive_window(tp);
|
|
+ /* no need to keep track of the right-most right edge
|
|
+ * when in time wait, can directly use the currently
|
|
+ * advertised window.
|
|
+ */
|
|
+ tcptw->tw_rcv_wnd = tcp_receive_window_now(tp);
|
|
tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
|
|
tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
|
|
tcptw->tw_ts_offset = tp->tsoffset;
|
|
tcptw->tw_last_oow_ack_time = 0;
|
|
+
|
|
+ if (mptcp(tp)) {
|
|
+ if (mptcp_init_tw_sock(sk, tcptw)) {
|
|
+ inet_twsk_free(tw);
|
|
+ goto exit;
|
|
+ }
|
|
+ } else {
|
|
+ tcptw->mptcp_tw = NULL;
|
|
+ }
|
|
+
|
|
tcptw->tw_tx_delay = tp->tcp_tx_delay;
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
if (tw->tw_family == PF_INET6) {
|
|
@@ -336,6 +371,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
|
|
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
|
|
}
|
|
|
|
+exit:
|
|
tcp_update_metrics(sk);
|
|
tcp_done(sk);
|
|
}
|
|
@@ -343,6 +379,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
|
|
|
|
void tcp_twsk_destructor(struct sock *sk)
|
|
{
|
|
+ struct tcp_timewait_sock *twsk = tcp_twsk(sk);
|
|
+
|
|
+ if (twsk->mptcp_tw)
|
|
+ mptcp_twsk_destructor(twsk);
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
if (static_branch_unlikely(&tcp_md5_needed)) {
|
|
struct tcp_timewait_sock *twsk = tcp_twsk(sk);
|
|
@@ -386,8 +426,9 @@ void tcp_openreq_init_rwin(struct request_sock *req,
|
|
full_space = rcv_wnd * mss;
|
|
|
|
/* tcp_full_space because it is guaranteed to be the first packet */
|
|
- tcp_select_initial_window(sk_listener, full_space,
|
|
- mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
|
|
+ tp->ops->select_initial_window(sk_listener, full_space,
|
|
+ mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) -
|
|
+ (ireq->saw_mpc ? MPTCP_SUB_LEN_DSM_ALIGN : 0),
|
|
&req->rsk_rcv_wnd,
|
|
&req->rsk_window_clamp,
|
|
ireq->wscale_ok,
|
|
@@ -487,6 +528,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
|
|
WRITE_ONCE(newtp->snd_nxt, seq);
|
|
newtp->snd_up = seq;
|
|
|
|
+ newtp->out_of_order_queue = RB_ROOT;
|
|
+ newsk->tcp_rtx_queue = RB_ROOT;
|
|
INIT_LIST_HEAD(&newtp->tsq_node);
|
|
INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
|
|
|
|
@@ -511,6 +554,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
|
|
newtp->window_clamp = req->rsk_window_clamp;
|
|
newtp->rcv_ssthresh = req->rsk_rcv_wnd;
|
|
newtp->rcv_wnd = req->rsk_rcv_wnd;
|
|
+ newtp->rcv_right_edge = newtp->rcv_wnd + newtp->rcv_wup;
|
|
newtp->rx_opt.wscale_ok = ireq->wscale_ok;
|
|
if (newtp->rx_opt.wscale_ok) {
|
|
newtp->rx_opt.snd_wscale = ireq->snd_wscale;
|
|
@@ -530,6 +574,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
|
|
newtp->rx_opt.ts_recent_stamp = 0;
|
|
newtp->tcp_header_len = sizeof(struct tcphdr);
|
|
}
|
|
+ if (ireq->saw_mpc)
|
|
+ newtp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
|
|
if (req->num_timeout) {
|
|
newtp->undo_marker = treq->snt_isn;
|
|
newtp->retrans_stamp = div_u64(treq->snt_synack,
|
|
@@ -547,6 +593,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
|
|
tcp_ecn_openreq_child(newtp, req);
|
|
newtp->fastopen_req = NULL;
|
|
RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
|
|
+ newtp->inside_tk_table = 0;
|
|
|
|
__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
|
|
|
|
@@ -570,15 +617,20 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
|
|
bool fastopen, bool *req_stolen)
|
|
{
|
|
struct tcp_options_received tmp_opt;
|
|
+ struct mptcp_options_received mopt;
|
|
struct sock *child;
|
|
const struct tcphdr *th = tcp_hdr(skb);
|
|
__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
|
|
bool paws_reject = false;
|
|
bool own_req;
|
|
+ bool meta_locked = false;
|
|
|
|
tmp_opt.saw_tstamp = 0;
|
|
+
|
|
+ mptcp_init_mp_opt(&mopt);
|
|
+
|
|
if (th->doff > (sizeof(struct tcphdr)>>2)) {
|
|
- tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
|
|
+ tcp_parse_options(sock_net(sk), skb, &tmp_opt, &mopt, 0, NULL, NULL);
|
|
|
|
if (tmp_opt.saw_tstamp) {
|
|
tmp_opt.ts_recent = req->ts_recent;
|
|
@@ -619,7 +671,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
|
|
*
|
|
* Reset timer after retransmitting SYNACK, similar to
|
|
* the idea of fast retransmit in recovery.
|
|
+ *
|
|
+ * Fall back to TCP if MP_CAPABLE is not set.
|
|
*/
|
|
+
|
|
+ if (inet_rsk(req)->saw_mpc && !mopt.saw_mpc)
|
|
+ inet_rsk(req)->saw_mpc = false;
|
|
+
|
|
+
|
|
if (!tcp_oow_rate_limited(sock_net(sk), skb,
|
|
LINUX_MIB_TCPACKSKIPPEDSYNRECV,
|
|
&tcp_rsk(req)->last_oow_ack_time) &&
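The check added in this hunk handles a retransmitted SYN that no longer carries MP_CAPABLE: the request silently falls back to plain TCP by clearing saw_mpc. A minimal sketch of that fallback decision, with plain booleans standing in for the request-socket and parsed-options state.

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	bool req_saw_mpc = true;	/* MP_CAPABLE seen in the original SYN */
	bool syn_saw_mpc = false;	/* option missing from the retransmitted SYN */

	if (req_saw_mpc && !syn_saw_mpc)
		req_saw_mpc = false;	/* continue the handshake as regular TCP */

	printf("MPTCP for this connection: %s\n", req_saw_mpc ? "yes" : "no");
	return 0;
}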
|
|
@@ -767,17 +826,40 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
|
|
* ESTABLISHED STATE. If it will be dropped after
|
|
* socket is created, wait for troubles.
|
|
*/
|
|
+ if (is_meta_sk(sk)) {
|
|
+ bh_lock_sock_nested(sk);
|
|
+ meta_locked = true;
|
|
+ }
|
|
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
|
|
req, &own_req);
|
|
if (!child)
|
|
goto listen_overflow;
|
|
|
|
+ if (own_req && !is_meta_sk(sk)) {
|
|
+ int ret = mptcp_check_req_master(sk, child, req, skb, &mopt, 1, 0);
|
|
+ if (ret < 0)
|
|
+ goto listen_overflow;
|
|
+
|
|
+ /* MPTCP-supported */
|
|
+ if (!ret)
|
|
+ return tcp_sk(child)->mpcb->master_sk;
|
|
+ } else if (own_req) {
|
|
+ return mptcp_check_req_child(sk, child, req, skb, &mopt);
|
|
+ }
|
|
+
|
|
+ if (meta_locked)
|
|
+ bh_unlock_sock(sk);
|
|
+
|
|
sock_rps_save_rxhash(child, skb);
|
|
tcp_synack_rtt_meas(child, req);
|
|
*req_stolen = !own_req;
|
|
+
|
|
return inet_csk_complete_hashdance(sk, child, req, own_req);
|
|
|
|
listen_overflow:
|
|
+ if (meta_locked)
|
|
+ bh_unlock_sock(sk);
|
|
+
|
|
if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) {
|
|
inet_rsk(req)->acked = 1;
|
|
return NULL;
|
|
@@ -823,12 +905,13 @@ int tcp_child_process(struct sock *parent, struct sock *child,
|
|
{
|
|
int ret = 0;
|
|
int state = child->sk_state;
|
|
+ struct sock *meta_sk = mptcp(tcp_sk(child)) ? mptcp_meta_sk(child) : child;
|
|
|
|
/* record NAPI ID of child */
|
|
sk_mark_napi_id(child, skb);
|
|
|
|
tcp_segs_in(tcp_sk(child), skb);
|
|
- if (!sock_owned_by_user(child)) {
|
|
+ if (!sock_owned_by_user(meta_sk)) {
|
|
ret = tcp_rcv_state_process(child, skb);
|
|
/* Wakeup parent, send SIGIO */
|
|
if (state == TCP_SYN_RECV && child->sk_state != state)
|
|
@@ -838,10 +921,14 @@ int tcp_child_process(struct sock *parent, struct sock *child,
|
|
* in main socket hash table and lock on listening
|
|
* socket does not protect us more.
|
|
*/
|
|
- __sk_add_backlog(child, skb);
|
|
+ if (mptcp(tcp_sk(child)))
|
|
+ mptcp_prepare_for_backlog(child, skb);
|
|
+ __sk_add_backlog(meta_sk, skb);
|
|
}
|
|
|
|
bh_unlock_sock(child);
|
|
+ if (mptcp(tcp_sk(child)))
|
|
+ bh_unlock_sock(meta_sk);
|
|
sock_put(child);
|
|
return ret;
|
|
}
|
|
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
|
|
index 638d7b49ad71..d246e537e686 100644
|
|
--- a/net/ipv4/tcp_output.c
|
|
+++ b/net/ipv4/tcp_output.c
|
|
@@ -37,6 +37,12 @@
|
|
|
|
#define pr_fmt(fmt) "TCP: " fmt
|
|
|
|
+#include <net/mptcp.h>
|
|
+#include <net/mptcp_v4.h>
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+#include <net/mptcp_v6.h>
|
|
+#endif
|
|
+#include <net/ipv6.h>
|
|
#include <net/tcp.h>
|
|
|
|
#include <linux/compiler.h>
|
|
@@ -57,11 +63,8 @@ void tcp_mstamp_refresh(struct tcp_sock *tp)
|
|
tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC);
|
|
}
|
|
|
|
-static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
|
|
- int push_one, gfp_t gfp);
|
|
-
|
|
/* Account for new data that has been sent to the network. */
|
|
-static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
|
|
+void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
struct inet_connection_sock *icsk = inet_csk(sk);
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
@@ -255,12 +258,16 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
|
|
* value can be stuffed directly into th->window for an outgoing
|
|
* frame.
|
|
*/
|
|
-static u16 tcp_select_window(struct sock *sk)
|
|
+u16 tcp_select_window(struct sock *sk)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
u32 old_win = tp->rcv_wnd;
|
|
- u32 cur_win = tcp_receive_window(tp);
|
|
- u32 new_win = __tcp_select_window(sk);
|
|
+ /* The window must never shrink at the meta-level. At the subflow we
|
|
+ * have to allow this. Otherwise we may announce a window too large
|
|
+ * for the current meta-level sk_rcvbuf.
|
|
+ */
|
|
+ u32 cur_win = tcp_receive_window_now(mptcp(tp) ? tcp_sk(mptcp_meta_sk(sk)) : tp);
|
|
+ u32 new_win = tp->ops->__select_window(sk);
|
|
|
|
/* Never shrink the offered window */
|
|
if (new_win < cur_win) {
|
|
@@ -276,8 +283,10 @@ static u16 tcp_select_window(struct sock *sk)
|
|
LINUX_MIB_TCPWANTZEROWINDOWADV);
|
|
new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
|
|
}
|
|
+
|
|
tp->rcv_wnd = new_win;
|
|
tp->rcv_wup = tp->rcv_nxt;
|
|
+ tcp_update_rcv_right_edge(tp);
|
|
|
|
/* Make sure we do not exceed the maximum possible
|
|
* scaled window.
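The comment in this hunk is the core of the change: the advertised window may shrink on a subflow, but never at the meta level. The sketch below is a compressed user-space restatement of that no-shrink rule; it ignores window clamping, zero-window probing and the SNMP counters, so treat it as an illustration only.

#include <stdint.h>
#include <stdio.h>

/* Round up to a multiple of the window-scale unit, like the kernel's ALIGN(). */
static uint32_t align_up(uint32_t x, uint32_t a)
{
	return (x + a - 1) & ~(a - 1);
}

/* Return the value that would be placed in th->window. */
static unsigned int select_window(uint32_t cur_win, uint32_t new_win, int rcv_wscale)
{
	if (new_win < cur_win)
		/* never offer less than what is already advertised */
		new_win = align_up(cur_win, 1u << rcv_wscale);
	return new_win >> rcv_wscale;
}

int main(void)
{
	/* a shrunken candidate (48000) is ignored in favour of the 64000
	 * bytes already advertised, expressed in 2^7-byte units
	 */
	printf("th->window = %u\n", select_window(64000, 48000, 7));
	return 0;
}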
|
|
@@ -388,7 +397,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
|
|
/* Constructs common control bits of non-data skb. If SYN/FIN is present,
|
|
* auto increment end seqno.
|
|
*/
|
|
-static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
|
|
+void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
|
|
{
|
|
skb->ip_summed = CHECKSUM_PARTIAL;
|
|
|
|
@@ -403,7 +412,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
|
|
TCP_SKB_CB(skb)->end_seq = seq;
|
|
}
|
|
|
|
-static inline bool tcp_urg_mode(const struct tcp_sock *tp)
|
|
+bool tcp_urg_mode(const struct tcp_sock *tp)
|
|
{
|
|
return tp->snd_una != tp->snd_up;
|
|
}
|
|
@@ -414,6 +423,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
|
|
#define OPTION_WSCALE (1 << 3)
|
|
#define OPTION_FAST_OPEN_COOKIE (1 << 8)
|
|
#define OPTION_SMC (1 << 9)
|
|
+/* Before adding here - take a look at OPTION_MPTCP in include/net/mptcp.h */
|
|
|
|
static void smc_options_write(__be32 *ptr, u16 *options)
|
|
{
|
|
@@ -430,17 +440,6 @@ static void smc_options_write(__be32 *ptr, u16 *options)
|
|
#endif
|
|
}
|
|
|
|
-struct tcp_out_options {
|
|
- u16 options; /* bit field of OPTION_* */
|
|
- u16 mss; /* 0 to disable */
|
|
- u8 ws; /* window scale, 0 to disable */
|
|
- u8 num_sack_blocks; /* number of SACK blocks to include */
|
|
- u8 hash_size; /* bytes in hash_location */
|
|
- __u8 *hash_location; /* temporary pointer, overloaded */
|
|
- __u32 tsval, tsecr; /* need to include OPTION_TS */
|
|
- struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
|
|
-};
|
|
-
|
|
/* Write previously computed TCP options to the packet.
|
|
*
|
|
* Beware: Something in the Internet is very sensitive to the ordering of
|
|
@@ -455,7 +454,7 @@ struct tcp_out_options {
|
|
* (but it may well be that other scenarios fail similarly).
|
|
*/
|
|
static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
|
|
- struct tcp_out_options *opts)
|
|
+ struct tcp_out_options *opts, struct sk_buff *skb)
|
|
{
|
|
u16 options = opts->options; /* mungable copy */
|
|
|
|
@@ -549,6 +548,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
|
|
}
|
|
|
|
smc_options_write(ptr, &options);
|
|
+
|
|
+ if (unlikely(OPTION_MPTCP & opts->options))
|
|
+ mptcp_options_write(ptr, tp, opts, skb);
|
|
}
|
|
|
|
static void smc_set_option(const struct tcp_sock *tp,
|
|
@@ -635,6 +637,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
|
|
if (unlikely(!(OPTION_TS & opts->options)))
|
|
remaining -= TCPOLEN_SACKPERM_ALIGNED;
|
|
}
|
|
+ if (tp->request_mptcp || mptcp(tp))
|
|
+ mptcp_syn_options(sk, opts, &remaining);
|
|
|
|
if (fastopen && fastopen->cookie.len >= 0) {
|
|
u32 need = fastopen->cookie.len;
|
|
@@ -718,6 +722,9 @@ static unsigned int tcp_synack_options(const struct sock *sk,
|
|
|
|
smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
|
|
|
|
+ if (ireq->saw_mpc)
|
|
+ mptcp_synack_options(req, opts, &remaining);
|
|
+
|
|
return MAX_TCP_OPTION_SPACE - remaining;
|
|
}
|
|
|
|
@@ -752,14 +759,19 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
|
|
opts->tsecr = tp->rx_opt.ts_recent;
|
|
size += TCPOLEN_TSTAMP_ALIGNED;
|
|
}
|
|
+ if (mptcp(tp))
|
|
+ mptcp_established_options(sk, skb, opts, &size);
|
|
|
|
eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
|
|
if (unlikely(eff_sacks)) {
|
|
- const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
|
|
- opts->num_sack_blocks =
|
|
- min_t(unsigned int, eff_sacks,
|
|
- (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
|
|
- TCPOLEN_SACK_PERBLOCK);
|
|
+ const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
|
|
+ if (remaining < TCPOLEN_SACK_BASE_ALIGNED)
|
|
+ opts->num_sack_blocks = 0;
|
|
+ else
|
|
+ opts->num_sack_blocks =
|
|
+ min_t(unsigned int, eff_sacks,
|
|
+ (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
|
|
+ TCPOLEN_SACK_PERBLOCK);
|
|
if (likely(opts->num_sack_blocks))
|
|
size += TCPOLEN_SACK_BASE_ALIGNED +
|
|
opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
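Because MPTCP options now compete for the 40-byte TCP option space, "remaining" can legitimately drop below TCPOLEN_SACK_BASE_ALIGNED, and the added check prevents the unsigned subtraction from wrapping. A small sketch with illustrative sizes shows the effect; the 20-byte DSS figure is an assumption.

#include <stdio.h>

#define MAX_TCP_OPTION_SPACE		40
#define TCPOLEN_SACK_BASE_ALIGNED	 4
#define TCPOLEN_SACK_PERBLOCK		 8

static unsigned int sack_blocks(unsigned int size, unsigned int eff_sacks)
{
	unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
	unsigned int n;

	/* the new guard: without it, remaining - 4 underflows once
	 * other options use more than 36 bytes
	 */
	if (remaining < TCPOLEN_SACK_BASE_ALIGNED)
		return 0;
	n = (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK;
	return n < eff_sacks ? n : eff_sacks;
}

int main(void)
{
	/* 12 bytes of timestamps plus an assumed 20-byte DSS mapping */
	printf("%u sack blocks with MPTCP, %u without\n",
	       sack_blocks(12 + 20, 4), sack_blocks(12, 4));
	return 0;
}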
|
|
@@ -802,19 +814,31 @@ static void tcp_tsq_write(struct sock *sk)
|
|
tcp_xmit_retransmit_queue(sk);
|
|
}
|
|
|
|
- tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
|
|
- 0, GFP_ATOMIC);
|
|
+ tcp_sk(sk)->ops->write_xmit(sk, tcp_current_mss(sk),
|
|
+ tcp_sk(sk)->nonagle, 0, GFP_ATOMIC);
|
|
}
|
|
}
|
|
|
|
static void tcp_tsq_handler(struct sock *sk)
|
|
{
|
|
- bh_lock_sock(sk);
|
|
- if (!sock_owned_by_user(sk))
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
|
|
+
|
|
+ bh_lock_sock(meta_sk);
|
|
+ if (!sock_owned_by_user(meta_sk)) {
|
|
tcp_tsq_write(sk);
|
|
- else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
|
|
- sock_hold(sk);
|
|
- bh_unlock_sock(sk);
|
|
+
|
|
+ if (mptcp(tp))
|
|
+ tcp_tsq_write(meta_sk);
|
|
+ } else {
|
|
+ if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
|
|
+ sock_hold(sk);
|
|
+
|
|
+ if ((mptcp(tp)) && (sk->sk_state != TCP_CLOSE))
|
|
+ mptcp_tsq_flags(sk);
|
|
+ }
|
|
+
|
|
+ bh_unlock_sock(meta_sk);
|
|
}
|
|
/*
|
|
* One tasklet per cpu tries to send more skbs.
|
|
@@ -851,7 +875,9 @@ static void tcp_tasklet_func(unsigned long data)
|
|
#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
|
|
TCPF_WRITE_TIMER_DEFERRED | \
|
|
TCPF_DELACK_TIMER_DEFERRED | \
|
|
- TCPF_MTU_REDUCED_DEFERRED)
|
|
+ TCPF_MTU_REDUCED_DEFERRED | \
|
|
+ TCPF_PATH_MANAGER_DEFERRED |\
|
|
+ TCPF_SUB_DEFERRED)
|
|
/**
|
|
* tcp_release_cb - tcp release_sock() callback
|
|
* @sk: socket
|
|
@@ -874,6 +900,9 @@ void tcp_release_cb(struct sock *sk)
|
|
if (flags & TCPF_TSQ_DEFERRED) {
|
|
tcp_tsq_write(sk);
|
|
__sock_put(sk);
|
|
+
|
|
+ if (mptcp(tcp_sk(sk)))
|
|
+ tcp_tsq_write(mptcp_meta_sk(sk));
|
|
}
|
|
/* Here begins the tricky part :
|
|
* We are called from release_sock() with :
|
|
@@ -898,6 +927,13 @@ void tcp_release_cb(struct sock *sk)
|
|
inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
|
|
__sock_put(sk);
|
|
}
|
|
+ if (flags & TCPF_PATH_MANAGER_DEFERRED) {
|
|
+ if (tcp_sk(sk)->mpcb->pm_ops->release_sock)
|
|
+ tcp_sk(sk)->mpcb->pm_ops->release_sock(sk);
|
|
+ __sock_put(sk);
|
|
+ }
|
|
+ if (flags & TCPF_SUB_DEFERRED)
|
|
+ mptcp_tsq_sub_deferred(sk);
|
|
}
|
|
EXPORT_SYMBOL(tcp_release_cb);
|
|
|
|
@@ -981,8 +1017,8 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
|
|
return HRTIMER_NORESTART;
|
|
}
|
|
|
|
-static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
|
|
- u64 prior_wstamp)
|
|
+void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
|
|
+ u64 prior_wstamp)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
|
|
@@ -1128,10 +1164,10 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
|
|
}
|
|
}
|
|
|
|
- tcp_options_write((__be32 *)(th + 1), tp, &opts);
|
|
+ tcp_options_write((__be32 *)(th + 1), tp, &opts, skb);
|
|
skb_shinfo(skb)->gso_type = sk->sk_gso_type;
|
|
if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
|
|
- th->window = htons(tcp_select_window(sk));
|
|
+ th->window = htons(tp->ops->select_window(sk));
|
|
tcp_ecn_send(sk, skb, th, tcp_header_size);
|
|
} else {
|
|
/* RFC1323: The window in SYN & SYN/ACK segments
|
|
@@ -1189,8 +1225,8 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
|
|
return err;
|
|
}
|
|
|
|
-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
|
|
- gfp_t gfp_mask)
|
|
+int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
|
|
+ gfp_t gfp_mask)
|
|
{
|
|
return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
|
|
tcp_sk(sk)->rcv_nxt);
|
|
@@ -1201,7 +1237,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
|
|
* NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
|
|
* otherwise socket can stall.
|
|
*/
|
|
-static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
|
|
+void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
|
|
@@ -1214,7 +1250,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
|
|
}
|
|
|
|
/* Initialize TSO segments for a packet. */
|
|
-static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
|
|
+void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
|
|
{
|
|
if (skb->len <= mss_now) {
|
|
/* Avoid the costly divide in the normal
|
|
@@ -1231,7 +1267,7 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
|
|
/* Pcount in the middle of the write queue got changed, we need to do various
|
|
* tweaks to fix counters
|
|
*/
|
|
-static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
|
|
+void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
|
|
@@ -1400,7 +1436,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
|
|
/* This is similar to __pskb_pull_tail(). The difference is that pulled
|
|
* data is not copied, but immediately discarded.
|
|
*/
|
|
-static int __pskb_trim_head(struct sk_buff *skb, int len)
|
|
+int __pskb_trim_head(struct sk_buff *skb, int len)
|
|
{
|
|
struct skb_shared_info *shinfo;
|
|
int i, k, eat;
|
|
@@ -1623,6 +1659,7 @@ unsigned int tcp_current_mss(struct sock *sk)
|
|
|
|
return mss_now;
|
|
}
|
|
+EXPORT_SYMBOL(tcp_current_mss);
|
|
|
|
/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
|
|
* As additional protections, we do not touch cwnd in retransmission phases,
|
|
@@ -1682,8 +1719,11 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
|
|
* 2) not cwnd limited (this else condition)
|
|
* 3) no more data to send (tcp_write_queue_empty())
|
|
* 4) application is hitting buffer limit (SOCK_NOSPACE)
|
|
+ * 5) For MPTCP subflows, the scheduler determines whether the
|
|
+ * subflow is sndbuf limited.
|
|
*/
|
|
if (tcp_write_queue_empty(sk) && sk->sk_socket &&
|
|
+ !(mptcp(tcp_sk(sk)) && !is_meta_sk(sk)) &&
|
|
test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
|
|
(1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
|
|
tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
|
|
@@ -1705,8 +1745,8 @@ static bool tcp_minshall_check(const struct tcp_sock *tp)
|
|
* But we can avoid doing the divide again given we already have
|
|
* skb_pcount = skb->len / mss_now
|
|
*/
|
|
-static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
|
|
- const struct sk_buff *skb)
|
|
+void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
|
|
+ const struct sk_buff *skb)
|
|
{
|
|
if (skb->len < tcp_skb_pcount(skb) * mss_now)
|
|
tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
|
|
@@ -1752,7 +1792,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
|
|
/* Return the number of segments we want in the skb we are transmitting.
|
|
* See if congestion control module wants to decide; otherwise, autosize.
|
|
*/
|
|
-static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
|
|
+u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
|
|
{
|
|
const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
|
|
u32 min_tso, tso_segs;
|
|
@@ -1766,11 +1806,11 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
|
|
}
|
|
|
|
/* Returns the portion of skb which can be sent right away */
|
|
-static unsigned int tcp_mss_split_point(const struct sock *sk,
|
|
- const struct sk_buff *skb,
|
|
- unsigned int mss_now,
|
|
- unsigned int max_segs,
|
|
- int nonagle)
|
|
+unsigned int tcp_mss_split_point(const struct sock *sk,
|
|
+ const struct sk_buff *skb,
|
|
+ unsigned int mss_now,
|
|
+ unsigned int max_segs,
|
|
+ int nonagle)
|
|
{
|
|
const struct tcp_sock *tp = tcp_sk(sk);
|
|
u32 partial, needed, window, max_len;
|
|
@@ -1800,13 +1840,14 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
|
|
/* Can at least one segment of SKB be sent right now, according to the
|
|
* congestion window rules? If so, return how many segments are allowed.
|
|
*/
|
|
-static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
|
|
- const struct sk_buff *skb)
|
|
+unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
|
|
+ const struct sk_buff *skb)
|
|
{
|
|
u32 in_flight, cwnd, halfcwnd;
|
|
|
|
/* Don't be strict about the congestion window for the final FIN. */
|
|
- if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
|
|
+ if (skb &&
|
|
+ (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
|
|
tcp_skb_pcount(skb) == 1)
|
|
return 1;
|
|
|
|
@@ -1821,12 +1862,13 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
|
|
halfcwnd = max(cwnd >> 1, 1U);
|
|
return min(halfcwnd, cwnd - in_flight);
|
|
}
|
|
+EXPORT_SYMBOL(tcp_cwnd_test);
|
|
|
|
/* Initialize TSO state of a skb.
|
|
* This must be invoked the first time we consider transmitting
|
|
* SKB onto the wire.
|
|
*/
|
|
-static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
|
|
+int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
|
|
{
|
|
int tso_segs = tcp_skb_pcount(skb);
|
|
|
|
@@ -1841,8 +1883,8 @@ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
|
|
/* Return true if the Nagle test allows this packet to be
|
|
* sent now.
|
|
*/
|
|
-static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
|
|
- unsigned int cur_mss, int nonagle)
|
|
+bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
|
|
+ unsigned int cur_mss, int nonagle)
|
|
{
|
|
/* Nagle rule does not apply to frames, which sit in the middle of the
|
|
* write_queue (they have no chances to get new data).
|
|
@@ -1854,7 +1896,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
|
|
return true;
|
|
|
|
/* Don't use the nagle rule for urgent data (or for the final FIN). */
|
|
- if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
|
|
+ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
|
|
+ mptcp_is_data_fin(skb))
|
|
return true;
|
|
|
|
if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
|
|
@@ -1864,9 +1907,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
|
|
}
|
|
|
|
/* Does at least the first segment of SKB fit into the send window? */
|
|
-static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
|
|
- const struct sk_buff *skb,
|
|
- unsigned int cur_mss)
|
|
+bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
|
|
+ unsigned int cur_mss)
|
|
{
|
|
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
|
|
|
|
@@ -1875,6 +1917,7 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
|
|
|
|
return !after(end_seq, tcp_wnd_end(tp));
|
|
}
|
|
+EXPORT_SYMBOL(tcp_snd_wnd_test);
|
|
|
|
/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
|
|
* which is put after SKB on the list. It is very much like
|
|
@@ -2033,7 +2076,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
|
|
|
|
/* If this packet won't get more data, do not wait. */
|
|
if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
|
|
- TCP_SKB_CB(skb)->eor)
|
|
+ TCP_SKB_CB(skb)->eor ||
|
|
+ mptcp_is_data_fin(skb))
|
|
goto send_now;
|
|
|
|
return true;
|
|
@@ -2366,7 +2410,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
|
|
* Returns true, if no segments are in flight and we have queued segments,
|
|
* but cannot send anything now because of SWS or another problem.
|
|
*/
|
|
-static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
|
|
+bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
|
|
int push_one, gfp_t gfp)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
@@ -2380,7 +2424,12 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
|
|
sent_pkts = 0;
|
|
|
|
tcp_mstamp_refresh(tp);
|
|
- if (!push_one) {
|
|
+
|
|
+ /* pmtu is not yet supported with MPTCP. It should be possible by exiting
|
|
+ * the loop inside tcp_mtu_probe() early, making sure that only a
|
|
+ * single DSS-mapping gets probed.
|
|
+ */
|
|
+ if (!push_one && !mptcp(tp)) {
|
|
/* Do MTU probing. */
|
|
result = tcp_mtu_probe(sk);
|
|
if (!result) {
|
|
@@ -2576,7 +2625,7 @@ void tcp_send_loss_probe(struct sock *sk)
|
|
skb = tcp_send_head(sk);
|
|
if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
|
|
pcount = tp->packets_out;
|
|
- tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
|
|
+ tp->ops->write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
|
|
if (tp->packets_out > pcount)
|
|
goto probe_sent;
|
|
goto rearm_timer;
|
|
@@ -2638,8 +2687,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
|
|
if (unlikely(sk->sk_state == TCP_CLOSE))
|
|
return;
|
|
|
|
- if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
|
|
- sk_gfp_mask(sk, GFP_ATOMIC)))
|
|
+ if (tcp_sk(sk)->ops->write_xmit(sk, cur_mss, nonagle, 0,
|
|
+ sk_gfp_mask(sk, GFP_ATOMIC)))
|
|
tcp_check_probe_timer(sk);
|
|
}
|
|
|
|
@@ -2652,7 +2701,8 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
|
|
|
|
BUG_ON(!skb || skb->len < mss_now);
|
|
|
|
- tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
|
|
+ tcp_sk(sk)->ops->write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1,
|
|
+ sk->sk_allocation);
|
|
}
|
|
|
|
/* This function returns the amount that we can raise the
|
|
@@ -2874,6 +2924,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
|
|
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
|
|
return;
|
|
|
|
+ /* Currently not supported for MPTCP - but it should be possible */
|
|
+ if (mptcp(tp))
|
|
+ return;
|
|
+
|
|
skb_rbtree_walk_from_safe(skb, tmp) {
|
|
if (!tcp_can_collapse(sk, skb))
|
|
break;
|
|
@@ -3355,7 +3409,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
|
|
|
|
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
|
|
th->window = htons(min(req->rsk_rcv_wnd, 65535U));
|
|
- tcp_options_write((__be32 *)(th + 1), NULL, &opts);
|
|
+ tcp_options_write((__be32 *)(th + 1), NULL, &opts, skb);
|
|
th->doff = (tcp_header_size >> 2);
|
|
__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
|
|
|
|
@@ -3437,13 +3491,13 @@ static void tcp_connect_init(struct sock *sk)
|
|
if (rcv_wnd == 0)
|
|
rcv_wnd = dst_metric(dst, RTAX_INITRWND);
|
|
|
|
- tcp_select_initial_window(sk, tcp_full_space(sk),
|
|
- tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
|
|
- &tp->rcv_wnd,
|
|
- &tp->window_clamp,
|
|
- sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
|
|
- &rcv_wscale,
|
|
- rcv_wnd);
|
|
+ tp->ops->select_initial_window(sk, tcp_full_space(sk),
|
|
+ tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
|
|
+ &tp->rcv_wnd,
|
|
+ &tp->window_clamp,
|
|
+ sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
|
|
+ &rcv_wscale,
|
|
+ rcv_wnd);
|
|
|
|
tp->rx_opt.rcv_wscale = rcv_wscale;
|
|
tp->rcv_ssthresh = tp->rcv_wnd;
|
|
@@ -3463,11 +3517,43 @@ static void tcp_connect_init(struct sock *sk)
|
|
else
|
|
tp->rcv_tstamp = tcp_jiffies32;
|
|
tp->rcv_wup = tp->rcv_nxt;
|
|
+ /* force set rcv_right_edge here at start of connection */
|
|
+ tp->rcv_right_edge = tp->rcv_wup + tp->rcv_wnd;
|
|
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
|
|
|
|
inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
|
|
inet_csk(sk)->icsk_retransmits = 0;
|
|
tcp_clear_retrans(tp);
|
|
+
|
|
+#ifdef CONFIG_MPTCP
|
|
+ if (sock_flag(sk, SOCK_MPTCP) && mptcp_doit(sk)) {
|
|
+ if (is_master_tp(tp)) {
|
|
+ tp->request_mptcp = 1;
|
|
+ mptcp_connect_init(sk);
|
|
+ } else if (tp->mptcp) {
|
|
+ struct inet_sock *inet = inet_sk(sk);
|
|
+
|
|
+ tp->mptcp->snt_isn = tp->write_seq;
|
|
+ tp->mptcp->init_rcv_wnd = tp->rcv_wnd;
|
|
+
|
|
+ /* Set nonce for new subflows */
|
|
+ if (sk->sk_family == AF_INET)
|
|
+ tp->mptcp->mptcp_loc_nonce = mptcp_v4_get_nonce(
|
|
+ inet->inet_saddr,
|
|
+ inet->inet_daddr,
|
|
+ inet->inet_sport,
|
|
+ inet->inet_dport);
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ else
|
|
+ tp->mptcp->mptcp_loc_nonce = mptcp_v6_get_nonce(
|
|
+ inet6_sk(sk)->saddr.s6_addr32,
|
|
+ sk->sk_v6_daddr.s6_addr32,
|
|
+ inet->inet_sport,
|
|
+ inet->inet_dport);
|
|
+#endif
|
|
+ }
|
|
+ }
|
|
+#endif
|
|
}
|
|
|
|
static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
|
|
@@ -3731,6 +3817,7 @@ void tcp_send_ack(struct sock *sk)
|
|
{
|
|
__tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
|
|
}
|
|
+EXPORT_SYMBOL_GPL(tcp_send_ack);
|
|
|
|
/* This routine sends a packet with an out of date sequence
|
|
* number. It assumes the other end will try to ack it.
|
|
@@ -3743,7 +3830,7 @@ void tcp_send_ack(struct sock *sk)
|
|
* one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
|
|
* out-of-date with SND.UNA-1 to probe window.
|
|
*/
|
|
-static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
|
|
+int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
struct sk_buff *skb;
|
|
@@ -3830,7 +3917,7 @@ void tcp_send_probe0(struct sock *sk)
|
|
unsigned long timeout;
|
|
int err;
|
|
|
|
- err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
|
|
+ err = tp->ops->write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
|
|
|
|
if (tp->packets_out || tcp_write_queue_empty(sk)) {
|
|
/* Cancel probe timer, if it is not required. */
|
|
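A recurring pattern in the tcp_output.c hunks above is that direct calls such as tcp_write_xmit(), tcp_select_initial_window() and tcp_write_wakeup() are replaced by indirect calls through tcp_sk(sk)->ops, so an MPTCP meta-socket can substitute its own implementations while plain TCP keeps the defaults. The sketch below only illustrates that dispatch pattern in self-contained user-space C; the struct name, fields and print-outs are stand-ins, not the kernel's actual operations table.

#include <stdio.h>

struct conn;

/* Per-connection operations table, mirroring the tp->ops->write_xmit /
 * tp->ops->write_wakeup indirection introduced by the patch (names assumed).
 */
struct conn_ops {
	int (*write_xmit)(struct conn *c, unsigned int mss);
	int (*write_wakeup)(struct conn *c);
};

struct conn {
	const struct conn_ops *ops;
	const char *label;
};

static int plain_write_xmit(struct conn *c, unsigned int mss)
{
	printf("%s: plain TCP xmit, mss=%u\n", c->label, mss);
	return 0;
}

static int plain_write_wakeup(struct conn *c)
{
	printf("%s: plain TCP wakeup\n", c->label);
	return 0;
}

static int meta_write_xmit(struct conn *c, unsigned int mss)
{
	printf("%s: meta-level xmit, scheduling over subflows, mss=%u\n",
	       c->label, mss);
	return 0;
}

static int meta_write_wakeup(struct conn *c)
{
	printf("%s: meta-level wakeup\n", c->label);
	return 0;
}

static const struct conn_ops plain_ops = {
	.write_xmit   = plain_write_xmit,
	.write_wakeup = plain_write_wakeup,
};

static const struct conn_ops meta_ops = {
	.write_xmit   = meta_write_xmit,
	.write_wakeup = meta_write_wakeup,
};

int main(void)
{
	struct conn tcp_conn = { .ops = &plain_ops, .label = "tcp"   };
	struct conn mp_conn  = { .ops = &meta_ops,  .label = "mptcp" };

	/* Call sites stay identical; behaviour is chosen by the ops table. */
	tcp_conn.ops->write_xmit(&tcp_conn, 1460);
	mp_conn.ops->write_xmit(&mp_conn, 1380);
	mp_conn.ops->write_wakeup(&mp_conn);
	return 0;
}

The benefit of the indirection is that generic code paths (loss probes, window probes, pending-frame pushes) need no MPTCP-specific branches at each call site.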
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
|
|
index fa2ae96ecdc4..36199efe2837 100644
|
|
--- a/net/ipv4/tcp_timer.c
|
|
+++ b/net/ipv4/tcp_timer.c
|
|
@@ -21,6 +21,7 @@
|
|
|
|
#include <linux/module.h>
|
|
#include <linux/gfp.h>
|
|
+#include <net/mptcp.h>
|
|
#include <net/tcp.h>
|
|
|
|
static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
|
|
@@ -65,7 +66,7 @@ u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
|
|
* Returns: Nothing (void)
|
|
*/
|
|
|
|
-static void tcp_write_err(struct sock *sk)
|
|
+void tcp_write_err(struct sock *sk)
|
|
{
|
|
sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
|
|
sk->sk_error_report(sk);
|
|
@@ -121,7 +122,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
|
|
(!tp->snd_wnd && !tp->packets_out))
|
|
do_reset = true;
|
|
if (do_reset)
|
|
- tcp_send_active_reset(sk, GFP_ATOMIC);
|
|
+ tp->ops->send_active_reset(sk, GFP_ATOMIC);
|
|
tcp_done(sk);
|
|
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
|
|
return 1;
|
|
@@ -206,9 +207,9 @@ static unsigned int tcp_model_timeout(struct sock *sk,
|
|
* after "boundary" unsuccessful, exponentially backed-off
|
|
* retransmissions with an initial RTO of TCP_RTO_MIN.
|
|
*/
|
|
-static bool retransmits_timed_out(struct sock *sk,
|
|
- unsigned int boundary,
|
|
- unsigned int timeout)
|
|
+bool retransmits_timed_out(struct sock *sk,
|
|
+ unsigned int boundary,
|
|
+ unsigned int timeout)
|
|
{
|
|
unsigned int start_ts;
|
|
|
|
@@ -228,7 +229,7 @@ static bool retransmits_timed_out(struct sock *sk,
|
|
}
|
|
|
|
/* A write timeout has occurred. Process the after effects. */
|
|
-static int tcp_write_timeout(struct sock *sk)
|
|
+int tcp_write_timeout(struct sock *sk)
|
|
{
|
|
struct inet_connection_sock *icsk = inet_csk(sk);
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
@@ -243,6 +244,17 @@ static int tcp_write_timeout(struct sock *sk)
|
|
sk_rethink_txhash(sk);
|
|
}
|
|
retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
|
|
+
|
|
+#ifdef CONFIG_MPTCP
|
|
+ /* Stop retransmitting MP_CAPABLE options in SYN if timed out. */
|
|
+ if (tcp_sk(sk)->request_mptcp &&
|
|
+ icsk->icsk_retransmits >= sysctl_mptcp_syn_retries) {
|
|
+ tcp_sk(sk)->request_mptcp = 0;
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLERETRANSFALLBACK);
|
|
+ }
|
|
+#endif /* CONFIG_MPTCP */
|
|
+
|
|
expired = icsk->icsk_retransmits >= retry_until;
|
|
} else {
|
|
if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) {
|
|
@@ -338,18 +350,22 @@ static void tcp_delack_timer(struct timer_list *t)
|
|
struct inet_connection_sock *icsk =
|
|
from_timer(icsk, t, icsk_delack_timer);
|
|
struct sock *sk = &icsk->icsk_inet.sk;
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
|
|
|
|
- bh_lock_sock(sk);
|
|
- if (!sock_owned_by_user(sk)) {
|
|
+ bh_lock_sock(meta_sk);
|
|
+ if (!sock_owned_by_user(meta_sk)) {
|
|
tcp_delack_timer_handler(sk);
|
|
} else {
|
|
icsk->icsk_ack.blocked = 1;
|
|
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
|
|
+ __NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_DELAYEDACKLOCKED);
|
|
/* deleguate our work to tcp_release_cb() */
|
|
if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
|
|
sock_hold(sk);
|
|
+ if (mptcp(tp))
|
|
+ mptcp_tsq_flags(sk);
|
|
}
|
|
- bh_unlock_sock(sk);
|
|
+ bh_unlock_sock(meta_sk);
|
|
sock_put(sk);
|
|
}
|
|
|
|
@@ -393,7 +409,12 @@ static void tcp_probe_timer(struct sock *sk)
|
|
}
|
|
|
|
if (icsk->icsk_probes_out >= max_probes) {
|
|
-abort: tcp_write_err(sk);
|
|
+abort:
|
|
+ tcp_write_err(sk);
|
|
+ if (is_meta_sk(sk) &&
|
|
+ mptcp_in_infinite_mapping_weak(tp->mpcb)) {
|
|
+ mptcp_sub_force_close_all(tp->mpcb, NULL);
|
|
+ }
|
|
} else {
|
|
/* Only send another probe if we didn't close things up. */
|
|
tcp_send_probe0(sk);
|
|
@@ -614,7 +635,7 @@ void tcp_write_timer_handler(struct sock *sk)
|
|
break;
|
|
case ICSK_TIME_RETRANS:
|
|
icsk->icsk_pending = 0;
|
|
- tcp_retransmit_timer(sk);
|
|
+ tcp_sk(sk)->ops->retransmit_timer(sk);
|
|
break;
|
|
case ICSK_TIME_PROBE0:
|
|
icsk->icsk_pending = 0;
|
|
@@ -631,16 +652,19 @@ static void tcp_write_timer(struct timer_list *t)
|
|
struct inet_connection_sock *icsk =
|
|
from_timer(icsk, t, icsk_retransmit_timer);
|
|
struct sock *sk = &icsk->icsk_inet.sk;
|
|
+ struct sock *meta_sk = mptcp(tcp_sk(sk)) ? mptcp_meta_sk(sk) : sk;
|
|
|
|
- bh_lock_sock(sk);
|
|
- if (!sock_owned_by_user(sk)) {
|
|
+ bh_lock_sock(meta_sk);
|
|
+ if (!sock_owned_by_user(meta_sk)) {
|
|
tcp_write_timer_handler(sk);
|
|
} else {
|
|
/* delegate our work to tcp_release_cb() */
|
|
if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
|
|
sock_hold(sk);
|
|
+ if (mptcp(tcp_sk(sk)))
|
|
+ mptcp_tsq_flags(sk);
|
|
}
|
|
- bh_unlock_sock(sk);
|
|
+ bh_unlock_sock(meta_sk);
|
|
sock_put(sk);
|
|
}
|
|
|
|
@@ -670,11 +694,12 @@ static void tcp_keepalive_timer (struct timer_list *t)
|
|
struct sock *sk = from_timer(sk, t, sk_timer);
|
|
struct inet_connection_sock *icsk = inet_csk(sk);
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
|
|
u32 elapsed;
|
|
|
|
/* Only process if socket is not in use. */
|
|
- bh_lock_sock(sk);
|
|
- if (sock_owned_by_user(sk)) {
|
|
+ bh_lock_sock(meta_sk);
|
|
+ if (sock_owned_by_user(meta_sk)) {
|
|
/* Try again later. */
|
|
inet_csk_reset_keepalive_timer (sk, HZ/20);
|
|
goto out;
|
|
@@ -686,16 +711,31 @@ static void tcp_keepalive_timer (struct timer_list *t)
|
|
}
|
|
|
|
tcp_mstamp_refresh(tp);
|
|
+
|
|
+ if (tp->send_mp_fclose) {
|
|
+ if (icsk->icsk_retransmits >= MPTCP_FASTCLOSE_RETRIES) {
|
|
+ tcp_write_err(sk);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ tcp_send_ack(sk);
|
|
+ icsk->icsk_retransmits++;
|
|
+
|
|
+ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
|
|
+ elapsed = icsk->icsk_rto;
|
|
+ goto resched;
|
|
+ }
|
|
+
|
|
if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
|
|
if (tp->linger2 >= 0) {
|
|
const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
|
|
|
|
if (tmo > 0) {
|
|
- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
|
|
+ tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo);
|
|
goto out;
|
|
}
|
|
}
|
|
- tcp_send_active_reset(sk, GFP_ATOMIC);
|
|
+ tp->ops->send_active_reset(sk, GFP_ATOMIC);
|
|
goto death;
|
|
}
|
|
|
|
@@ -720,11 +760,11 @@ static void tcp_keepalive_timer (struct timer_list *t)
|
|
icsk->icsk_probes_out > 0) ||
|
|
(icsk->icsk_user_timeout == 0 &&
|
|
icsk->icsk_probes_out >= keepalive_probes(tp))) {
|
|
- tcp_send_active_reset(sk, GFP_ATOMIC);
|
|
+ tp->ops->send_active_reset(sk, GFP_ATOMIC);
|
|
tcp_write_err(sk);
|
|
goto out;
|
|
}
|
|
- if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
|
|
+ if (tp->ops->write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
|
|
icsk->icsk_probes_out++;
|
|
elapsed = keepalive_intvl_when(tp);
|
|
} else {
|
|
@@ -748,7 +788,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
|
|
tcp_done(sk);
|
|
|
|
out:
|
|
- bh_unlock_sock(sk);
|
|
+ bh_unlock_sock(meta_sk);
|
|
sock_put(sk);
|
|
}
|
|
|
|
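The keepalive-timer hunk above reuses the keepalive path to retransmit an MP_FASTCLOSE: each attempt sends another ACK, doubles icsk_rto up to TCP_RTO_MAX, and once MPTCP_FASTCLOSE_RETRIES is reached tcp_write_err() aborts the connection. The self-contained sketch below only prints that backoff schedule; the initial RTO of 200 ms, the 120 s cap for TCP_RTO_MAX and the retry limit of 3 are assumptions for illustration, the real values come from the socket state and the MPTCP headers.

#include <stdio.h>

int main(void)
{
	/* Assumed values, for illustration only. */
	unsigned int rto_ms = 200;              /* initial icsk_rto */
	const unsigned int rto_max_ms = 120000; /* TCP_RTO_MAX (120 s, assumed) */
	const int fastclose_retries = 3;        /* assumed MPTCP_FASTCLOSE_RETRIES */
	int retransmits;

	for (retransmits = 0; retransmits < fastclose_retries; retransmits++) {
		printf("attempt %d: resend MP_FASTCLOSE ACK, next timer in %u ms\n",
		       retransmits + 1, rto_ms);
		/* icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); */
		rto_ms = rto_ms * 2 < rto_max_ms ? rto_ms * 2 : rto_max_ms;
	}
	printf("retry limit reached: tcp_write_err() would abort the connection\n");
	return 0;
}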
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
|
|
index d1f29a3eb70b..78554dcb8532 100644
|
|
--- a/net/ipv6/addrconf.c
|
|
+++ b/net/ipv6/addrconf.c
|
|
@@ -967,6 +967,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
|
|
|
|
kfree_rcu(ifp, rcu);
|
|
}
|
|
+EXPORT_SYMBOL(inet6_ifa_finish_destroy);
|
|
|
|
static void
|
|
ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
|
|
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
|
|
index 942da168f18f..e36520f9dcd5 100644
|
|
--- a/net/ipv6/af_inet6.c
|
|
+++ b/net/ipv6/af_inet6.c
|
|
@@ -104,8 +104,7 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
|
|
return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
|
|
}
|
|
|
|
-static int inet6_create(struct net *net, struct socket *sock, int protocol,
|
|
- int kern)
|
|
+int inet6_create(struct net *net, struct socket *sock, int protocol, int kern)
|
|
{
|
|
struct inet_sock *inet;
|
|
struct ipv6_pinfo *np;
|
|
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
|
|
index 5352c7e68c42..534a9d2e4858 100644
|
|
--- a/net/ipv6/ipv6_sockglue.c
|
|
+++ b/net/ipv6/ipv6_sockglue.c
|
|
@@ -44,6 +44,8 @@
|
|
#include <net/addrconf.h>
|
|
#include <net/inet_common.h>
|
|
#include <net/tcp.h>
|
|
+#include <net/mptcp.h>
|
|
+#include <net/mptcp_v4.h>
|
|
#include <net/udp.h>
|
|
#include <net/udplite.h>
|
|
#include <net/xfrm.h>
|
|
@@ -221,7 +223,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
|
|
sock_prot_inuse_add(net, &tcp_prot, 1);
|
|
local_bh_enable();
|
|
sk->sk_prot = &tcp_prot;
|
|
- icsk->icsk_af_ops = &ipv4_specific;
|
|
+#ifdef CONFIG_MPTCP
|
|
+ if (sock_flag(sk, SOCK_MPTCP))
|
|
+ icsk->icsk_af_ops = &mptcp_v4_specific;
|
|
+ else
|
|
+#endif
|
|
+ icsk->icsk_af_ops = &ipv4_specific;
|
|
sk->sk_socket->ops = &inet_stream_ops;
|
|
sk->sk_family = PF_INET;
|
|
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
|
|
@@ -345,6 +352,17 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
|
|
if (val == -1)
|
|
val = 0;
|
|
np->tclass = val;
|
|
+
|
|
+ if (is_meta_sk(sk)) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (sk_it->sk_family == AF_INET6)
|
|
+ inet6_sk(sk_it)->tclass = val;
|
|
+ }
|
|
+ }
|
|
retv = 0;
|
|
break;
|
|
|
|
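The IPV6_TCLASS hunk above walks every subflow of a meta-socket and copies the new traffic class into each IPv6 subflow, so all paths carry the same marking. As a rough user-space analogue (purely illustrative: real MPTCP subflows are kernel-internal and not separate file descriptors visible to the application), the same idea of applying one traffic class to a set of IPv6 sockets looks like this:

#include <stdio.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Apply one traffic class to several IPv6 sockets, mirroring how the patch
 * propagates np->tclass from the meta-socket to every IPv6 subflow.
 * Error handling is kept minimal on purpose.
 */
static void set_tclass_on_all(const int *fds, int nfds, int tclass)
{
	int i;

	for (i = 0; i < nfds; i++) {
		if (setsockopt(fds[i], IPPROTO_IPV6, IPV6_TCLASS,
			       &tclass, sizeof(tclass)) < 0)
			perror("setsockopt(IPV6_TCLASS)");
	}
}

int main(void)
{
	int fds[2];

	fds[0] = socket(AF_INET6, SOCK_STREAM, 0);
	fds[1] = socket(AF_INET6, SOCK_STREAM, 0);
	set_tclass_on_all(fds, 2, 0x28); /* e.g. DSCP AF11 */
	return 0;
}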
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
|
|
index ec155844012b..225c015b60a8 100644
|
|
--- a/net/ipv6/syncookies.c
|
|
+++ b/net/ipv6/syncookies.c
|
|
@@ -15,6 +15,8 @@
|
|
#include <linux/kernel.h>
|
|
#include <net/secure_seq.h>
|
|
#include <net/ipv6.h>
|
|
+#include <net/mptcp.h>
|
|
+#include <net/mptcp_v6.h>
|
|
#include <net/tcp.h>
|
|
|
|
#define COOKIEBITS 24 /* Upper bits store count */
|
|
@@ -106,7 +108,8 @@ u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
|
|
}
|
|
EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence);
|
|
|
|
-__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mssp)
|
|
+__u32 cookie_v6_init_sequence(struct request_sock *req, const struct sock *sk,
|
|
+ const struct sk_buff *skb, __u16 *mssp)
|
|
{
|
|
const struct ipv6hdr *iph = ipv6_hdr(skb);
|
|
const struct tcphdr *th = tcp_hdr(skb);
|
|
@@ -128,6 +131,7 @@ int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th,
|
|
struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
struct tcp_options_received tcp_opt;
|
|
+ struct mptcp_options_received mopt;
|
|
struct inet_request_sock *ireq;
|
|
struct tcp_request_sock *treq;
|
|
struct ipv6_pinfo *np = inet6_sk(sk);
|
|
@@ -157,7 +161,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
|
|
|
|
/* check for timestamp cookie support */
|
|
memset(&tcp_opt, 0, sizeof(tcp_opt));
|
|
- tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
|
|
+ mptcp_init_mp_opt(&mopt);
|
|
+ tcp_parse_options(sock_net(sk), skb, &tcp_opt, &mopt, 0, NULL, NULL);
|
|
|
|
if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
|
|
tsoff = secure_tcpv6_ts_off(sock_net(sk),
|
|
@@ -170,14 +175,27 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
|
|
goto out;
|
|
|
|
ret = NULL;
|
|
- req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk, false);
|
|
+#ifdef CONFIG_MPTCP
|
|
+ if (mopt.saw_mpc)
|
|
+ req = inet_reqsk_alloc(&mptcp6_request_sock_ops, sk, false);
|
|
+ else
|
|
+#endif
|
|
+ req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk, false);
|
|
if (!req)
|
|
goto out;
|
|
|
|
ireq = inet_rsk(req);
|
|
+ ireq->mptcp_rqsk = 0;
|
|
+ ireq->saw_mpc = 0;
|
|
treq = tcp_rsk(req);
|
|
treq->tfo_listener = false;
|
|
|
|
+ /* Must be done before anything else, as it initializes
|
|
+ * hash_entry of the MPTCP request-sock.
|
|
+ */
|
|
+ if (mopt.saw_mpc)
|
|
+ mptcp_cookies_reqsk_init(req, &mopt, skb);
|
|
+
|
|
if (security_inet_conn_request(sk, skb, req))
|
|
goto out_free;
|
|
|
|
@@ -247,15 +265,15 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
|
|
(req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
|
|
req->rsk_window_clamp = full_space;
|
|
|
|
- tcp_select_initial_window(sk, full_space, req->mss,
|
|
- &req->rsk_rcv_wnd, &req->rsk_window_clamp,
|
|
- ireq->wscale_ok, &rcv_wscale,
|
|
- dst_metric(dst, RTAX_INITRWND));
|
|
+ tp->ops->select_initial_window(sk, full_space, req->mss,
|
|
+ &req->rsk_rcv_wnd, &req->rsk_window_clamp,
|
|
+ ireq->wscale_ok, &rcv_wscale,
|
|
+ dst_metric(dst, RTAX_INITRWND));
|
|
|
|
ireq->rcv_wscale = rcv_wscale;
|
|
ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst);
|
|
|
|
- ret = tcp_get_cookie_sock(sk, skb, req, dst, tsoff);
|
|
+ ret = tcp_get_cookie_sock(sk, skb, req, &mopt, dst, tsoff);
|
|
out:
|
|
return ret;
|
|
out_free:
|
|
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
|
|
index 3903cc0ab188..2f91fddabceb 100644
|
|
--- a/net/ipv6/tcp_ipv6.c
|
|
+++ b/net/ipv6/tcp_ipv6.c
|
|
@@ -58,6 +58,8 @@
|
|
#include <net/timewait_sock.h>
|
|
#include <net/inet_common.h>
|
|
#include <net/secure_seq.h>
|
|
+#include <net/mptcp.h>
|
|
+#include <net/mptcp_v6.h>
|
|
#include <net/busy_poll.h>
|
|
|
|
#include <linux/proc_fs.h>
|
|
@@ -67,15 +69,6 @@
|
|
#include <linux/scatterlist.h>
|
|
|
|
#include <trace/events/tcp.h>
|
|
-
|
|
-static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb);
|
|
-static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
|
|
- struct request_sock *req);
|
|
-
|
|
-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
|
|
-
|
|
-static const struct inet_connection_sock_af_ops ipv6_mapped;
|
|
-static const struct inet_connection_sock_af_ops ipv6_specific;
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
|
|
static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
|
|
@@ -99,7 +92,7 @@ static struct ipv6_pinfo *tcp_inet6_sk(const struct sock *sk)
|
|
return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
|
|
}
|
|
|
|
-static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
|
|
+void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
|
|
{
|
|
struct dst_entry *dst = skb_dst(skb);
|
|
|
|
@@ -141,7 +134,7 @@ static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
|
|
return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr);
|
|
}
|
|
|
|
-static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
|
|
+int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
|
|
int addr_len)
|
|
{
|
|
struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
|
|
@@ -157,6 +150,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
|
|
int err;
|
|
struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
|
|
|
|
+ mptcp_init_connect(sk);
|
|
+
|
|
if (addr_len < SIN6_LEN_RFC2133)
|
|
return -EINVAL;
|
|
|
|
@@ -236,7 +231,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
|
|
sin.sin_port = usin->sin6_port;
|
|
sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
|
|
|
|
- icsk->icsk_af_ops = &ipv6_mapped;
|
|
+#ifdef CONFIG_MPTCP
|
|
+ if (sock_flag(sk, SOCK_MPTCP))
|
|
+ icsk->icsk_af_ops = &mptcp_v6_mapped;
|
|
+ else
|
|
+#endif
|
|
+ icsk->icsk_af_ops = &ipv6_mapped;
|
|
sk->sk_backlog_rcv = tcp_v4_do_rcv;
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
tp->af_specific = &tcp_sock_ipv6_mapped_specific;
|
|
@@ -246,7 +246,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
|
|
|
|
if (err) {
|
|
icsk->icsk_ext_hdr_len = exthdrlen;
|
|
- icsk->icsk_af_ops = &ipv6_specific;
|
|
+#ifdef CONFIG_MPTCP
|
|
+ if (sock_flag(sk, SOCK_MPTCP))
|
|
+ icsk->icsk_af_ops = &mptcp_v6_specific;
|
|
+ else
|
|
+#endif
|
|
+ icsk->icsk_af_ops = &ipv6_specific;
|
|
sk->sk_backlog_rcv = tcp_v6_do_rcv;
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
tp->af_specific = &tcp_sock_ipv6_specific;
|
|
@@ -340,7 +345,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
|
|
return err;
|
|
}
|
|
|
|
-static void tcp_v6_mtu_reduced(struct sock *sk)
|
|
+void tcp_v6_mtu_reduced(struct sock *sk)
|
|
{
|
|
struct dst_entry *dst;
|
|
u32 mtu;
|
|
@@ -376,7 +381,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
|
|
struct ipv6_pinfo *np;
|
|
struct tcp_sock *tp;
|
|
__u32 seq, snd_una;
|
|
- struct sock *sk;
|
|
+ struct sock *sk, *meta_sk;
|
|
bool fatal;
|
|
int err;
|
|
|
|
@@ -402,8 +407,14 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
|
|
return 0;
|
|
}
|
|
|
|
- bh_lock_sock(sk);
|
|
- if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
|
|
+ tp = tcp_sk(sk);
|
|
+ if (mptcp(tp))
|
|
+ meta_sk = mptcp_meta_sk(sk);
|
|
+ else
|
|
+ meta_sk = sk;
|
|
+
|
|
+ bh_lock_sock(meta_sk);
|
|
+ if (sock_owned_by_user(meta_sk) && type != ICMPV6_PKT_TOOBIG)
|
|
__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
|
|
|
|
if (sk->sk_state == TCP_CLOSE)
|
|
@@ -414,7 +425,6 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
|
|
goto out;
|
|
}
|
|
|
|
- tp = tcp_sk(sk);
|
|
/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
|
|
fastopen = rcu_dereference(tp->fastopen_rsk);
|
|
snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
|
|
@@ -454,10 +464,14 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
|
|
|
|
tp->mtu_info = ntohl(info);
|
|
- if (!sock_owned_by_user(sk))
|
|
+ if (!sock_owned_by_user(meta_sk)) {
|
|
tcp_v6_mtu_reduced(sk);
|
|
- else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
|
|
- &sk->sk_tsq_flags))
|
|
- sock_hold(sk);
|
|
+ } else {
|
|
+ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
|
|
+ &sk->sk_tsq_flags))
|
|
+ sock_hold(sk);
|
|
+ if (mptcp(tp))
|
|
+ mptcp_tsq_flags(sk);
|
|
+ }
|
|
goto out;
|
|
}
|
|
|
|
@@ -473,7 +487,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
|
|
if (fastopen && !fastopen->sk)
|
|
break;
|
|
|
|
- if (!sock_owned_by_user(sk)) {
|
|
+ if (!sock_owned_by_user(meta_sk)) {
|
|
sk->sk_err = err;
|
|
sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
|
|
|
|
@@ -483,14 +497,14 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
|
|
goto out;
|
|
}
|
|
|
|
- if (!sock_owned_by_user(sk) && np->recverr) {
|
|
+ if (!sock_owned_by_user(meta_sk) && np->recverr) {
|
|
sk->sk_err = err;
|
|
sk->sk_error_report(sk);
|
|
} else
|
|
sk->sk_err_soft = err;
|
|
|
|
out:
|
|
- bh_unlock_sock(sk);
|
|
+ bh_unlock_sock(meta_sk);
|
|
sock_put(sk);
|
|
return 0;
|
|
}
|
|
@@ -538,8 +552,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
|
|
return err;
|
|
}
|
|
|
|
-
|
|
-static void tcp_v6_reqsk_destructor(struct request_sock *req)
|
|
+void tcp_v6_reqsk_destructor(struct request_sock *req)
|
|
{
|
|
kfree(inet_rsk(req)->ipv6_opt);
|
|
kfree_skb(inet_rsk(req)->pktopts);
|
|
@@ -757,9 +770,10 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk,
|
|
return false;
|
|
}
|
|
|
|
-static void tcp_v6_init_req(struct request_sock *req,
|
|
- const struct sock *sk_listener,
|
|
- struct sk_buff *skb)
|
|
+static int tcp_v6_init_req(struct request_sock *req,
|
|
+ const struct sock *sk_listener,
|
|
+ struct sk_buff *skb,
|
|
+ bool want_cookie)
|
|
{
|
|
bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags);
|
|
struct inet_request_sock *ireq = inet_rsk(req);
|
|
@@ -781,6 +795,8 @@ static void tcp_v6_init_req(struct request_sock *req,
|
|
refcount_inc(&skb->users);
|
|
ireq->pktopts = skb;
|
|
}
|
|
+
|
|
+ return 0;
|
|
}
|
|
|
|
static struct dst_entry *tcp_v6_route_req(const struct sock *sk,
|
|
@@ -800,7 +816,7 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
|
|
.syn_ack_timeout = tcp_syn_ack_timeout,
|
|
};
|
|
|
|
-static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
|
|
+const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
|
|
.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
|
|
sizeof(struct ipv6hdr),
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
@@ -818,9 +834,9 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
|
|
};
|
|
|
|
static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
|
|
- u32 ack, u32 win, u32 tsval, u32 tsecr,
|
|
+ u32 ack, u32 data_ack, u32 win, u32 tsval, u32 tsecr,
|
|
int oif, struct tcp_md5sig_key *key, int rst,
|
|
- u8 tclass, __be32 label, u32 priority)
|
|
+ u8 tclass, __be32 label, u32 priority, int mptcp)
|
|
{
|
|
const struct tcphdr *th = tcp_hdr(skb);
|
|
struct tcphdr *t1;
|
|
@@ -839,7 +855,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
|
|
if (key)
|
|
tot_len += TCPOLEN_MD5SIG_ALIGNED;
|
|
#endif
|
|
-
|
|
+#ifdef CONFIG_MPTCP
|
|
+ if (mptcp)
|
|
+ tot_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
|
|
+#endif
|
|
buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
|
|
GFP_ATOMIC);
|
|
if (!buff)
|
|
@@ -877,6 +896,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
|
|
tcp_v6_md5_hash_hdr((__u8 *)topt, key,
|
|
&ipv6_hdr(skb)->saddr,
|
|
&ipv6_hdr(skb)->daddr, t1);
|
|
+ topt += 4;
|
|
+ }
|
|
+#endif
|
|
+#ifdef CONFIG_MPTCP
|
|
+ if (mptcp) {
|
|
+ /* Construction of 32-bit data_ack */
|
|
+ *topt++ = htonl((TCPOPT_MPTCP << 24) |
|
|
+ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
|
|
+ (0x20 << 8) |
|
|
+ (0x01));
|
|
+ *topt++ = htonl(data_ack);
|
|
}
|
|
#endif
|
|
|
|
@@ -935,7 +965,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
|
|
kfree_skb(buff);
|
|
}
|
|
|
|
-static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
|
|
+void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
const struct tcphdr *th = tcp_hdr(skb);
|
|
struct ipv6hdr *ipv6h = ipv6_hdr(skb);
|
|
@@ -1020,8 +1050,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
|
|
label = ip6_flowlabel(ipv6h);
|
|
}
|
|
|
|
- tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0,
|
|
- label, priority);
|
|
+ tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, 0, oif, key, 1, 0,
|
|
+ label, priority, 0);
|
|
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
out:
|
|
@@ -1030,30 +1060,37 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
|
|
}
|
|
|
|
static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
|
|
- u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
|
|
+ u32 ack, u32 data_ack, u32 win, u32 tsval, u32 tsecr, int oif,
|
|
struct tcp_md5sig_key *key, u8 tclass,
|
|
- __be32 label, u32 priority)
|
|
+ __be32 label, u32 priority, int mptcp)
|
|
{
|
|
- tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0,
|
|
- tclass, label, priority);
|
|
+ tcp_v6_send_response(sk, skb, seq, ack, data_ack, win, tsval, tsecr, oif,
|
|
+ key, 0, tclass, label, priority, mptcp);
|
|
}
|
|
|
|
static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
struct inet_timewait_sock *tw = inet_twsk(sk);
|
|
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
|
|
+ u32 data_ack = 0;
|
|
+ int mptcp = 0;
|
|
|
|
+ if (tcptw->mptcp_tw) {
|
|
+ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
|
|
+ mptcp = 1;
|
|
+ }
|
|
tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
|
|
+ data_ack,
|
|
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
|
|
tcp_time_stamp_raw() + tcptw->tw_ts_offset,
|
|
tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
|
|
- tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority);
|
|
+ tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority, mptcp);
|
|
|
|
inet_twsk_put(tw);
|
|
}
|
|
|
|
-static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
|
|
- struct request_sock *req)
|
|
+void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
|
|
+ struct request_sock *req)
|
|
{
|
|
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
|
|
* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
|
|
@@ -1063,18 +1100,18 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
|
|
* exception of <SYN> segments, MUST be right-shifted by
|
|
* Rcv.Wind.Shift bits:
|
|
*/
|
|
- tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ?
|
|
+ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN || is_meta_sk(sk)) ?
|
|
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
|
|
- tcp_rsk(req)->rcv_nxt,
|
|
+ tcp_rsk(req)->rcv_nxt, 0,
|
|
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
|
|
tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
|
|
req->ts_recent, sk->sk_bound_dev_if,
|
|
tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr),
|
|
- 0, 0, sk->sk_priority);
|
|
+ 0, 0, sk->sk_priority, 0);
|
|
}
|
|
|
|
|
|
-static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb)
|
|
+struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
#ifdef CONFIG_SYN_COOKIES
|
|
const struct tcphdr *th = tcp_hdr(skb);
|
|
@@ -1100,7 +1137,7 @@ u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph,
|
|
return mss;
|
|
}
|
|
|
|
-static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
|
|
+int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
if (skb->protocol == htons(ETH_P_IP))
|
|
return tcp_v4_conn_request(sk, skb);
|
|
@@ -1131,11 +1168,11 @@ static void tcp_v6_restore_cb(struct sk_buff *skb)
|
|
sizeof(struct inet6_skb_parm));
|
|
}
|
|
|
|
-static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
|
|
- struct request_sock *req,
|
|
- struct dst_entry *dst,
|
|
- struct request_sock *req_unhash,
|
|
- bool *own_req)
|
|
+struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
|
|
+ struct request_sock *req,
|
|
+ struct dst_entry *dst,
|
|
+ struct request_sock *req_unhash,
|
|
+ bool *own_req)
|
|
{
|
|
struct inet_request_sock *ireq;
|
|
struct ipv6_pinfo *newnp;
|
|
@@ -1170,7 +1207,15 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
|
|
|
|
newnp->saddr = newsk->sk_v6_rcv_saddr;
|
|
|
|
- inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
|
|
+#ifdef CONFIG_MPTCP
|
|
+ /* We must check on the request-socket because the listener
|
|
+ * socket's flag may have been changed halfway through.
|
|
+ */
|
|
+ if (!inet_rsk(req)->saw_mpc)
|
|
+ inet_csk(newsk)->icsk_af_ops = &mptcp_v6_mapped;
|
|
+ else
|
|
+#endif
|
|
+ inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
|
|
newsk->sk_backlog_rcv = tcp_v4_do_rcv;
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
|
|
@@ -1217,6 +1262,14 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
|
|
if (!newsk)
|
|
goto out_nonewsk;
|
|
|
|
+#ifdef CONFIG_MPTCP
|
|
+ /* If the meta_sk is v6-mapped we can end up here with the wrong af_ops.
|
|
+ * Just make sure that this subflow is v6.
|
|
+ */
|
|
+ if (is_meta_sk(sk))
|
|
+ inet_csk(newsk)->icsk_af_ops = &mptcp_v6_specific;
|
|
+#endif
|
|
+
|
|
/*
|
|
* No need to charge this sock to the relevant IPv6 refcnt debug socks
|
|
* count here, tcp_create_openreq_child now does this for us, see the
|
|
@@ -1344,7 +1397,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
|
|
* This is because we cannot sleep with the original spinlock
|
|
* held.
|
|
*/
|
|
-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
|
|
+int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
struct ipv6_pinfo *np = tcp_inet6_sk(sk);
|
|
struct sk_buff *opt_skb = NULL;
|
|
@@ -1361,6 +1414,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
|
|
if (skb->protocol == htons(ETH_P_IP))
|
|
return tcp_v4_do_rcv(sk, skb);
|
|
|
|
+ if (is_meta_sk(sk))
|
|
+ return mptcp_v6_do_rcv(sk, skb);
|
|
+
|
|
/*
|
|
* socket locking is here for SMP purposes as backlog rcv
|
|
* is currently called with bh processing disabled.
|
|
@@ -1488,6 +1544,10 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
|
|
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
|
|
skb->len - th->doff*4);
|
|
TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
|
|
+#ifdef CONFIG_MPTCP
|
|
+ TCP_SKB_CB(skb)->mptcp_flags = 0;
|
|
+ TCP_SKB_CB(skb)->dss_off = 0;
|
|
+#endif
|
|
TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
|
|
TCP_SKB_CB(skb)->tcp_tw_isn = 0;
|
|
TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
|
|
@@ -1502,8 +1562,8 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
|
|
int sdif = inet6_sdif(skb);
|
|
const struct tcphdr *th;
|
|
const struct ipv6hdr *hdr;
|
|
+ struct sock *sk, *meta_sk = NULL;
|
|
bool refcounted;
|
|
- struct sock *sk;
|
|
int ret;
|
|
struct net *net = dev_net(skb->dev);
|
|
|
|
@@ -1557,12 +1617,17 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
|
|
reqsk_put(req);
|
|
goto csum_error;
|
|
}
|
|
- if (unlikely(sk->sk_state != TCP_LISTEN)) {
|
|
+ if (unlikely(sk->sk_state != TCP_LISTEN && !is_meta_sk(sk))) {
|
|
+ inet_csk_reqsk_queue_drop_and_put(sk, req);
|
|
+ goto lookup;
|
|
+ }
|
|
+ if (unlikely(is_meta_sk(sk) && !mptcp_can_new_subflow(sk))) {
|
|
inet_csk_reqsk_queue_drop_and_put(sk, req);
|
|
goto lookup;
|
|
}
|
|
sock_hold(sk);
|
|
refcounted = true;
|
|
+
|
|
nsk = NULL;
|
|
if (!tcp_filter(sk, skb)) {
|
|
th = (const struct tcphdr *)skb->data;
|
|
@@ -1621,19 +1686,28 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
|
|
|
|
sk_incoming_cpu_update(sk);
|
|
|
|
- bh_lock_sock_nested(sk);
|
|
+ if (mptcp(tcp_sk(sk))) {
|
|
+ meta_sk = mptcp_meta_sk(sk);
|
|
+
|
|
+ bh_lock_sock_nested(meta_sk);
|
|
+ if (sock_owned_by_user(meta_sk))
|
|
+ mptcp_prepare_for_backlog(sk, skb);
|
|
+ } else {
|
|
+ meta_sk = sk;
|
|
+ bh_lock_sock_nested(sk);
|
|
+ }
|
|
tcp_segs_in(tcp_sk(sk), skb);
|
|
ret = 0;
|
|
- if (!sock_owned_by_user(sk)) {
|
|
+ if (!sock_owned_by_user(meta_sk)) {
|
|
skb_to_free = sk->sk_rx_skb_cache;
|
|
sk->sk_rx_skb_cache = NULL;
|
|
ret = tcp_v6_do_rcv(sk, skb);
|
|
} else {
|
|
- if (tcp_add_backlog(sk, skb))
|
|
+ if (tcp_add_backlog(meta_sk, skb))
|
|
goto discard_and_relse;
|
|
skb_to_free = NULL;
|
|
}
|
|
- bh_unlock_sock(sk);
|
|
+ bh_unlock_sock(meta_sk);
|
|
if (skb_to_free)
|
|
__kfree_skb(skb_to_free);
|
|
put_and_return:
|
|
@@ -1647,6 +1721,19 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
|
|
|
|
tcp_v6_fill_cb(skb, hdr, th);
|
|
|
|
+#ifdef CONFIG_MPTCP
|
|
+ if (!sk && th->syn && !th->ack) {
|
|
+ int ret = mptcp_lookup_join(skb, NULL);
|
|
+
|
|
+ if (ret < 0) {
|
|
+ tcp_v6_send_reset(NULL, skb);
|
|
+ goto discard_it;
|
|
+ } else if (ret > 0) {
|
|
+ return 0;
|
|
+ }
|
|
+ }
|
|
+#endif
|
|
+
|
|
if (tcp_checksum_complete(skb)) {
|
|
csum_error:
|
|
__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
|
|
@@ -1699,6 +1786,18 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
|
|
refcounted = false;
|
|
goto process;
|
|
}
|
|
+#ifdef CONFIG_MPTCP
|
|
+ if (th->syn && !th->ack) {
|
|
+ int ret = mptcp_lookup_join(skb, inet_twsk(sk));
|
|
+
|
|
+ if (ret < 0) {
|
|
+ tcp_v6_send_reset(NULL, skb);
|
|
+ goto discard_it;
|
|
+ } else if (ret > 0) {
|
|
+ return 0;
|
|
+ }
|
|
+ }
|
|
+#endif
|
|
}
|
|
/* to ACK */
|
|
/* fall through */
|
|
@@ -1753,13 +1852,13 @@ INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb)
|
|
}
|
|
}
|
|
|
|
-static struct timewait_sock_ops tcp6_timewait_sock_ops = {
|
|
+struct timewait_sock_ops tcp6_timewait_sock_ops = {
|
|
.twsk_obj_size = sizeof(struct tcp6_timewait_sock),
|
|
.twsk_unique = tcp_twsk_unique,
|
|
.twsk_destructor = tcp_twsk_destructor,
|
|
};
|
|
|
|
-static const struct inet_connection_sock_af_ops ipv6_specific = {
|
|
+const struct inet_connection_sock_af_ops ipv6_specific = {
|
|
.queue_xmit = inet6_csk_xmit,
|
|
.send_check = tcp_v6_send_check,
|
|
.rebuild_header = inet6_sk_rebuild_header,
|
|
@@ -1790,7 +1889,7 @@ INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb)
|
|
/*
|
|
* TCP over IPv4 via INET6 API
|
|
*/
|
|
-static const struct inet_connection_sock_af_ops ipv6_mapped = {
|
|
+const struct inet_connection_sock_af_ops ipv6_mapped = {
|
|
.queue_xmit = ip_queue_xmit,
|
|
.send_check = tcp_v4_send_check,
|
|
.rebuild_header = inet_sk_rebuild_header,
|
|
@@ -1826,7 +1925,12 @@ static int tcp_v6_init_sock(struct sock *sk)
|
|
|
|
tcp_init_sock(sk);
|
|
|
|
- icsk->icsk_af_ops = &ipv6_specific;
|
|
+#ifdef CONFIG_MPTCP
|
|
+ if (sock_flag(sk, SOCK_MPTCP))
|
|
+ icsk->icsk_af_ops = &mptcp_v6_specific;
|
|
+ else
|
|
+#endif
|
|
+ icsk->icsk_af_ops = &ipv6_specific;
|
|
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific;
|
|
@@ -1835,7 +1939,7 @@ static int tcp_v6_init_sock(struct sock *sk)
|
|
return 0;
|
|
}
|
|
|
|
-static void tcp_v6_destroy_sock(struct sock *sk)
|
|
+void tcp_v6_destroy_sock(struct sock *sk)
|
|
{
|
|
tcp_v4_destroy_sock(sk);
|
|
inet6_destroy_sock(sk);
|
|
@@ -2058,6 +2162,11 @@ struct proto tcpv6_prot = {
|
|
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
|
|
.max_header = MAX_TCP_HEADER,
|
|
.obj_size = sizeof(struct tcp6_sock),
|
|
+#ifdef CONFIG_MPTCP
|
|
+ .useroffset = offsetof(struct tcp_sock, mptcp_sched_name),
|
|
+ .usersize = sizeof_field(struct tcp_sock, mptcp_sched_name) +
|
|
+ sizeof_field(struct tcp_sock, mptcp_pm_name),
|
|
+#endif
|
|
.slab_flags = SLAB_TYPESAFE_BY_RCU,
|
|
.twsk_prot = &tcp6_timewait_sock_ops,
|
|
.rsk_prot = &tcp6_request_sock_ops,
|
|
@@ -2068,6 +2177,9 @@ struct proto tcpv6_prot = {
|
|
.compat_getsockopt = compat_tcp_getsockopt,
|
|
#endif
|
|
.diag_destroy = tcp_abort,
|
|
+#ifdef CONFIG_MPTCP
|
|
+ .clear_sk = mptcp_clear_sk,
|
|
+#endif
|
|
};
|
|
|
|
/* thinking of making this const? Don't.
|
|
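tcp_v6_send_response() above appends an 8-byte DSS option carrying a 32-bit data ACK whenever the caller passes mptcp != 0: the first 32-bit word packs option kind, length, subtype and flags, the second carries the data ACK itself. The sketch below reproduces that packing in self-contained user-space C. The constant values are read off the expression in the hunk and RFC 6824 (kind 30, total length 8, DSS subtype 2 in the high nibble, flag 0x01 for "data ACK present, 32 bit"); treat them as this patch's assumptions rather than authoritative definitions.

#include <stdio.h>
#include <stdint.h>

#define TCPOPT_MPTCP       30   /* IANA-assigned MPTCP option kind */
#define MPTCP_DSS_ACK_LEN  8    /* kind + len + subtype/flags + 32-bit data ACK */
#define MPTCP_SUB_DSS      0x2  /* DSS subtype */
#define MPTCP_DSS_FLAG_A   0x01 /* data ACK present (32 bit) */

int main(void)
{
	uint32_t data_ack = 0x11223344; /* example data-level ACK */
	uint32_t word0, word1;

	/* Same packing as the hunk:
	 * (TCPOPT_MPTCP << 24) | (len << 16) | (0x20 << 8) | 0x01
	 */
	word0 = ((uint32_t)TCPOPT_MPTCP << 24) |
		((uint32_t)MPTCP_DSS_ACK_LEN << 16) |
		((uint32_t)(MPTCP_SUB_DSS << 4) << 8) |
		MPTCP_DSS_FLAG_A;
	word1 = data_ack;

	printf("DSS option words (host order): %08x %08x\n",
	       (unsigned)word0, (unsigned)word1);
	/* On the wire both words are converted with htonl(), as in the patch. */
	return 0;
}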
diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
|
|
new file mode 100644
|
|
index 000000000000..6e05dab4c632
|
|
--- /dev/null
|
|
+++ b/net/mptcp/Kconfig
|
|
@@ -0,0 +1,154 @@
|
|
+#
|
|
+# MPTCP configuration
|
|
+#
|
|
+config MPTCP
|
|
+ bool "MPTCP protocol"
|
|
+ depends on (IPV6=y || IPV6=n)
|
|
+ select CRYPTO_LIB_SHA256
|
|
+ select CRYPTO
|
|
+ ---help---
|
|
+ This replaces the normal TCP stack with a Multipath TCP stack,
|
|
+ able to use several paths at once.
|
|
+
|
|
+menuconfig MPTCP_PM_ADVANCED
|
|
+ bool "MPTCP: advanced path-manager control"
|
|
+ depends on MPTCP=y
|
|
+ ---help---
|
|
+ Support for selection of different path-managers. You should choose 'Y' here,
|
|
+ because otherwise you will not actively create new MPTCP-subflows.
|
|
+
|
|
+if MPTCP_PM_ADVANCED
|
|
+
|
|
+config MPTCP_FULLMESH
|
|
+ tristate "MPTCP Full-Mesh Path-Manager"
|
|
+ depends on MPTCP=y
|
|
+ ---help---
|
|
+ This path-management module will create a full-mesh among all IP-addresses.
|
|
+
|
|
+config MPTCP_NDIFFPORTS
|
|
+ tristate "MPTCP ndiff-ports"
|
|
+ depends on MPTCP=y
|
|
+ ---help---
|
|
+ This path-management module will create multiple subflows between the same
|
|
+ pair of IP-addresses, modifying the source-port. You can set the number
|
|
+ of subflows via the mptcp_ndiffports-sysctl.
|
|
+
|
|
+config MPTCP_BINDER
|
|
+ tristate "MPTCP Binder"
|
|
+ depends on (MPTCP=y)
|
|
+ ---help---
|
|
+ This path-management module works like ndiffports, and adds the sysctl
|
|
+ option to set the gateway (and/or the path) for each additional subflow
|
|
+ via Loose Source Routing (IPv4 only).
|
|
+
|
|
+config MPTCP_NETLINK
|
|
+ tristate "MPTCP Netlink Path-Manager"
|
|
+ depends on MPTCP=y
|
|
+ ---help---
|
|
+ This path-management module is controlled over a Netlink interface. A userspace
|
|
+ module can therefore control the establishment of new subflows and the policy
|
|
+ to apply over those new subflows for every connection.
|
|
+
|
|
+choice
|
|
+ prompt "Default MPTCP Path-Manager"
|
|
+ default DEFAULT_DUMMY
|
|
+ help
|
|
+ Select the Path-Manager of your choice
|
|
+
|
|
+ config DEFAULT_FULLMESH
|
|
+ bool "Full mesh" if MPTCP_FULLMESH=y
|
|
+
|
|
+ config DEFAULT_NDIFFPORTS
|
|
+ bool "ndiff-ports" if MPTCP_NDIFFPORTS=y
|
|
+
|
|
+ config DEFAULT_BINDER
|
|
+ bool "binder" if MPTCP_BINDER=y
|
|
+
|
|
+ config DEFAULT_NETLINK
|
|
+ bool "Netlink" if MPTCP_NETLINK=y
|
|
+
|
|
+ config DEFAULT_DUMMY
|
|
+ bool "Default"
|
|
+
|
|
+endchoice
|
|
+
|
|
+endif
|
|
+
|
|
+config DEFAULT_MPTCP_PM
|
|
+ string
|
|
+ default "default" if DEFAULT_DUMMY
|
|
+ default "fullmesh" if DEFAULT_FULLMESH
|
|
+ default "ndiffports" if DEFAULT_NDIFFPORTS
|
|
+ default "binder" if DEFAULT_BINDER
|
|
+ default "default"
|
|
+
|
|
+menuconfig MPTCP_SCHED_ADVANCED
|
|
+ bool "MPTCP: advanced scheduler control"
|
|
+ depends on MPTCP=y
|
|
+ ---help---
|
|
+ Support for selection of different schedulers. You should choose 'Y' here,
|
|
+ if you want to choose a different scheduler than the default one.
|
|
+
|
|
+if MPTCP_SCHED_ADVANCED
|
|
+
|
|
+config MPTCP_BLEST
|
|
+ tristate "MPTCP BLEST"
|
|
+ depends on MPTCP=y
|
|
+ ---help---
|
|
+ This is an experimental BLocking ESTimation-based (BLEST) scheduler.
|
|
+
|
|
+config MPTCP_ROUNDROBIN
|
|
+ tristate "MPTCP Round-Robin"
|
|
+ depends on (MPTCP=y)
|
|
+ ---help---
|
|
+ This is a very simple round-robin scheduler. Probably has bad performance
|
|
+ but might be interesting for researchers.
|
|
+
|
|
+config MPTCP_REDUNDANT
|
|
+ tristate "MPTCP Redundant"
|
|
+ depends on (MPTCP=y)
|
|
+ ---help---
|
|
+ This scheduler sends all packets redundantly over all subflows to decrease
|
|
+ latency and jitter at the cost of lower throughput.
|
|
+
|
|
+config MPTCP_ECF
|
|
+ tristate "MPTCP ECF"
|
|
+ depends on (MPTCP=y)
|
|
+ ---help---
|
|
+ This is an experimental Earliest Completion First (ECF) scheduler.
|
|
+
|
|
+choice
|
|
+ prompt "Default MPTCP Scheduler"
|
|
+ default DEFAULT_SCHEDULER
|
|
+ help
|
|
+ Select the Scheduler of your choice
|
|
+
|
|
+ config DEFAULT_SCHEDULER
|
|
+ bool "Default"
|
|
+ ---help---
|
|
+ This is the default scheduler, sending first on the subflow
|
|
+ with the lowest RTT.
|
|
+
|
|
+ config DEFAULT_ROUNDROBIN
|
|
+ bool "Round-Robin" if MPTCP_ROUNDROBIN=y
|
|
+ ---help---
|
|
+ This is the round-robin scheduler, sending data over the subflows in a
|
|
+ round-robin fashion.
|
|
+
|
|
+ config DEFAULT_REDUNDANT
|
|
+ bool "Redundant" if MPTCP_REDUNDANT=y
|
|
+ ---help---
|
|
+ This is the redundant scheduler, sending packets redundantly over
|
|
+ all the subflows.
|
|
+
|
|
+endchoice
|
|
+endif
|
|
+
|
|
+config DEFAULT_MPTCP_SCHED
|
|
+ string
|
|
+ depends on (MPTCP=y)
|
|
+ default "default" if DEFAULT_SCHEDULER
|
|
+ default "roundrobin" if DEFAULT_ROUNDROBIN
|
|
+ default "redundant" if DEFAULT_REDUNDANT
|
|
+ default "default"
|
|
+
|
|
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
|
|
new file mode 100644
|
|
index 000000000000..369248a2f68e
|
|
--- /dev/null
|
|
+++ b/net/mptcp/Makefile
|
|
@@ -0,0 +1,25 @@
|
|
+#
|
|
+## Makefile for MultiPath TCP support code.
|
|
+#
|
|
+#
|
|
+
|
|
+obj-$(CONFIG_MPTCP) += mptcp.o
|
|
+
|
|
+mptcp-y := mptcp_ctrl.o mptcp_ipv4.o mptcp_pm.o \
|
|
+ mptcp_output.o mptcp_input.o mptcp_sched.o
|
|
+
|
|
+obj-$(CONFIG_TCP_CONG_LIA) += mptcp_coupled.o
|
|
+obj-$(CONFIG_TCP_CONG_OLIA) += mptcp_olia.o
|
|
+obj-$(CONFIG_TCP_CONG_WVEGAS) += mptcp_wvegas.o
|
|
+obj-$(CONFIG_TCP_CONG_BALIA) += mptcp_balia.o
|
|
+obj-$(CONFIG_TCP_CONG_MCTCPDESYNC) += mctcp_desync.o
|
|
+obj-$(CONFIG_MPTCP_FULLMESH) += mptcp_fullmesh.o
|
|
+obj-$(CONFIG_MPTCP_NDIFFPORTS) += mptcp_ndiffports.o
|
|
+obj-$(CONFIG_MPTCP_BINDER) += mptcp_binder.o
|
|
+obj-$(CONFIG_MPTCP_NETLINK) += mptcp_netlink.o
|
|
+obj-$(CONFIG_MPTCP_ROUNDROBIN) += mptcp_rr.o
|
|
+obj-$(CONFIG_MPTCP_REDUNDANT) += mptcp_redundant.o
|
|
+obj-$(CONFIG_MPTCP_BLEST) += mptcp_blest.o
|
|
+obj-$(CONFIG_MPTCP_ECF) += mptcp_ecf.o
|
|
+
|
|
+mptcp-$(subst m,y,$(CONFIG_IPV6)) += mptcp_ipv6.o
|
|
diff --git a/net/mptcp/mctcp_desync.c b/net/mptcp/mctcp_desync.c
|
|
new file mode 100644
|
|
index 000000000000..f6bf9251d59b
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mctcp_desync.c
|
|
@@ -0,0 +1,193 @@
|
|
+/*
|
|
+ * Desynchronized Multi-Channel TCP Congestion Control Algorithm
|
|
+ *
|
|
+ * Implementation based on the publications "DMCTCP: Desynchronized Multi-Channel
|
|
+ * TCP for high speed access networks with tiny buffers", 23rd International
|
|
+ * Conference on Computer Communication and Networks (ICCCN), 2014, and
|
|
+ * "Exploring parallelism and desynchronization of TCP over high speed networks
|
|
+ * with tiny buffers", Computer Communications (Elsevier), 2015.
|
|
+ *
|
|
+ * http://ieeexplore.ieee.org/abstract/document/6911722/
|
|
+ * https://doi.org/10.1016/j.comcom.2015.07.010
|
|
+ *
|
|
+ * This prototype is for research purposes and is currently experimental code
|
|
+ * that only supports a single path. Future support of multi-channel over
|
|
+ * multi-path requires channel grouping.
|
|
+ *
|
|
+ * Initial Design and Implementation:
|
|
+ * Cheng Cui <Cheng.Cui@netapp.com>
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or modify it
|
|
+ * under the terms of the GNU General Public License as published by the Free
|
|
+ * Software Foundation; either version 2 of the License, or (at your option)
|
|
+ * any later version.
|
|
+ */
|
|
+#include <net/tcp.h>
|
|
+#include <net/mptcp.h>
|
|
+#include <linux/module.h>
|
|
+
|
|
+enum {
|
|
+ MASTER_CHANNEL = 1,
|
|
+ INI_MIN_CWND = 2,
|
|
+};
|
|
+
|
|
+/* private congestion control structure:
|
|
+ * off_tstamp: the last backoff timestamp for loss synchronization event
|
|
+ * off_subfid: the subflow which was backoff on off_tstamp
|
|
+ */
|
|
+struct mctcp_desync {
|
|
+ u64 off_tstamp;
|
|
+ u8 off_subfid;
|
|
+};
|
|
+
|
|
+static inline int mctcp_cc_sk_can_send(const struct sock *sk)
|
|
+{
|
|
+ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
|
|
+}
|
|
+
|
|
+static void mctcp_desync_init(struct sock *sk)
|
|
+{
|
|
+ if (mptcp(tcp_sk(sk))) {
|
|
+ struct mctcp_desync *ca = inet_csk_ca(mptcp_meta_sk(sk));
|
|
+ ca->off_tstamp = 0;
|
|
+ ca->off_subfid = 0;
|
|
+ }
|
|
+ /* If we do not mptcp, behave like reno: return */
|
|
+}
|
|
+
|
|
+static void mctcp_desync_cong_avoid(struct sock *sk, u32 ack, u32 acked)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+
|
|
+ if (!mptcp(tp)) {
|
|
+ tcp_reno_cong_avoid(sk, ack, acked);
|
|
+ return;
|
|
+ } else if (!tcp_is_cwnd_limited(sk)) {
|
|
+ return;
|
|
+ } else {
|
|
+ const struct mctcp_desync *ca = inet_csk_ca(mptcp_meta_sk(sk));
|
|
+ const u8 subfid = tp->mptcp->path_index;
|
|
+
|
|
+ /* current aggregated cwnd */
|
|
+ u32 agg_cwnd = 0;
|
|
+ u32 min_cwnd = 0xffffffff;
|
|
+ u8 min_cwnd_subfid = 0;
|
|
+
|
|
+ /* In "safe" area, increase */
|
|
+ if (tcp_in_slow_start(tp)) {
|
|
+ if (ca->off_subfid) {
|
|
+ /* passed initial phase, allow slow start */
|
|
+ tcp_slow_start(tp, acked);
|
|
+ } else if (MASTER_CHANNEL == tp->mptcp->path_index) {
|
|
+ /* the master channel does normal slow start in
|
|
+ * the initial phase */
|
|
+ tcp_slow_start(tp, acked);
|
|
+ } else {
|
|
+ /* secondary channels increase slowly until
|
|
+ * the initial phase has passed
|
|
+ */
|
|
+ tp->snd_ssthresh = tp->snd_cwnd = INI_MIN_CWND;
|
|
+ }
|
|
+ return;
|
|
+ } else {
|
|
+ /* In dangerous area, increase slowly and linearly. */
|
|
+ const struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ /* get total cwnd and the subflow that has min cwnd */
|
|
+ mptcp_for_each_sub(tp->mpcb, mptcp) {
|
|
+ const struct sock *sub_sk = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (mctcp_cc_sk_can_send(sub_sk)) {
|
|
+ const struct tcp_sock *sub_tp =
|
|
+ tcp_sk(sub_sk);
|
|
+ agg_cwnd += sub_tp->snd_cwnd;
|
|
+ if(min_cwnd > sub_tp->snd_cwnd) {
|
|
+ min_cwnd = sub_tp->snd_cwnd;
|
|
+ min_cwnd_subfid =
|
|
+ sub_tp->mptcp->path_index;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ /* the smallest subflow grows faster than others */
|
|
+ if (subfid == min_cwnd_subfid) {
|
|
+ tcp_cong_avoid_ai(tp, min_cwnd, acked);
|
|
+ } else {
|
|
+ tcp_cong_avoid_ai(tp, agg_cwnd - min_cwnd,
|
|
+ acked);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+static u32 mctcp_desync_ssthresh(struct sock *sk)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+
|
|
+ if (!mptcp(tp)) {
|
|
+ return max(tp->snd_cwnd >> 1U, 2U);
|
|
+ } else {
|
|
+ struct mctcp_desync *ca = inet_csk_ca(mptcp_meta_sk(sk));
|
|
+ const u8 subfid = tp->mptcp->path_index;
|
|
+ const struct mptcp_tcp_sock *mptcp;
|
|
+ u32 max_cwnd = 0;
|
|
+ u8 max_cwnd_subfid = 0;
|
|
+
|
|
+ /* Find the subflow that has the max cwnd. */
|
|
+ mptcp_for_each_sub(tp->mpcb, mptcp) {
|
|
+ const struct sock *sub_sk = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (mctcp_cc_sk_can_send(sub_sk)) {
|
|
+ const struct tcp_sock *sub_tp = tcp_sk(sub_sk);
|
|
+ if (max_cwnd < sub_tp->snd_cwnd) {
|
|
+ max_cwnd = sub_tp->snd_cwnd;
|
|
+ max_cwnd_subfid =
|
|
+ sub_tp->mptcp->path_index;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ /* Use high resolution clock. */
|
|
+ if (subfid == max_cwnd_subfid) {
|
|
+ u64 now = tcp_clock_us();
|
|
+ u32 delta = tcp_stamp_us_delta(now, ca->off_tstamp);
|
|
+
|
|
+ if (delta < (tp->srtt_us >> 3)) {
|
|
+ /* desynchronize */
|
|
+ return tp->snd_cwnd;
|
|
+ } else {
|
|
+ ca->off_tstamp = now;
|
|
+ ca->off_subfid = subfid;
|
|
+ return max(max_cwnd >> 1U, 2U);
|
|
+ }
|
|
+ } else {
|
|
+ return tp->snd_cwnd;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+static struct tcp_congestion_ops mctcp_desync = {
|
|
+ .init = mctcp_desync_init,
|
|
+ .ssthresh = mctcp_desync_ssthresh,
|
|
+ .undo_cwnd = tcp_reno_undo_cwnd,
|
|
+ .cong_avoid = mctcp_desync_cong_avoid,
|
|
+ .owner = THIS_MODULE,
|
|
+ .name = "mctcpdesync",
|
|
+};
|
|
+
|
|
+static int __init mctcp_desync_register(void)
|
|
+{
|
|
+ BUILD_BUG_ON(sizeof(struct mctcp_desync) > ICSK_CA_PRIV_SIZE);
|
|
+ return tcp_register_congestion_control(&mctcp_desync);
|
|
+}
|
|
+
|
|
+static void __exit mctcp_desync_unregister(void)
|
|
+{
|
|
+ tcp_unregister_congestion_control(&mctcp_desync);
|
|
+}
|
|
+
|
|
+module_init(mctcp_desync_register);
|
|
+module_exit(mctcp_desync_unregister);
|
|
+
|
|
+MODULE_AUTHOR("Cheng Cui");
|
|
+MODULE_LICENSE("GPL");
|
|
+MODULE_DESCRIPTION("MCTCP: DESYNCHRONIZED MULTICHANNEL TCP CONGESTION CONTROL");
|
|
+MODULE_VERSION("1.0");
|
|
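Once the mctcp_desync module above is built and loaded it registers under the name "mctcpdesync" and can be selected per socket like any other congestion-control module. A minimal user-space usage example follows; it assumes the module is available on the running kernel (and, for the multi-channel behaviour described in the file header, an MPTCP-enabled kernel), and may require the appropriate privileges for non-default congestion controls.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	const char cc[] = "mctcpdesync"; /* name registered by the module above */
	char buf[16] = { 0 };
	socklen_t len = sizeof(buf);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* Select the congestion control for this socket; this fails with
	 * ENOENT if the module is not loaded.
	 */
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, cc, strlen(cc)) < 0)
		perror("setsockopt(TCP_CONGESTION)");
	else if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, buf, &len) == 0)
		printf("congestion control now: %s\n", buf);
	return 0;
}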
diff --git a/net/mptcp/mptcp_balia.c b/net/mptcp/mptcp_balia.c
|
|
new file mode 100644
|
|
index 000000000000..179b53dea020
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mptcp_balia.c
|
|
@@ -0,0 +1,261 @@
|
|
+/*
|
|
+ * MPTCP implementation - Balia Congestion Control
|
|
+ * (Balanced Linked Adaptation Algorithm)
|
|
+ *
|
|
+ * Analysis, Design and Implementation:
|
|
+ * Qiuyu Peng <qpeng@caltech.edu>
|
|
+ * Anwar Walid <anwar@research.bell-labs.com>
|
|
+ * Jaehyun Hwang <jhyun.hwang@samsung.com>
|
|
+ * Steven H. Low <slow@caltech.edu>
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU General Public License
|
|
+ * as published by the Free Software Foundation; either version
|
|
+ * 2 of the License, or (at your option) any later version.
|
|
+ */
|
|
+
|
|
+#include <net/tcp.h>
|
|
+#include <net/mptcp.h>
|
|
+
|
|
+#include <linux/module.h>
|
|
+
|
|
+/* The variable 'rate' (i.e., x_r) will be scaled
|
|
+ * e.g., from B/s to KB/s, MB/s, or GB/s
|
|
+ * if max_rate > 2^rate_scale_limit
|
|
+ */
|
|
+
|
|
+static int rate_scale_limit = 25;
|
|
+static int alpha_scale = 10;
|
|
+static int scale_num = 5;
|
|
+
|
|
+struct mptcp_balia {
|
|
+ u64 ai;
|
|
+ u64 md;
|
|
+ bool forced_update;
|
|
+};
|
|
+
|
|
+static inline int mptcp_balia_sk_can_send(const struct sock *sk)
|
|
+{
|
|
+ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
|
|
+}
|
|
+
|
|
+static inline u64 mptcp_get_ai(const struct sock *meta_sk)
|
|
+{
|
|
+ return ((struct mptcp_balia *)inet_csk_ca(meta_sk))->ai;
|
|
+}
|
|
+
|
|
+static inline void mptcp_set_ai(const struct sock *meta_sk, u64 ai)
|
|
+{
|
|
+ ((struct mptcp_balia *)inet_csk_ca(meta_sk))->ai = ai;
|
|
+}
|
|
+
|
|
+static inline u64 mptcp_get_md(const struct sock *meta_sk)
|
|
+{
|
|
+ return ((struct mptcp_balia *)inet_csk_ca(meta_sk))->md;
|
|
+}
|
|
+
|
|
+static inline void mptcp_set_md(const struct sock *meta_sk, u64 md)
|
|
+{
|
|
+ ((struct mptcp_balia *)inet_csk_ca(meta_sk))->md = md;
|
|
+}
|
|
+
|
|
+static inline u64 mptcp_balia_scale(u64 val, int scale)
|
|
+{
|
|
+ return (u64) val << scale;
|
|
+}
|
|
+
|
|
+static inline bool mptcp_get_forced(const struct sock *meta_sk)
|
|
+{
|
|
+ return ((struct mptcp_balia *)inet_csk_ca(meta_sk))->forced_update;
|
|
+}
|
|
+
|
|
+static inline void mptcp_set_forced(const struct sock *meta_sk, bool force)
|
|
+{
|
|
+ ((struct mptcp_balia *)inet_csk_ca(meta_sk))->forced_update = force;
|
|
+}
|
|
+
|
|
+static void mptcp_balia_recalc_ai(const struct sock *sk)
|
|
+{
|
|
+ const struct tcp_sock *tp = tcp_sk(sk);
|
|
+ const struct mptcp_cb *mpcb = tp->mpcb;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ u64 max_rate = 0, rate = 0, sum_rate = 0;
|
|
+ u64 alpha, ai = tp->snd_cwnd, md = (tp->snd_cwnd >> 1);
|
|
+ int num_scale_down = 0;
|
|
+
|
|
+ if (!mpcb)
|
|
+ return;
|
|
+
|
|
+ /* Find max_rate first */
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ const struct sock *sub_sk = mptcp_to_sock(mptcp);
|
|
+ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
|
|
+ u64 tmp;
|
|
+
|
|
+ if (!mptcp_balia_sk_can_send(sub_sk))
|
|
+ continue;
|
|
+
|
|
+ tmp = div_u64((u64)tp->mss_cache * sub_tp->snd_cwnd
|
|
+ * (USEC_PER_SEC << 3), sub_tp->srtt_us);
|
|
+ sum_rate += tmp;
|
|
+
|
|
+ if (tp == sub_tp)
|
|
+ rate = tmp;
|
|
+
|
|
+ if (tmp >= max_rate)
|
|
+ max_rate = tmp;
|
|
+ }
|
|
+
|
|
+ /* At least, the current subflow should be able to send */
|
|
+ if (unlikely(!rate))
|
|
+ goto exit;
|
|
+
|
|
+ alpha = div64_u64(max_rate, rate);
|
|
+
|
|
+ /* Scale down max_rate if it is too high (e.g., >2^25) */
|
|
+ while (max_rate > mptcp_balia_scale(1, rate_scale_limit)) {
|
|
+ max_rate >>= scale_num;
|
|
+ num_scale_down++;
|
|
+ }
|
|
+
|
|
+ if (num_scale_down) {
|
|
+ sum_rate = 0;
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ const struct sock *sub_sk = mptcp_to_sock(mptcp);
|
|
+ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
|
|
+ u64 tmp;
|
|
+
|
|
+ if (!mptcp_balia_sk_can_send(sub_sk))
|
|
+ continue;
|
|
+
|
|
+ tmp = div_u64((u64)tp->mss_cache * sub_tp->snd_cwnd
|
|
+ * (USEC_PER_SEC << 3), sub_tp->srtt_us);
|
|
+ tmp >>= (scale_num * num_scale_down);
|
|
+
|
|
+ sum_rate += tmp;
|
|
+ }
|
|
+ rate >>= (scale_num * num_scale_down);
|
|
+ }
|
|
+
|
|
+ /* (sum_rate)^2 * 10 * w_r
|
|
+ * ai = ------------------------------------
|
|
+ * (x_r + max_rate) * (4x_r + max_rate)
|
|
+ */
|
|
+ sum_rate *= sum_rate;
|
|
+
|
|
+ ai = div64_u64(sum_rate * 10, rate + max_rate);
|
|
+ ai = div64_u64(ai * tp->snd_cwnd, (rate << 2) + max_rate);
|
|
+
|
|
+ if (unlikely(!ai))
|
|
+ ai = tp->snd_cwnd;
|
|
+
|
|
+ md = ((tp->snd_cwnd >> 1) * min(mptcp_balia_scale(alpha, alpha_scale),
|
|
+ mptcp_balia_scale(3, alpha_scale) >> 1))
|
|
+ >> alpha_scale;
|
|
+
|
|
+exit:
|
|
+ mptcp_set_ai(sk, ai);
|
|
+ mptcp_set_md(sk, md);
|
|
+}
|
|
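For illustration only (not part of the patch): a minimal user-space sketch of the arithmetic mptcp_balia_recalc_ai() performs, with made-up rates for two subflows. The kernel additionally rescales the rates (scale_num bits at a time) once max_rate exceeds 2^rate_scale_limit so the 64-bit products cannot overflow; the sketch skips that step.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* assumed example values: x_r is this subflow's rate, in bytes/s */
    uint64_t rate = 10000000;             /* x_r */
    uint64_t max_rate = 40000000;         /* fastest subflow */
    uint64_t sum_rate = rate + max_rate;  /* sum over all subflows */
    uint64_t cwnd = 100;                  /* w_r, in packets */

    /* ai = (sum_rate)^2 * 10 * w_r / ((x_r + max_rate) * (4*x_r + max_rate)) */
    uint64_t ai = sum_rate * sum_rate * 10 / (rate + max_rate);
    ai = ai * cwnd / (4 * rate + max_rate);

    /* md = (w_r / 2) * min(max_rate / x_r, 3/2) */
    double alpha = (double)max_rate / rate;
    if (alpha > 1.5)
        alpha = 1.5;
    uint64_t md = (uint64_t)((cwnd / 2) * alpha);

    /* the window grows by one segment every `ai` ACKs, and
     * mptcp_balia_ssthresh() shrinks it by `md` on loss
     */
    printf("ai=%llu md=%llu\n", (unsigned long long)ai, (unsigned long long)md);
    return 0;
}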
+
|
|
+static void mptcp_balia_init(struct sock *sk)
|
|
+{
|
|
+ if (mptcp(tcp_sk(sk))) {
|
|
+ mptcp_set_forced(sk, 0);
|
|
+ mptcp_set_ai(sk, 0);
|
|
+ mptcp_set_md(sk, 0);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void mptcp_balia_cwnd_event(struct sock *sk, enum tcp_ca_event event)
|
|
+{
|
|
+ if (event == CA_EVENT_COMPLETE_CWR || event == CA_EVENT_LOSS)
|
|
+ mptcp_balia_recalc_ai(sk);
|
|
+}
|
|
+
|
|
+static void mptcp_balia_set_state(struct sock *sk, u8 ca_state)
|
|
+{
|
|
+ if (!mptcp(tcp_sk(sk)))
|
|
+ return;
|
|
+
|
|
+ mptcp_set_forced(sk, 1);
|
|
+}
|
|
+
|
|
+static void mptcp_balia_cong_avoid(struct sock *sk, u32 ack, u32 acked)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ int snd_cwnd;
|
|
+
|
|
+ if (!mptcp(tp)) {
|
|
+ tcp_reno_cong_avoid(sk, ack, acked);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (!tcp_is_cwnd_limited(sk))
|
|
+ return;
|
|
+
|
|
+ if (tcp_in_slow_start(tp)) {
|
|
+ /* In "safe" area, increase. */
|
|
+ tcp_slow_start(tp, acked);
|
|
+ mptcp_balia_recalc_ai(sk);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (mptcp_get_forced(mptcp_meta_sk(sk))) {
|
|
+ mptcp_balia_recalc_ai(sk);
|
|
+ mptcp_set_forced(sk, 0);
|
|
+ }
|
|
+
|
|
+ snd_cwnd = (int)mptcp_get_ai(sk);
|
|
+
|
|
+ if (tp->snd_cwnd_cnt >= snd_cwnd) {
|
|
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
|
|
+ tp->snd_cwnd++;
|
|
+ mptcp_balia_recalc_ai(sk);
|
|
+ }
|
|
+
|
|
+ tp->snd_cwnd_cnt = 0;
|
|
+ } else {
|
|
+ tp->snd_cwnd_cnt++;
|
|
+ }
|
|
+}
|
|
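The snd_cwnd_cnt bookkeeping above turns the fractional increase into integer arithmetic: the window grows by one full segment only once every `ai` ACKed segments. A tiny user-space sketch (not part of the patch, with an assumed fixed ai):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint32_t snd_cwnd = 10, snd_cwnd_cnt = 0, snd_cwnd_clamp = 65535;
    uint32_t ai = 25;    /* assumed output of the recalculation */

    for (int ack = 0; ack < 100; ack++) {
        if (snd_cwnd_cnt >= ai) {
            if (snd_cwnd < snd_cwnd_clamp)
                snd_cwnd++;
            snd_cwnd_cnt = 0;
        } else {
            snd_cwnd_cnt++;
        }
    }
    printf("after 100 ACKs: cwnd=%u\n", snd_cwnd);    /* prints cwnd=13 */
    return 0;
}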
+
|
|
+static u32 mptcp_balia_ssthresh(struct sock *sk)
|
|
+{
|
|
+ const struct tcp_sock *tp = tcp_sk(sk);
|
|
+
|
|
+ if (unlikely(!mptcp(tp)))
|
|
+ return tcp_reno_ssthresh(sk);
|
|
+ else
|
|
+ return max((u32)(tp->snd_cwnd - mptcp_get_md(sk)), 1U);
|
|
+}
|
|
+
|
|
+static struct tcp_congestion_ops mptcp_balia = {
|
|
+ .init = mptcp_balia_init,
|
|
+ .ssthresh = mptcp_balia_ssthresh,
|
|
+ .cong_avoid = mptcp_balia_cong_avoid,
|
|
+ .undo_cwnd = tcp_reno_undo_cwnd,
|
|
+ .cwnd_event = mptcp_balia_cwnd_event,
|
|
+ .set_state = mptcp_balia_set_state,
|
|
+ .owner = THIS_MODULE,
|
|
+ .name = "balia",
|
|
+};
|
|
+
|
|
+static int __init mptcp_balia_register(void)
|
|
+{
|
|
+ BUILD_BUG_ON(sizeof(struct mptcp_balia) > ICSK_CA_PRIV_SIZE);
|
|
+ return tcp_register_congestion_control(&mptcp_balia);
|
|
+}
|
|
+
|
|
+static void __exit mptcp_balia_unregister(void)
|
|
+{
|
|
+ tcp_unregister_congestion_control(&mptcp_balia);
|
|
+}
|
|
+
|
|
+module_init(mptcp_balia_register);
|
|
+module_exit(mptcp_balia_unregister);
|
|
+
|
|
+MODULE_AUTHOR("Jaehyun Hwang, Anwar Walid, Qiuyu Peng, Steven H. Low");
|
|
+MODULE_LICENSE("GPL");
|
|
+MODULE_DESCRIPTION("MPTCP BALIA CONGESTION CONTROL ALGORITHM");
|
|
+MODULE_VERSION("0.1");
|
|
diff --git a/net/mptcp/mptcp_binder.c b/net/mptcp/mptcp_binder.c
|
|
new file mode 100644
|
|
index 000000000000..7f34a8d00274
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mptcp_binder.c
|
|
@@ -0,0 +1,494 @@
|
|
+#include <linux/module.h>
|
|
+
|
|
+#include <net/mptcp.h>
|
|
+#include <net/mptcp_v4.h>
|
|
+
|
|
+#include <linux/route.h>
|
|
+#include <linux/inet.h>
|
|
+#include <linux/mroute.h>
|
|
+#include <linux/spinlock_types.h>
|
|
+#include <net/inet_ecn.h>
|
|
+#include <net/route.h>
|
|
+#include <net/xfrm.h>
|
|
+#include <net/compat.h>
|
|
+#include <linux/slab.h>
|
|
+
|
|
+#define MPTCP_GW_MAX_LISTS 10
|
|
+#define MPTCP_GW_LIST_MAX_LEN 6
|
|
+#define MPTCP_GW_SYSCTL_MAX_LEN (15 * MPTCP_GW_LIST_MAX_LEN * \
|
|
+ MPTCP_GW_MAX_LISTS)
|
|
+
|
|
+struct mptcp_gw_list {
|
|
+ struct in_addr list[MPTCP_GW_MAX_LISTS][MPTCP_GW_LIST_MAX_LEN];
|
|
+ u8 len[MPTCP_GW_MAX_LISTS];
|
|
+};
|
|
+
|
|
+struct binder_priv {
|
|
+ /* Worker struct for subflow establishment */
|
|
+ struct work_struct subflow_work;
|
|
+
|
|
+ struct mptcp_cb *mpcb;
|
|
+
|
|
+ /* Prevent multiple sub-sockets concurrently iterating over sockets */
|
|
+ spinlock_t *flow_lock;
|
|
+};
|
|
+
|
|
+static struct mptcp_gw_list *mptcp_gws;
|
|
+static rwlock_t mptcp_gws_lock;
|
|
+
|
|
+static int mptcp_binder_ndiffports __read_mostly = 1;
|
|
+
|
|
+static char sysctl_mptcp_binder_gateways[MPTCP_GW_SYSCTL_MAX_LEN] __read_mostly;
|
|
+
|
|
+static int mptcp_get_avail_list_ipv4(struct sock *sk)
|
|
+{
|
|
+ int i, j, list_taken, opt_ret, opt_len;
|
|
+ unsigned char *opt_ptr, *opt_end_ptr, opt[MAX_IPOPTLEN];
|
|
+
|
|
+ for (i = 0; i < MPTCP_GW_MAX_LISTS; ++i) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ if (mptcp_gws->len[i] == 0)
|
|
+ goto error;
|
|
+
|
|
+ mptcp_debug("mptcp_get_avail_list_ipv4: List %i\n", i);
|
|
+ list_taken = 0;
|
|
+
|
|
+ /* Loop through all sub-sockets in this connection */
|
|
+ mptcp_for_each_sub(tcp_sk(sk)->mpcb, mptcp) {
|
|
+ sk = mptcp_to_sock(mptcp);
|
|
+
|
|
+ mptcp_debug("mptcp_get_avail_list_ipv4: Next sock\n");
|
|
+
|
|
+ /* Reset length and options buffer, then retrieve
|
|
+ * from socket
|
|
+ */
|
|
+ opt_len = MAX_IPOPTLEN;
|
|
+ memset(opt, 0, MAX_IPOPTLEN);
|
|
+ opt_ret = ip_getsockopt(sk, IPPROTO_IP,
|
|
+ IP_OPTIONS, (char __user *)opt, (int __user *)&opt_len);
|
|
+ if (opt_ret < 0) {
|
|
+ mptcp_debug("%s: MPTCP subsocket getsockopt() IP_OPTIONS failed, error %d\n",
|
|
+ __func__, opt_ret);
|
|
+ goto error;
|
|
+ }
|
|
+
|
|
+ /* If socket has no options, it has no stake in this list */
|
|
+ if (opt_len <= 0)
|
|
+ continue;
|
|
+
|
|
+ /* Iterate options buffer */
|
|
+ for (opt_ptr = &opt[0]; opt_ptr < &opt[opt_len]; opt_ptr++) {
|
|
+ if (*opt_ptr == IPOPT_LSRR) {
|
|
+ mptcp_debug("mptcp_get_avail_list_ipv4: LSRR options found\n");
|
|
+ goto sock_lsrr;
|
|
+ }
|
|
+ }
|
|
+ continue;
|
|
+
|
|
+sock_lsrr:
|
|
+ /* Pointer to the 2nd to last address */
|
|
+ opt_end_ptr = opt_ptr+(*(opt_ptr+1))-4;
|
|
+
|
|
+ /* Addresses start 3 bytes after type offset */
|
|
+ opt_ptr += 3;
|
|
+ j = 0;
|
|
+
|
|
+ /* Different length lists cannot be the same */
|
|
+ if ((opt_end_ptr-opt_ptr)/4 != mptcp_gws->len[i])
|
|
+ continue;
|
|
+
|
|
+ /* Iterate if we are still inside options list
|
|
+ * and sysctl list
|
|
+ */
|
|
+ while (opt_ptr < opt_end_ptr && j < mptcp_gws->len[i]) {
|
|
+ /* If there is a different address, this list must
|
|
+ * not be set on this socket
|
|
+ */
|
|
+ if (memcmp(&mptcp_gws->list[i][j], opt_ptr, 4))
|
|
+ break;
|
|
+
|
|
+ /* Jump 4 bytes to next address */
|
|
+ opt_ptr += 4;
|
|
+ j++;
|
|
+ }
|
|
+
|
|
+ /* Reached the end without a differing address, lists
|
|
+ * are therefore identical.
|
|
+ */
|
|
+ if (j == mptcp_gws->len[i]) {
|
|
+ mptcp_debug("mptcp_get_avail_list_ipv4: List already used\n");
|
|
+ list_taken = 1;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Free list found if not taken by a socket */
|
|
+ if (!list_taken) {
|
|
+ mptcp_debug("mptcp_get_avail_list_ipv4: List free\n");
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (i >= MPTCP_GW_MAX_LISTS)
|
|
+ goto error;
|
|
+
|
|
+ return i;
|
|
+error:
|
|
+ return -1;
|
|
+}
|
|
+
|
|
+/* The list of addresses is parsed each time a new connection is opened,
|
|
+ * to make sure it's up to date. In case of error, all the lists are
|
|
+ * marked as unavailable and the subflow's fingerprint is set to 0.
|
|
+ */
|
|
+static void mptcp_v4_add_lsrr(struct sock *sk, struct in_addr addr)
|
|
+{
|
|
+ int i, j, ret;
|
|
+ unsigned char opt[MAX_IPOPTLEN] = {0};
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct binder_priv *fmp = (struct binder_priv *)&tp->mpcb->mptcp_pm[0];
|
|
+
|
|
+ /* Read lock: multiple sockets can read LSRR addresses at the same
|
|
+ * time, but writes are done in mutual exclusion.
|
|
+ * Spin lock: must search for free list for one socket at a time, or
|
|
+ * multiple sockets could take the same list.
|
|
+ */
|
|
+ read_lock(&mptcp_gws_lock);
|
|
+ spin_lock(fmp->flow_lock);
|
|
+
|
|
+ i = mptcp_get_avail_list_ipv4(sk);
|
|
+
|
|
+ /* Execution enters here only if a free path is found.
|
|
+ */
|
|
+ if (i >= 0) {
|
|
+ opt[0] = IPOPT_NOP;
|
|
+ opt[1] = IPOPT_LSRR;
|
|
+ opt[2] = sizeof(mptcp_gws->list[i][0].s_addr) *
|
|
+ (mptcp_gws->len[i] + 1) + 3;
|
|
+ opt[3] = IPOPT_MINOFF;
|
|
+ for (j = 0; j < mptcp_gws->len[i]; ++j)
|
|
+ memcpy(opt + 4 +
|
|
+ (j * sizeof(mptcp_gws->list[i][0].s_addr)),
|
|
+ &mptcp_gws->list[i][j].s_addr,
|
|
+ sizeof(mptcp_gws->list[i][0].s_addr));
|
|
+ /* Final destination must be part of IP_OPTIONS parameter. */
|
|
+ memcpy(opt + 4 + (j * sizeof(addr.s_addr)), &addr.s_addr,
|
|
+ sizeof(addr.s_addr));
|
|
+
|
|
+ /* setsockopt must be inside the lock, otherwise another
|
|
+ * subflow could fail to see that we have taken a list.
|
|
+ */
|
|
+ ret = ip_setsockopt(sk, IPPROTO_IP, IP_OPTIONS, (char __user *)opt,
|
|
+ 4 + sizeof(mptcp_gws->list[i][0].s_addr) * (mptcp_gws->len[i] + 1));
|
|
+
|
|
+ if (ret < 0) {
|
|
+ mptcp_debug("%s: MPTCP subsock setsockopt() IP_OPTIONS failed, error %d\n",
|
|
+ __func__, ret);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ spin_unlock(fmp->flow_lock);
|
|
+ read_unlock(&mptcp_gws_lock);
|
|
+
|
|
+ return;
|
|
+}
|
|
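For illustration only (not part of the patch): a user-space sketch of the IP_OPTIONS buffer that mptcp_v4_add_lsrr() builds, for one gateway list of two entries plus the final destination. The addresses are made up, and the IPOPT_* values are spelled out locally rather than taken from netinet/ip.h; the kernel applies the finished buffer with ip_setsockopt(IP_OPTIONS).

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

#define IPOPT_NOP     1      /* values per RFC 791 */
#define IPOPT_LSRR    131
#define IPOPT_MINOFF  4
#define MAX_IPOPTLEN  40

int main(void)
{
    unsigned char opt[MAX_IPOPTLEN] = {0};
    struct in_addr gw[2], dst;
    int ngw = 2, i;

    inet_pton(AF_INET, "10.0.0.1", &gw[0]);     /* made-up gateways */
    inet_pton(AF_INET, "10.0.1.1", &gw[1]);
    inet_pton(AF_INET, "192.0.2.10", &dst);     /* final destination */

    opt[0] = IPOPT_NOP;
    opt[1] = IPOPT_LSRR;
    opt[2] = 4 * (ngw + 1) + 3;   /* LSRR length: addresses + 3-byte header */
    opt[3] = IPOPT_MINOFF;        /* pointer to the first address */
    for (i = 0; i < ngw; i++)
        memcpy(opt + 4 + 4 * i, &gw[i].s_addr, 4);
    /* the final destination must be the last hop of the source route */
    memcpy(opt + 4 + 4 * ngw, &dst.s_addr, 4);

    printf("IP_OPTIONS length: %d bytes (incl. leading NOP)\n",
           4 + 4 * (ngw + 1));
    return 0;
}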
+
|
|
+/* Parses gateways string for a list of paths to different
|
|
+ * gateways, and stores them for use with the Loose Source Routing (LSRR)
|
|
+ * socket option. Each list must have "," separated addresses, and the lists
|
|
+ * themselves must be separated by "-". Returns -1 in case one or more of the
|
|
+ * addresses is not a valid ipv4/6 address.
|
|
+ */
|
|
+static int mptcp_parse_gateway_ipv4(char *gateways)
|
|
+{
|
|
+ int i, j, k, ret;
|
|
+ char *tmp_string = NULL;
|
|
+ struct in_addr tmp_addr;
|
|
+
|
|
+ tmp_string = kzalloc(16, GFP_KERNEL);
|
|
+ if (tmp_string == NULL)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ write_lock(&mptcp_gws_lock);
|
|
+
|
|
+ memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list));
|
|
+
|
|
+ /* A TMP string is used since inet_pton needs a null terminated string
|
|
+ * but we do not want to modify the sysctl for obvious reasons.
|
|
+ * i will iterate over the SYSCTL string, j will iterate over the
|
|
+ * temporary string where each IP is copied into, k will iterate over
|
|
+ * the IPs in each list.
|
|
+ */
|
|
+ for (i = j = k = 0;
|
|
+ i < MPTCP_GW_SYSCTL_MAX_LEN && k < MPTCP_GW_MAX_LISTS;
|
|
+ ++i) {
|
|
+ if (gateways[i] == '-' || gateways[i] == ',' || gateways[i] == '\0') {
|
|
+ /* If the temp IP is empty and the current list is
|
|
+ * empty, we are done.
|
|
+ */
|
|
+ if (j == 0 && mptcp_gws->len[k] == 0)
|
|
+ break;
|
|
+
|
|
+ /* Terminate the temp IP string, then if it is
|
|
+ * non-empty parse the IP and copy it.
|
|
+ */
|
|
+ tmp_string[j] = '\0';
|
|
+ if (j > 0) {
|
|
+ mptcp_debug("mptcp_parse_gateway_list tmp: %s i: %d\n", tmp_string, i);
|
|
+
|
|
+ ret = in4_pton(tmp_string, strlen(tmp_string),
|
|
+ (u8 *)&tmp_addr.s_addr, '\0',
|
|
+ NULL);
|
|
+
|
|
+ if (ret) {
|
|
+ mptcp_debug("mptcp_parse_gateway_list ret: %d s_addr: %pI4\n",
|
|
+ ret,
|
|
+ &tmp_addr.s_addr);
|
|
+ memcpy(&mptcp_gws->list[k][mptcp_gws->len[k]].s_addr,
|
|
+ &tmp_addr.s_addr,
|
|
+ sizeof(tmp_addr.s_addr));
|
|
+ mptcp_gws->len[k]++;
|
|
+ j = 0;
|
|
+ tmp_string[j] = '\0';
|
|
+ /* Since we can't impose a limit to
|
|
+ * what the user can input, make sure
|
|
+ * there are not too many IPs in the
|
|
+ * SYSCTL string.
|
|
+ */
|
|
+ if (mptcp_gws->len[k] > MPTCP_GW_LIST_MAX_LEN) {
|
|
+ mptcp_debug("mptcp_parse_gateway_list too many members in list %i: max %i\n",
|
|
+ k,
|
|
+ MPTCP_GW_LIST_MAX_LEN);
|
|
+ goto error;
|
|
+ }
|
|
+ } else {
|
|
+ goto error;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (gateways[i] == '-' || gateways[i] == '\0')
|
|
+ ++k;
|
|
+ } else {
|
|
+ tmp_string[j] = gateways[i];
|
|
+ ++j;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Number of flows is number of gateway lists plus master flow */
|
|
+ mptcp_binder_ndiffports = k+1;
|
|
+
|
|
+ write_unlock(&mptcp_gws_lock);
|
|
+ kfree(tmp_string);
|
|
+
|
|
+ return 0;
|
|
+
|
|
+error:
|
|
+ memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list));
|
|
+ memset(gateways, 0, sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN);
|
|
+ write_unlock(&mptcp_gws_lock);
|
|
+ kfree(tmp_string);
|
|
+ return -1;
|
|
+}
|
|
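For illustration only (not part of the patch): a user-space sketch of the same ','/'-' splitting, with simplified error handling and without the kernel's locking and sysctl plumbing. The example string is made up.

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

#define MAX_LISTS     10   /* mirrors MPTCP_GW_MAX_LISTS */
#define MAX_LIST_LEN  6    /* mirrors MPTCP_GW_LIST_MAX_LEN */

int main(void)
{
    char input[] = "10.0.0.1,10.0.0.2-10.0.1.1";   /* two gateway lists */
    struct in_addr lists[MAX_LISTS][MAX_LIST_LEN];
    int len[MAX_LISTS] = {0};
    char *outer_save, *inner_save, *list, *ip;
    int k = 0;

    /* lists are separated by '-', addresses inside a list by ',' */
    for (list = strtok_r(input, "-", &outer_save);
         list && k < MAX_LISTS;
         list = strtok_r(NULL, "-", &outer_save), k++) {
        for (ip = strtok_r(list, ",", &inner_save);
             ip && len[k] < MAX_LIST_LEN;
             ip = strtok_r(NULL, ",", &inner_save)) {
            if (inet_pton(AF_INET, ip, &lists[k][len[k]]) != 1)
                return 1;    /* the kernel wipes all lists instead */
            len[k]++;
        }
    }

    for (int i = 0; i < k; i++)
        printf("list %d: %d gateway(s)\n", i, len[i]);
    return 0;
}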
+
|
|
+/**
|
|
+ * Create all new subflows by calling mptcp_initX_subsockets
|
|
+ *
|
|
+ * This function uses a goto next_subflow, to allow releasing the lock between
|
|
+ * new subflows and giving other processes a chance to do some work on the
|
|
+ * socket and potentially finishing the communication.
|
|
+ **/
|
|
+static void create_subflow_worker(struct work_struct *work)
|
|
+{
|
|
+ const struct binder_priv *pm_priv = container_of(work,
|
|
+ struct binder_priv,
|
|
+ subflow_work);
|
|
+ struct mptcp_cb *mpcb = pm_priv->mpcb;
|
|
+ struct sock *meta_sk = mpcb->meta_sk;
|
|
+ int iter = 0;
|
|
+
|
|
+next_subflow:
|
|
+ if (iter) {
|
|
+ release_sock(meta_sk);
|
|
+ mutex_unlock(&mpcb->mpcb_mutex);
|
|
+
|
|
+ cond_resched();
|
|
+ }
|
|
+ mutex_lock(&mpcb->mpcb_mutex);
|
|
+ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
|
|
+
|
|
+ if (!mptcp(tcp_sk(meta_sk)))
|
|
+ goto exit;
|
|
+
|
|
+ iter++;
|
|
+
|
|
+ if (sock_flag(meta_sk, SOCK_DEAD))
|
|
+ goto exit;
|
|
+
|
|
+ if (mpcb->master_sk &&
|
|
+ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
|
|
+ goto exit;
|
|
+
|
|
+ if (mptcp_binder_ndiffports > iter &&
|
|
+ mptcp_binder_ndiffports > mptcp_subflow_count(mpcb)) {
|
|
+ struct mptcp_loc4 loc;
|
|
+ struct mptcp_rem4 rem;
|
|
+
|
|
+ loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
|
|
+ loc.loc4_id = 0;
|
|
+ loc.low_prio = 0;
|
|
+
|
|
+ rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr;
|
|
+ rem.port = inet_sk(meta_sk)->inet_dport;
|
|
+ rem.rem4_id = 0; /* Default 0 */
|
|
+
|
|
+ mptcp_init4_subsockets(meta_sk, &loc, &rem);
|
|
+
|
|
+ goto next_subflow;
|
|
+ }
|
|
+
|
|
+exit:
|
|
+ release_sock(meta_sk);
|
|
+ mutex_unlock(&mpcb->mpcb_mutex);
|
|
+ mptcp_mpcb_put(mpcb);
|
|
+ sock_put(meta_sk);
|
|
+}
|
|
+
|
|
+static void binder_new_session(const struct sock *meta_sk)
|
|
+{
|
|
+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ struct binder_priv *fmp = (struct binder_priv *)&mpcb->mptcp_pm[0];
|
|
+ static DEFINE_SPINLOCK(flow_lock);
|
|
+
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ if (meta_sk->sk_family == AF_INET6 &&
|
|
+ !mptcp_v6_is_v4_mapped(meta_sk)) {
|
|
+ mptcp_fallback_default(mpcb);
|
|
+ return;
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ /* Initialize workqueue-struct */
|
|
+ INIT_WORK(&fmp->subflow_work, create_subflow_worker);
|
|
+ fmp->mpcb = mpcb;
|
|
+
|
|
+ fmp->flow_lock = &flow_lock;
|
|
+}
|
|
+
|
|
+static void binder_create_subflows(struct sock *meta_sk)
|
|
+{
|
|
+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ struct binder_priv *pm_priv = (struct binder_priv *)&mpcb->mptcp_pm[0];
|
|
+
|
|
+ if (mptcp_in_infinite_mapping_weak(mpcb) ||
|
|
+ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
|
|
+ return;
|
|
+
|
|
+ if (!work_pending(&pm_priv->subflow_work)) {
|
|
+ sock_hold(meta_sk);
|
|
+ refcount_inc(&mpcb->mpcb_refcnt);
|
|
+ queue_work(mptcp_wq, &pm_priv->subflow_work);
|
|
+ }
|
|
+}
|
|
+
|
|
+static int binder_get_local_id(const struct sock *meta_sk, sa_family_t family,
|
|
+ union inet_addr *addr, bool *low_prio)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* Callback function, executed when the net.mptcp.mptcp_binder_gateways sysctl is updated.
|
|
+ * Inspired by proc_tcp_congestion_control().
|
|
+ */
|
|
+static int proc_mptcp_gateways(struct ctl_table *ctl, int write,
|
|
+ void __user *buffer, size_t *lenp,
|
|
+ loff_t *ppos)
|
|
+{
|
|
+ int ret;
|
|
+ struct ctl_table tbl = {
|
|
+ .maxlen = MPTCP_GW_SYSCTL_MAX_LEN,
|
|
+ };
|
|
+
|
|
+ if (write) {
|
|
+ tbl.data = kzalloc(MPTCP_GW_SYSCTL_MAX_LEN, GFP_KERNEL);
|
|
+ if (tbl.data == NULL)
|
|
+ return -ENOMEM;
|
|
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
|
|
+ if (ret == 0) {
|
|
+ ret = mptcp_parse_gateway_ipv4(tbl.data);
|
|
+ memcpy(ctl->data, tbl.data, MPTCP_GW_SYSCTL_MAX_LEN);
|
|
+ }
|
|
+ kfree(tbl.data);
|
|
+ } else {
|
|
+ ret = proc_dostring(ctl, write, buffer, lenp, ppos);
|
|
+ }
|
|
+
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static struct mptcp_pm_ops binder __read_mostly = {
|
|
+ .new_session = binder_new_session,
|
|
+ .fully_established = binder_create_subflows,
|
|
+ .get_local_id = binder_get_local_id,
|
|
+ .init_subsocket_v4 = mptcp_v4_add_lsrr,
|
|
+ .name = "binder",
|
|
+ .owner = THIS_MODULE,
|
|
+};
|
|
+
|
|
+static struct ctl_table binder_table[] = {
|
|
+ {
|
|
+ .procname = "mptcp_binder_gateways",
|
|
+ .data = &sysctl_mptcp_binder_gateways,
|
|
+ .maxlen = sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN,
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_mptcp_gateways
|
|
+ },
|
|
+ { }
|
|
+};
|
|
+
|
|
+static struct ctl_table_header *mptcp_sysctl_binder;
|
|
+
|
|
+/* General initialization of MPTCP_PM */
|
|
+static int __init binder_register(void)
|
|
+{
|
|
+ mptcp_gws = kzalloc(sizeof(*mptcp_gws), GFP_KERNEL);
|
|
+ if (!mptcp_gws)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ rwlock_init(&mptcp_gws_lock);
|
|
+
|
|
+ BUILD_BUG_ON(sizeof(struct binder_priv) > MPTCP_PM_SIZE);
|
|
+
|
|
+ mptcp_sysctl_binder = register_net_sysctl(&init_net, "net/mptcp",
|
|
+ binder_table);
|
|
+ if (!mptcp_sysctl_binder)
|
|
+ goto sysctl_fail;
|
|
+
|
|
+ if (mptcp_register_path_manager(&binder))
|
|
+ goto pm_failed;
|
|
+
|
|
+ return 0;
|
|
+
|
|
+pm_failed:
|
|
+ unregister_net_sysctl_table(mptcp_sysctl_binder);
|
|
+sysctl_fail:
|
|
+ kfree(mptcp_gws);
|
|
+
|
|
+ return -1;
|
|
+}
|
|
+
|
|
+static void binder_unregister(void)
|
|
+{
|
|
+ mptcp_unregister_path_manager(&binder);
|
|
+ unregister_net_sysctl_table(mptcp_sysctl_binder);
|
|
+ kfree(mptcp_gws);
|
|
+}
|
|
+
|
|
+module_init(binder_register);
|
|
+module_exit(binder_unregister);
|
|
+
|
|
+MODULE_AUTHOR("Luca Boccassi, Duncan Eastoe, Christoph Paasch (ndiffports)");
|
|
+MODULE_LICENSE("GPL");
|
|
+MODULE_DESCRIPTION("BINDER MPTCP");
|
|
+MODULE_VERSION("0.1");
|
|
diff --git a/net/mptcp/mptcp_blest.c b/net/mptcp/mptcp_blest.c
|
|
new file mode 100644
|
|
index 000000000000..22e25dd0d44e
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mptcp_blest.c
|
|
@@ -0,0 +1,285 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+/* MPTCP Scheduler to reduce HoL-blocking and spurious retransmissions.
|
|
+ *
|
|
+ * Algorithm Design:
|
|
+ * Simone Ferlin <ferlin@simula.no>
|
|
+ * Ozgu Alay <ozgu@simula.no>
|
|
+ * Olivier Mehani <olivier.mehani@nicta.com.au>
|
|
+ * Roksana Boreli <roksana.boreli@nicta.com.au>
|
|
+ *
|
|
+ * Initial Implementation:
|
|
+ * Simone Ferlin <ferlin@simula.no>
|
|
+ *
|
|
+ * Additional Authors:
|
|
+ * Daniel Weber <weberd@cs.uni-bonn.de>
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU General Public License
|
|
+ * as published by the Free Software Foundation; either version
|
|
+ * 2 of the License, or (at your option) any later version.
|
|
+ */
|
|
+
|
|
+#include <linux/module.h>
|
|
+#include <net/mptcp.h>
|
|
+
|
|
+static unsigned char lambda __read_mostly = 12;
|
|
+module_param(lambda, byte, 0644);
|
|
+MODULE_PARM_DESC(lambda, "Divided by 10 for scaling factor of fast flow rate estimation");
|
|
+
|
|
+static unsigned char max_lambda __read_mostly = 13;
|
|
+module_param(max_lambda, byte, 0644);
|
|
+MODULE_PARM_DESC(max_lambda, "Divided by 10 for maximum scaling factor of fast flow rate estimation");
|
|
+
|
|
+static unsigned char min_lambda __read_mostly = 10;
|
|
+module_param(min_lambda, byte, 0644);
|
|
+MODULE_PARM_DESC(min_lambda, "Divided by 10 for minimum scaling factor of fast flow rate estimation");
|
|
+
|
|
+static unsigned char dyn_lambda_good = 10; /* 1% */
|
|
+module_param(dyn_lambda_good, byte, 0644);
|
|
+MODULE_PARM_DESC(dyn_lambda_good, "Decrease of lambda in positive case.");
|
|
+
|
|
+static unsigned char dyn_lambda_bad = 40; /* 4% */
|
|
+module_param(dyn_lambda_bad, byte, 0644);
|
|
+MODULE_PARM_DESC(dyn_lambda_bad, "Increase of lambda in negative case.");
|
|
+
|
|
+struct blestsched_priv {
|
|
+ u32 last_rbuf_opti;
|
|
+ u32 min_srtt_us;
|
|
+ u32 max_srtt_us;
|
|
+};
|
|
+
|
|
+struct blestsched_cb {
|
|
+ s16 lambda_1000; /* values range from min_lambda * 100 to max_lambda * 100 */
|
|
+ u32 last_lambda_update;
|
|
+};
|
|
+
|
|
+static struct blestsched_priv *blestsched_get_priv(const struct tcp_sock *tp)
|
|
+{
|
|
+ return (struct blestsched_priv *)&tp->mptcp->mptcp_sched[0];
|
|
+}
|
|
+
|
|
+static struct blestsched_cb *blestsched_get_cb(const struct tcp_sock *tp)
|
|
+{
|
|
+ return (struct blestsched_cb *)&tp->mpcb->mptcp_sched[0];
|
|
+}
|
|
+
|
|
+static void blestsched_update_lambda(struct sock *meta_sk, struct sock *sk)
|
|
+{
|
|
+ struct blestsched_cb *blest_cb = blestsched_get_cb(tcp_sk(meta_sk));
|
|
+ struct blestsched_priv *blest_p = blestsched_get_priv(tcp_sk(sk));
|
|
+
|
|
+ if (tcp_jiffies32 - blest_cb->last_lambda_update < usecs_to_jiffies(blest_p->min_srtt_us >> 3))
|
|
+ return;
|
|
+
|
|
+ /* if there have been retransmissions of packets of the slow flow
|
|
+ * during the slow flows last RTT => increase lambda
|
|
+ * otherwise decrease
|
|
+ */
|
|
+ if (tcp_sk(meta_sk)->retrans_stamp) {
|
|
+ /* need to slow down on the slow flow */
|
|
+ blest_cb->lambda_1000 += dyn_lambda_bad;
|
|
+ } else {
|
|
+ /* use the slow flow more */
|
|
+ blest_cb->lambda_1000 -= dyn_lambda_good;
|
|
+ }
|
|
+
|
|
+ /* cap lambda_1000 to its value range */
|
|
+ blest_cb->lambda_1000 = min_t(s16, blest_cb->lambda_1000, max_lambda * 100);
|
|
+ blest_cb->lambda_1000 = max_t(s16, blest_cb->lambda_1000, min_lambda * 100);
|
|
+
|
|
+ blest_cb->last_lambda_update = tcp_jiffies32;
|
|
+}
|
|
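For illustration only (not part of the patch): the lambda_1000 adaptation above, replayed in user space with made-up retransmission samples and the module's default parameters (lambda 12, bounds 10..13, dyn_lambda_good 10, dyn_lambda_bad 40).

#include <stdio.h>

int main(void)
{
    int lambda_1000 = 1200;                  /* lambda = 1.2 */
    const int lo = 1000, hi = 1300;          /* min_lambda/max_lambda * 100 */
    const int good = 10, bad = 40;           /* dyn_lambda_good / dyn_lambda_bad */
    int retrans_seen[] = {0, 1, 1, 0, 0};    /* one sample per slow-flow RTT */
    unsigned i;

    for (i = 0; i < sizeof(retrans_seen) / sizeof(retrans_seen[0]); i++) {
        lambda_1000 += retrans_seen[i] ? bad : -good;
        if (lambda_1000 > hi)
            lambda_1000 = hi;
        if (lambda_1000 < lo)
            lambda_1000 = lo;
        printf("rtt %u: lambda_1000 = %d\n", i, lambda_1000);
    }
    return 0;
}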
+
|
|
+/* how many bytes will sk send during the rtt of another, slower flow? */
|
|
+static u32 blestsched_estimate_bytes(struct sock *sk, u32 time_8)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct blestsched_priv *blest_p = blestsched_get_priv(tp);
|
|
+ struct blestsched_cb *blest_cb = blestsched_get_cb(mptcp_meta_tp(tp));
|
|
+ u32 avg_rtt, num_rtts, ca_cwnd, packets;
|
|
+
|
|
+ avg_rtt = (blest_p->min_srtt_us + blest_p->max_srtt_us) / 2;
|
|
+ if (avg_rtt == 0)
|
|
+ num_rtts = 1; /* sanity */
|
|
+ else
|
|
+ num_rtts = (time_8 / avg_rtt) + 1; /* round up */
|
|
+
|
|
+ /* during num_rtts, how many bytes will be sent on the flow?
|
|
+ * assumes for simplification that Reno is applied as congestion-control
|
|
+ */
|
|
+ if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) {
|
|
+ /* we are in initial slow start */
|
|
+ if (num_rtts > 16)
|
|
+ num_rtts = 16; /* cap for sanity */
|
|
+ packets = tp->snd_cwnd * ((1 << num_rtts) - 1); /* cwnd + 2*cwnd + 4*cwnd */
|
|
+ } else {
|
|
+ ca_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh + 1); /* assume we jump to CA already */
|
|
+ packets = (ca_cwnd + (num_rtts - 1) / 2) * num_rtts;
|
|
+ }
|
|
+
|
|
+ return div_u64(((u64)packets) * tp->mss_cache * blest_cb->lambda_1000, 1000);
|
|
+}
|
|
+
|
|
+static u32 blestsched_estimate_linger_time(struct sock *sk)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct blestsched_priv *blest_p = blestsched_get_priv(tp);
|
|
+ u32 estimate, slope, inflight, cwnd;
|
|
+
|
|
+ inflight = tcp_packets_in_flight(tp) + 1; /* take into account the new one */
|
|
+ cwnd = tp->snd_cwnd;
|
|
+
|
|
+ if (inflight >= cwnd) {
|
|
+ estimate = blest_p->max_srtt_us;
|
|
+ } else {
|
|
+ slope = blest_p->max_srtt_us - blest_p->min_srtt_us;
|
|
+ if (cwnd == 0)
|
|
+ cwnd = 1; /* sanity */
|
|
+ estimate = blest_p->min_srtt_us + (slope * inflight) / cwnd;
|
|
+ }
|
|
+
|
|
+ return (tp->srtt_us > estimate) ? tp->srtt_us : estimate;
|
|
+}
|
|
+
|
|
+/* This is the BLEST scheduler. This function decides on which flow to send
|
|
+ * a given MSS. If all subflows are found to be busy or the currently best
|
|
+ * subflow is estimated to possibly cause HoL-blocking, NULL is returned.
|
|
+ */
|
|
+struct sock *blest_get_available_subflow(struct sock *meta_sk, struct sk_buff *skb,
|
|
+ bool zero_wnd_test)
|
|
+{
|
|
+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ struct sock *bestsk, *minsk = NULL;
|
|
+ struct tcp_sock *meta_tp, *besttp;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct blestsched_priv *blest_p;
|
|
+ u32 min_srtt = U32_MAX;
|
|
+
|
|
+ /* Answer data_fin on same subflow!!! */
|
|
+ if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
|
|
+ skb && mptcp_is_data_fin(skb)) {
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ bestsk = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (tcp_sk(bestsk)->mptcp->path_index == mpcb->dfin_path_index &&
|
|
+ mptcp_is_available(bestsk, skb, zero_wnd_test))
|
|
+ return bestsk;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* First, find the overall best subflow */
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ bestsk = mptcp_to_sock(mptcp);
|
|
+ besttp = tcp_sk(bestsk);
|
|
+ blest_p = blestsched_get_priv(besttp);
|
|
+
|
|
+ /* Set of states for which we are allowed to send data */
|
|
+ if (!mptcp_sk_can_send(bestsk))
|
|
+ continue;
|
|
+
|
|
+ /* We do not send data on this subflow unless it is
|
|
+ * fully established, i.e. the 4th ack has been received.
|
|
+ */
|
|
+ if (besttp->mptcp->pre_established)
|
|
+ continue;
|
|
+
|
|
+ blest_p->min_srtt_us = min(blest_p->min_srtt_us, besttp->srtt_us);
|
|
+ blest_p->max_srtt_us = max(blest_p->max_srtt_us, besttp->srtt_us);
|
|
+
|
|
+ /* record minimal rtt */
|
|
+ if (besttp->srtt_us < min_srtt) {
|
|
+ min_srtt = besttp->srtt_us;
|
|
+ minsk = bestsk;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* find the current best subflow according to the default scheduler */
|
|
+ bestsk = get_available_subflow(meta_sk, skb, zero_wnd_test);
|
|
+
|
|
+ /* if we decided to use a slower flow, we have the option of not using it at all */
|
|
+ if (bestsk && minsk && bestsk != minsk) {
|
|
+ u32 slow_linger_time, fast_bytes, slow_inflight_bytes, slow_bytes, avail_space;
|
|
+ u32 buffered_bytes = 0;
|
|
+
|
|
+ meta_tp = tcp_sk(meta_sk);
|
|
+ besttp = tcp_sk(bestsk);
|
|
+
|
|
+ blestsched_update_lambda(meta_sk, bestsk);
|
|
+
|
|
+ /* if we send this SKB now, it will be acked in besttp->srtt seconds
|
|
+ * during this time: how many bytes will we send on the fast flow?
|
|
+ */
|
|
+ slow_linger_time = blestsched_estimate_linger_time(bestsk);
|
|
+ fast_bytes = blestsched_estimate_bytes(minsk, slow_linger_time);
|
|
+
|
|
+ if (skb)
|
|
+ buffered_bytes = skb->len;
|
|
+
|
|
+ /* is the required space available in the mptcp meta send window?
|
|
+ * we assume that all bytes inflight on the slow path will be acked in besttp->srtt seconds
|
|
+ * (just like the SKB if it was sent now) -> that means that those inflight bytes will
|
|
+ * keep occupying space in the meta window until then
|
|
+ */
|
|
+ slow_inflight_bytes = besttp->write_seq - besttp->snd_una;
|
|
+ slow_bytes = buffered_bytes + slow_inflight_bytes; // bytes of this SKB plus those in flight already
|
|
+
|
|
+ avail_space = (slow_bytes < meta_tp->snd_wnd) ? (meta_tp->snd_wnd - slow_bytes) : 0;
|
|
+
|
|
+ if (fast_bytes > avail_space) {
|
|
+ /* sending this SKB on the slow flow means
|
|
+ * we wouldn't be able to send all the data we'd like to send on the fast flow
|
|
+ * so don't do that
|
|
+ */
|
|
+ return NULL;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return bestsk;
|
|
+}
|
|
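For illustration only (not part of the patch): a compact user-space sketch of the skip-or-send test above, with made-up subflow numbers. It follows the congestion-avoidance branch of blestsched_estimate_bytes() and uses the slow flow's sRTT directly instead of the interpolated linger time.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* assumed example values */
    uint32_t slow_srtt_us = 200000, fast_srtt_us = 20000;
    uint32_t fast_cwnd = 50, mss = 1428;
    uint32_t lambda_1000 = 1200;      /* lambda = 1.2, scaled by 1000 */
    uint32_t meta_snd_wnd = 120000;
    uint32_t slow_inflight_bytes = 20000, skb_len = 1428;

    /* how many fast-flow RTTs fit into one slow-flow RTT (rounded up) */
    uint32_t num_rtts = slow_srtt_us / fast_srtt_us + 1;
    /* Reno congestion avoidance: roughly cwnd + (num_rtts - 1)/2 per RTT */
    uint64_t packets = (uint64_t)(fast_cwnd + (num_rtts - 1) / 2) * num_rtts;
    uint64_t fast_bytes = packets * mss * lambda_1000 / 1000;

    /* meta send-window space left once this SKB and the slow flow's
     * in-flight bytes are accounted for
     */
    uint32_t slow_bytes = skb_len + slow_inflight_bytes;
    uint32_t avail = slow_bytes < meta_snd_wnd ? meta_snd_wnd - slow_bytes : 0;

    printf("fast_bytes=%llu avail=%u -> %s the slow subflow\n",
           (unsigned long long)fast_bytes, avail,
           fast_bytes > avail ? "skip" : "use");
    return 0;
}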
+
|
|
+static void blestsched_init(struct sock *sk)
|
|
+{
|
|
+ struct blestsched_priv *blest_p = blestsched_get_priv(tcp_sk(sk));
|
|
+ struct blestsched_cb *blest_cb = blestsched_get_cb(tcp_sk(mptcp_meta_sk(sk)));
|
|
+
|
|
+ blest_p->last_rbuf_opti = tcp_jiffies32;
|
|
+ blest_p->min_srtt_us = U32_MAX;
|
|
+ blest_p->max_srtt_us = 0;
|
|
+
|
|
+ if (!blest_cb->lambda_1000) {
|
|
+ blest_cb->lambda_1000 = lambda * 100;
|
|
+ blest_cb->last_lambda_update = tcp_jiffies32;
|
|
+ }
|
|
+}
|
|
+
|
|
+static struct mptcp_sched_ops mptcp_sched_blest = {
|
|
+ .get_subflow = blest_get_available_subflow,
|
|
+ .next_segment = mptcp_next_segment,
|
|
+ .init = blestsched_init,
|
|
+ .name = "blest",
|
|
+ .owner = THIS_MODULE,
|
|
+};
|
|
+
|
|
+static int __init blest_register(void)
|
|
+{
|
|
+ BUILD_BUG_ON(sizeof(struct blestsched_priv) > MPTCP_SCHED_SIZE);
|
|
+ BUILD_BUG_ON(sizeof(struct blestsched_cb) > MPTCP_SCHED_DATA_SIZE);
|
|
+
|
|
+ if (mptcp_register_scheduler(&mptcp_sched_blest))
|
|
+ return -1;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void blest_unregister(void)
|
|
+{
|
|
+ mptcp_unregister_scheduler(&mptcp_sched_blest);
|
|
+}
|
|
+
|
|
+module_init(blest_register);
|
|
+module_exit(blest_unregister);
|
|
+
|
|
+MODULE_AUTHOR("Simone Ferlin, Daniel Weber");
|
|
+MODULE_LICENSE("GPL");
|
|
+MODULE_DESCRIPTION("BLEST scheduler for MPTCP, based on default minimum RTT scheduler");
|
|
+MODULE_VERSION("0.95");
|
|
diff --git a/net/mptcp/mptcp_coupled.c b/net/mptcp/mptcp_coupled.c
|
|
new file mode 100644
|
|
index 000000000000..9eb7628053f6
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mptcp_coupled.c
|
|
@@ -0,0 +1,262 @@
|
|
+/*
|
|
+ * MPTCP implementation - Linked Increase congestion control Algorithm (LIA)
|
|
+ *
|
|
+ * Initial Design & Implementation:
|
|
+ * Sébastien Barré <sebastien.barre@uclouvain.be>
|
|
+ *
|
|
+ * Current Maintainer & Author:
|
|
+ * Christoph Paasch <christoph.paasch@uclouvain.be>
|
|
+ *
|
|
+ * Additional authors:
|
|
+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
|
|
+ * Gregory Detal <gregory.detal@uclouvain.be>
|
|
+ * Fabien Duchêne <fabien.duchene@uclouvain.be>
|
|
+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
|
|
+ * Lavkesh Lahngir <lavkesh51@gmail.com>
|
|
+ * Andreas Ripke <ripke@neclab.eu>
|
|
+ * Vlad Dogaru <vlad.dogaru@intel.com>
|
|
+ * Octavian Purdila <octavian.purdila@intel.com>
|
|
+ * John Ronan <jronan@tssg.org>
|
|
+ * Catalin Nicutar <catalin.nicutar@gmail.com>
|
|
+ * Brandon Heller <brandonh@stanford.edu>
|
|
+ *
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU General Public License
|
|
+ * as published by the Free Software Foundation; either version
|
|
+ * 2 of the License, or (at your option) any later version.
|
|
+ */
|
|
+#include <net/tcp.h>
|
|
+#include <net/mptcp.h>
|
|
+
|
|
+#include <linux/module.h>
|
|
+
|
|
+/* Scaling is done in the numerator with alpha_scale_num and in the denominator
|
|
+ * with alpha_scale_den.
|
|
+ *
|
|
+ * To downscale, we just need to use alpha_scale.
|
|
+ *
|
|
+ * We have: alpha_scale = alpha_scale_num / (alpha_scale_den ^ 2)
|
|
+ */
|
|
+static int alpha_scale_den = 10;
|
|
+static int alpha_scale_num = 32;
|
|
+static int alpha_scale = 12;
|
|
+
|
|
+struct mptcp_ccc {
|
|
+ u64 alpha;
|
|
+ bool forced_update;
|
|
+};
|
|
+
|
|
+static inline int mptcp_ccc_sk_can_send(const struct sock *sk)
|
|
+{
|
|
+ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
|
|
+}
|
|
+
|
|
+static inline u64 mptcp_get_alpha(const struct sock *meta_sk)
|
|
+{
|
|
+ return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha;
|
|
+}
|
|
+
|
|
+static inline void mptcp_set_alpha(const struct sock *meta_sk, u64 alpha)
|
|
+{
|
|
+ ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha = alpha;
|
|
+}
|
|
+
|
|
+static inline u64 mptcp_ccc_scale(u32 val, int scale)
|
|
+{
|
|
+ return (u64) val << scale;
|
|
+}
|
|
+
|
|
+static inline bool mptcp_get_forced(const struct sock *meta_sk)
|
|
+{
|
|
+ return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update;
|
|
+}
|
|
+
|
|
+static inline void mptcp_set_forced(const struct sock *meta_sk, bool force)
|
|
+{
|
|
+ ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update = force;
|
|
+}
|
|
+
|
|
+static void mptcp_ccc_recalc_alpha(const struct sock *sk)
|
|
+{
|
|
+ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
|
|
+ const struct mptcp_tcp_sock *mptcp;
|
|
+ int best_cwnd = 0, best_rtt = 0, can_send = 0;
|
|
+ u64 max_numerator = 0, sum_denominator = 0, alpha = 1;
|
|
+
|
|
+ if (!mpcb)
|
|
+ return;
|
|
+
|
|
+ /* Do regular alpha-calculation for multiple subflows */
|
|
+
|
|
+ /* Find the max numerator of the alpha-calculation */
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ const struct sock *sub_sk = mptcp_to_sock(mptcp);
|
|
+ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
|
|
+ u64 tmp;
|
|
+
|
|
+ if (!mptcp_ccc_sk_can_send(sub_sk))
|
|
+ continue;
|
|
+
|
|
+ can_send++;
|
|
+
|
|
+ /* We need to look for the path that provides the max value.
|
|
+ * Integer-overflow is not possible here, because
|
|
+ * tmp will be in u64.
|
|
+ */
|
|
+ tmp = div64_u64(mptcp_ccc_scale(sub_tp->snd_cwnd,
|
|
+ alpha_scale_num), (u64)sub_tp->srtt_us * sub_tp->srtt_us);
|
|
+
|
|
+ if (tmp >= max_numerator) {
|
|
+ max_numerator = tmp;
|
|
+ best_cwnd = sub_tp->snd_cwnd;
|
|
+ best_rtt = sub_tp->srtt_us;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* No subflow is able to send - we don't care anymore */
|
|
+ if (unlikely(!can_send))
|
|
+ goto exit;
|
|
+
|
|
+ /* Calculate the denominator */
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ const struct sock *sub_sk = mptcp_to_sock(mptcp);
|
|
+ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
|
|
+
|
|
+ if (!mptcp_ccc_sk_can_send(sub_sk))
|
|
+ continue;
|
|
+
|
|
+ sum_denominator += div_u64(
|
|
+ mptcp_ccc_scale(sub_tp->snd_cwnd,
|
|
+ alpha_scale_den) * best_rtt,
|
|
+ sub_tp->srtt_us);
|
|
+ }
|
|
+ sum_denominator *= sum_denominator;
|
|
+ if (unlikely(!sum_denominator)) {
|
|
+ pr_err("%s: sum_denominator == 0\n", __func__);
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ const struct sock *sub_sk = mptcp_to_sock(mptcp);
|
|
+ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
|
|
+ pr_err("%s: pi:%d, state:%d\n, rtt:%u, cwnd: %u",
|
|
+ __func__, sub_tp->mptcp->path_index,
|
|
+ sub_sk->sk_state, sub_tp->srtt_us,
|
|
+ sub_tp->snd_cwnd);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ alpha = div64_u64(mptcp_ccc_scale(best_cwnd, alpha_scale_num), sum_denominator);
|
|
+
|
|
+ if (unlikely(!alpha))
|
|
+ alpha = 1;
|
|
+
|
|
+exit:
|
|
+ mptcp_set_alpha(mptcp_meta_sk(sk), alpha);
|
|
+}
|
|
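Stripped of the fixed-point scaling, the quantity stored in alpha above is RFC 6356's coupled-increase factor divided by the total window, with w_i the subflow congestion windows, rtt_i their smoothed RTTs, and the total window written as \hat{w}; mptcp_ccc_cong_avoid() then applies the per-ACK increase on subflow r:

\[
  \frac{\alpha}{\hat{w}}
  \;=\;
  \frac{\max_i \bigl( w_i / \mathrm{rtt}_i^{2} \bigr)}
       {\bigl( \sum_i w_i / \mathrm{rtt}_i \bigr)^{2}},
  \qquad
  \Delta w_r \;=\; \min\!\left( \frac{\alpha}{\hat{w}},\; \frac{1}{w_r} \right)
  \quad \text{per ACK on subflow } r .
\]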
+
|
|
+static void mptcp_ccc_init(struct sock *sk)
|
|
+{
|
|
+ if (mptcp(tcp_sk(sk))) {
|
|
+ mptcp_set_forced(mptcp_meta_sk(sk), 0);
|
|
+ mptcp_set_alpha(mptcp_meta_sk(sk), 1);
|
|
+ }
|
|
+ /* If we do not mptcp, behave like reno: return */
|
|
+}
|
|
+
|
|
+static void mptcp_ccc_cwnd_event(struct sock *sk, enum tcp_ca_event event)
|
|
+{
|
|
+ if (event == CA_EVENT_LOSS)
|
|
+ mptcp_ccc_recalc_alpha(sk);
|
|
+}
|
|
+
|
|
+static void mptcp_ccc_set_state(struct sock *sk, u8 ca_state)
|
|
+{
|
|
+ if (!mptcp(tcp_sk(sk)))
|
|
+ return;
|
|
+
|
|
+ mptcp_set_forced(mptcp_meta_sk(sk), 1);
|
|
+}
|
|
+
|
|
+static void mptcp_ccc_cong_avoid(struct sock *sk, u32 ack, u32 acked)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ int snd_cwnd;
|
|
+ u64 alpha;
|
|
+
|
|
+ if (!mptcp(tp)) {
|
|
+ tcp_reno_cong_avoid(sk, ack, acked);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (!tcp_is_cwnd_limited(sk))
|
|
+ return;
|
|
+
|
|
+ if (tcp_in_slow_start(tp)) {
|
|
+ /* In "safe" area, increase. */
|
|
+ tcp_slow_start(tp, acked);
|
|
+ mptcp_ccc_recalc_alpha(sk);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (mptcp_get_forced(mptcp_meta_sk(sk))) {
|
|
+ mptcp_ccc_recalc_alpha(sk);
|
|
+ mptcp_set_forced(mptcp_meta_sk(sk), 0);
|
|
+ }
|
|
+
|
|
+ alpha = mptcp_get_alpha(mptcp_meta_sk(sk));
|
|
+
|
|
+ /* This may happen, if at the initialization, the mpcb
|
|
+ * was not yet attached to the sock, and thus
|
|
+ * initializing alpha failed.
|
|
+ */
|
|
+ if (unlikely(!alpha))
|
|
+ alpha = 1;
|
|
+
|
|
+ snd_cwnd = (int)div_u64((u64)mptcp_ccc_scale(1, alpha_scale), alpha);
|
|
+
|
|
+ /* snd_cwnd_cnt >= max (scale * tot_cwnd / alpha, cwnd)
|
|
+ * Thus, we select here the max value.
|
|
+ */
|
|
+ if (snd_cwnd < tp->snd_cwnd)
|
|
+ snd_cwnd = tp->snd_cwnd;
|
|
+
|
|
+ if (tp->snd_cwnd_cnt >= snd_cwnd) {
|
|
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
|
|
+ tp->snd_cwnd++;
|
|
+ mptcp_ccc_recalc_alpha(sk);
|
|
+ }
|
|
+
|
|
+ tp->snd_cwnd_cnt = 0;
|
|
+ } else {
|
|
+ tp->snd_cwnd_cnt++;
|
|
+ }
|
|
+}
|
|
+
|
|
+static struct tcp_congestion_ops mptcp_ccc = {
|
|
+ .init = mptcp_ccc_init,
|
|
+ .ssthresh = tcp_reno_ssthresh,
|
|
+ .cong_avoid = mptcp_ccc_cong_avoid,
|
|
+ .undo_cwnd = tcp_reno_undo_cwnd,
|
|
+ .cwnd_event = mptcp_ccc_cwnd_event,
|
|
+ .set_state = mptcp_ccc_set_state,
|
|
+ .owner = THIS_MODULE,
|
|
+ .name = "lia",
|
|
+};
|
|
+
|
|
+static int __init mptcp_ccc_register(void)
|
|
+{
|
|
+ BUILD_BUG_ON(sizeof(struct mptcp_ccc) > ICSK_CA_PRIV_SIZE);
|
|
+ return tcp_register_congestion_control(&mptcp_ccc);
|
|
+}
|
|
+
|
|
+static void __exit mptcp_ccc_unregister(void)
|
|
+{
|
|
+ tcp_unregister_congestion_control(&mptcp_ccc);
|
|
+}
|
|
+
|
|
+module_init(mptcp_ccc_register);
|
|
+module_exit(mptcp_ccc_unregister);
|
|
+
|
|
+MODULE_AUTHOR("Christoph Paasch, Sébastien Barré");
|
|
+MODULE_LICENSE("GPL");
|
|
+MODULE_DESCRIPTION("MPTCP LINKED INCREASE CONGESTION CONTROL ALGORITHM");
|
|
+MODULE_VERSION("0.1");
|
|
diff --git a/net/mptcp/mptcp_ctrl.c b/net/mptcp/mptcp_ctrl.c
|
|
new file mode 100644
|
|
index 000000000000..db01ec142111
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mptcp_ctrl.c
|
|
@@ -0,0 +1,3313 @@
|
|
+/*
|
|
+ * MPTCP implementation - MPTCP-control
|
|
+ *
|
|
+ * Initial Design & Implementation:
|
|
+ * Sébastien Barré <sebastien.barre@uclouvain.be>
|
|
+ *
|
|
+ * Current Maintainer & Author:
|
|
+ * Christoph Paasch <christoph.paasch@uclouvain.be>
|
|
+ *
|
|
+ * Additional authors:
|
|
+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
|
|
+ * Gregory Detal <gregory.detal@uclouvain.be>
|
|
+ * Fabien Duchêne <fabien.duchene@uclouvain.be>
|
|
+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
|
|
+ * Lavkesh Lahngir <lavkesh51@gmail.com>
|
|
+ * Andreas Ripke <ripke@neclab.eu>
|
|
+ * Vlad Dogaru <vlad.dogaru@intel.com>
|
|
+ * Octavian Purdila <octavian.purdila@intel.com>
|
|
+ * John Ronan <jronan@tssg.org>
|
|
+ * Catalin Nicutar <catalin.nicutar@gmail.com>
|
|
+ * Brandon Heller <brandonh@stanford.edu>
|
|
+ *
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU General Public License
|
|
+ * as published by the Free Software Foundation; either version
|
|
+ * 2 of the License, or (at your option) any later version.
|
|
+ */
|
|
+
|
|
+#include <crypto/sha.h>
|
|
+
|
|
+#include <net/inet_common.h>
|
|
+#include <net/inet6_hashtables.h>
|
|
+#include <net/ipv6.h>
|
|
+#include <net/ip6_checksum.h>
|
|
+#include <net/mptcp.h>
|
|
+#include <net/mptcp_v4.h>
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+#include <net/ip6_route.h>
|
|
+#include <net/mptcp_v6.h>
|
|
+#endif
|
|
+#include <net/sock.h>
|
|
+#include <net/tcp.h>
|
|
+#include <net/tcp_states.h>
|
|
+#include <net/transp_v6.h>
|
|
+#include <net/xfrm.h>
|
|
+
|
|
+#include <linux/memblock.h>
|
|
+#include <linux/cryptohash.h>
|
|
+#include <linux/kconfig.h>
|
|
+#include <linux/module.h>
|
|
+#include <linux/netpoll.h>
|
|
+#include <linux/proc_fs.h>
|
|
+#include <linux/list.h>
|
|
+#include <linux/jhash.h>
|
|
+#include <linux/tcp.h>
|
|
+#include <linux/net.h>
|
|
+#include <linux/in.h>
|
|
+#include <linux/random.h>
|
|
+#include <linux/inetdevice.h>
|
|
+#include <linux/workqueue.h>
|
|
+#include <linux/atomic.h>
|
|
+#include <linux/sysctl.h>
|
|
+
|
|
+static struct kmem_cache *mptcp_sock_cache __read_mostly;
|
|
+static struct kmem_cache *mptcp_cb_cache __read_mostly;
|
|
+static struct kmem_cache *mptcp_tw_cache __read_mostly;
|
|
+
|
|
+int sysctl_mptcp_enabled __read_mostly = 1;
|
|
+int sysctl_mptcp_version __read_mostly = 0;
|
|
+static int min_mptcp_version;
|
|
+static int max_mptcp_version = 1;
|
|
+int sysctl_mptcp_checksum __read_mostly = 1;
|
|
+int sysctl_mptcp_debug __read_mostly;
|
|
+EXPORT_SYMBOL(sysctl_mptcp_debug);
|
|
+int sysctl_mptcp_syn_retries __read_mostly = 3;
|
|
+
|
|
+bool mptcp_init_failed __read_mostly;
|
|
+
|
|
+struct static_key mptcp_static_key = STATIC_KEY_INIT_FALSE;
|
|
+EXPORT_SYMBOL(mptcp_static_key);
|
|
+
|
|
+static void mptcp_key_hash(u8 version, u64 key, u32 *token, u64 *idsn);
|
|
+
|
|
+static int proc_mptcp_path_manager(struct ctl_table *ctl, int write,
|
|
+ void __user *buffer, size_t *lenp,
|
|
+ loff_t *ppos)
|
|
+{
|
|
+ char val[MPTCP_PM_NAME_MAX];
|
|
+ struct ctl_table tbl = {
|
|
+ .data = val,
|
|
+ .maxlen = MPTCP_PM_NAME_MAX,
|
|
+ };
|
|
+ int ret;
|
|
+
|
|
+ mptcp_get_default_path_manager(val);
|
|
+
|
|
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
|
|
+ if (write && ret == 0)
|
|
+ ret = mptcp_set_default_path_manager(val);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int proc_mptcp_scheduler(struct ctl_table *ctl, int write,
|
|
+ void __user *buffer, size_t *lenp,
|
|
+ loff_t *ppos)
|
|
+{
|
|
+ char val[MPTCP_SCHED_NAME_MAX];
|
|
+ struct ctl_table tbl = {
|
|
+ .data = val,
|
|
+ .maxlen = MPTCP_SCHED_NAME_MAX,
|
|
+ };
|
|
+ int ret;
|
|
+
|
|
+ mptcp_get_default_scheduler(val);
|
|
+
|
|
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
|
|
+ if (write && ret == 0)
|
|
+ ret = mptcp_set_default_scheduler(val);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static struct ctl_table mptcp_table[] = {
|
|
+ {
|
|
+ .procname = "mptcp_enabled",
|
|
+ .data = &sysctl_mptcp_enabled,
|
|
+ .maxlen = sizeof(int),
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_dointvec
|
|
+ },
|
|
+ {
|
|
+ .procname = "mptcp_version",
|
|
+ .data = &sysctl_mptcp_version,
|
|
+ .mode = 0644,
|
|
+ .maxlen = sizeof(int),
|
|
+ .proc_handler = &proc_dointvec_minmax,
|
|
+ .extra1 = &min_mptcp_version,
|
|
+ .extra2 = &max_mptcp_version,
|
|
+ },
|
|
+ {
|
|
+ .procname = "mptcp_checksum",
|
|
+ .data = &sysctl_mptcp_checksum,
|
|
+ .maxlen = sizeof(int),
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_dointvec
|
|
+ },
|
|
+ {
|
|
+ .procname = "mptcp_debug",
|
|
+ .data = &sysctl_mptcp_debug,
|
|
+ .maxlen = sizeof(int),
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_dointvec
|
|
+ },
|
|
+ {
|
|
+ .procname = "mptcp_syn_retries",
|
|
+ .data = &sysctl_mptcp_syn_retries,
|
|
+ .maxlen = sizeof(int),
|
|
+ .mode = 0644,
|
|
+ .proc_handler = &proc_dointvec
|
|
+ },
|
|
+ {
|
|
+ .procname = "mptcp_path_manager",
|
|
+ .mode = 0644,
|
|
+ .maxlen = MPTCP_PM_NAME_MAX,
|
|
+ .proc_handler = proc_mptcp_path_manager,
|
|
+ },
|
|
+ {
|
|
+ .procname = "mptcp_scheduler",
|
|
+ .mode = 0644,
|
|
+ .maxlen = MPTCP_SCHED_NAME_MAX,
|
|
+ .proc_handler = proc_mptcp_scheduler,
|
|
+ },
|
|
+ { }
|
|
+};
|
|
+
|
|
+static inline u32 mptcp_hash_tk(u32 token, struct mptcp_hashtable *htable)
|
|
+{
|
|
+ return token & htable->mask;
|
|
+}
|
|
+
|
|
+struct mptcp_hashtable mptcp_tk_htable;
|
|
+EXPORT_SYMBOL(mptcp_tk_htable);
|
|
+
|
|
+/* The following hash table is used to avoid collision of token */
|
|
+static struct mptcp_hashtable mptcp_reqsk_tk_htb;
|
|
+
|
|
+/* Lock, protecting the two hash-tables that hold the token. Namely,
|
|
+ * mptcp_reqsk_tk_htb and tk_hashtable
|
|
+ */
|
|
+static spinlock_t mptcp_tk_hashlock;
|
|
+
|
|
+static bool mptcp_reqsk_find_tk(const u32 token)
|
|
+{
|
|
+ const u32 hash = mptcp_hash_tk(token, &mptcp_reqsk_tk_htb);
|
|
+ const struct mptcp_request_sock *mtreqsk;
|
|
+ const struct hlist_nulls_node *node;
|
|
+
|
|
+begin:
|
|
+ hlist_nulls_for_each_entry_rcu(mtreqsk, node,
|
|
+ &mptcp_reqsk_tk_htb.hashtable[hash],
|
|
+ hash_entry) {
|
|
+ if (token == mtreqsk->mptcp_loc_token)
|
|
+ return true;
|
|
+ }
|
|
+ /* A request-socket is destroyed by RCU. So, it might have been recycled
|
|
+ * and put into another hash-table list. So, after the lookup we may
|
|
+ * end up in a different list. So, we may need to restart.
|
|
+ *
|
|
+ * See also the comment in __inet_lookup_established.
|
|
+ */
|
|
+ if (get_nulls_value(node) != hash)
|
|
+ goto begin;
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static void mptcp_reqsk_insert_tk(struct request_sock *reqsk, const u32 token)
|
|
+{
|
|
+ u32 hash = mptcp_hash_tk(token, &mptcp_reqsk_tk_htb);
|
|
+
|
|
+ hlist_nulls_add_head_rcu(&mptcp_rsk(reqsk)->hash_entry,
|
|
+ &mptcp_reqsk_tk_htb.hashtable[hash]);
|
|
+}
|
|
+
|
|
+static void mptcp_reqsk_remove_tk(const struct request_sock *reqsk)
|
|
+{
|
|
+ rcu_read_lock();
|
|
+ local_bh_disable();
|
|
+ spin_lock(&mptcp_tk_hashlock);
|
|
+ hlist_nulls_del_init_rcu(&mptcp_rsk(reqsk)->hash_entry);
|
|
+ spin_unlock(&mptcp_tk_hashlock);
|
|
+ local_bh_enable();
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+
|
|
+void mptcp_reqsk_destructor(struct request_sock *req)
|
|
+{
|
|
+ if (!mptcp_rsk(req)->is_sub)
|
|
+ mptcp_reqsk_remove_tk(req);
|
|
+}
|
|
+
|
|
+static void __mptcp_hash_insert(struct tcp_sock *meta_tp, const u32 token)
|
|
+{
|
|
+ u32 hash = mptcp_hash_tk(token, &mptcp_tk_htable);
|
|
+
|
|
+ hlist_nulls_add_head_rcu(&meta_tp->tk_table,
|
|
+ &mptcp_tk_htable.hashtable[hash]);
|
|
+ meta_tp->inside_tk_table = 1;
|
|
+}
|
|
+
|
|
+static bool mptcp_find_token(u32 token)
|
|
+{
|
|
+ const u32 hash = mptcp_hash_tk(token, &mptcp_tk_htable);
|
|
+ const struct tcp_sock *meta_tp;
|
|
+ const struct hlist_nulls_node *node;
|
|
+
|
|
+begin:
|
|
+ hlist_nulls_for_each_entry_rcu(meta_tp, node,
|
|
+ &mptcp_tk_htable.hashtable[hash],
|
|
+ tk_table) {
|
|
+ if (token == meta_tp->mptcp_loc_token)
|
|
+ return true;
|
|
+ }
|
|
+ /* A TCP-socket is destroyed by RCU. So, it might have been recycled
|
|
+ * and put into another hash-table list. So, after the lookup we may
|
|
+ * end up in a different list. So, we may need to restart.
|
|
+ *
|
|
+ * See also the comment in __inet_lookup_established.
|
|
+ */
|
|
+ if (get_nulls_value(node) != hash)
|
|
+ goto begin;
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static void mptcp_set_key_reqsk(struct request_sock *req,
|
|
+ const struct sk_buff *skb,
|
|
+ u32 seed)
|
|
+{
|
|
+ const struct inet_request_sock *ireq = inet_rsk(req);
|
|
+ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
|
|
+
|
|
+ if (skb->protocol == htons(ETH_P_IP)) {
|
|
+ mtreq->mptcp_loc_key = mptcp_v4_get_key(ip_hdr(skb)->saddr,
|
|
+ ip_hdr(skb)->daddr,
|
|
+ htons(ireq->ir_num),
|
|
+ ireq->ir_rmt_port,
|
|
+ seed);
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ } else {
|
|
+ mtreq->mptcp_loc_key = mptcp_v6_get_key(ipv6_hdr(skb)->saddr.s6_addr32,
|
|
+ ipv6_hdr(skb)->daddr.s6_addr32,
|
|
+ htons(ireq->ir_num),
|
|
+ ireq->ir_rmt_port,
|
|
+ seed);
|
|
+#endif
|
|
+ }
|
|
+
|
|
+ mptcp_key_hash(mtreq->mptcp_ver, mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL);
|
|
+}
|
|
+
|
|
+/* New MPTCP-connection request, prepare a new token for the meta-socket that
|
|
+ * will be created in mptcp_check_req_master(), and store the received token.
|
|
+ */
|
|
+static void mptcp_reqsk_new_mptcp(struct request_sock *req,
|
|
+ const struct sock *sk,
|
|
+ const struct mptcp_options_received *mopt,
|
|
+ const struct sk_buff *skb)
|
|
+{
|
|
+ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
|
|
+
|
|
+ inet_rsk(req)->saw_mpc = 1;
|
|
+ mtreq->mptcp_ver = mopt->mptcp_ver;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ local_bh_disable();
|
|
+ spin_lock(&mptcp_tk_hashlock);
|
|
+ do {
|
|
+ mptcp_set_key_reqsk(req, skb, mptcp_seed++);
|
|
+ } while (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) ||
|
|
+ mptcp_find_token(mtreq->mptcp_loc_token));
|
|
+ mptcp_reqsk_insert_tk(req, mtreq->mptcp_loc_token);
|
|
+ spin_unlock(&mptcp_tk_hashlock);
|
|
+ local_bh_enable();
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ if (mtreq->mptcp_ver == MPTCP_VERSION_0) {
|
|
+ mtreq->mptcp_rem_key = mopt->mptcp_sender_key;
|
|
+ mtreq->rem_key_set = 1;
|
|
+ }
|
|
+}
|
|
+
|
|
+static int mptcp_reqsk_new_cookie(struct request_sock *req,
|
|
+ const struct sock *sk,
|
|
+ const struct mptcp_options_received *mopt,
|
|
+ const struct sk_buff *skb)
|
|
+{
|
|
+ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
|
|
+
|
|
+ /* Must happen before mptcp_set_key_reqsk to generate the token with
|
|
+ * the proper hash algo.
|
|
+ */
|
|
+ mtreq->mptcp_ver = mopt->mptcp_ver;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ local_bh_disable();
|
|
+ spin_lock(&mptcp_tk_hashlock);
|
|
+
|
|
+ mptcp_set_key_reqsk(req, skb, tcp_rsk(req)->snt_isn);
|
|
+
|
|
+ if (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) ||
|
|
+ mptcp_find_token(mtreq->mptcp_loc_token)) {
|
|
+ spin_unlock(&mptcp_tk_hashlock);
|
|
+ local_bh_enable();
|
|
+ rcu_read_unlock();
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ inet_rsk(req)->saw_mpc = 1;
|
|
+
|
|
+ spin_unlock(&mptcp_tk_hashlock);
|
|
+ local_bh_enable();
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ if (mtreq->mptcp_ver == MPTCP_VERSION_0) {
|
|
+ mtreq->mptcp_rem_key = mopt->mptcp_sender_key;
|
|
+ mtreq->rem_key_set = 1;
|
|
+ }
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static void mptcp_set_key_sk(const struct sock *sk)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ const struct inet_sock *isk = inet_sk(sk);
|
|
+
|
|
+ if (sk->sk_family == AF_INET)
|
|
+ tp->mptcp_loc_key = mptcp_v4_get_key(isk->inet_saddr,
|
|
+ isk->inet_daddr,
|
|
+ isk->inet_sport,
|
|
+ isk->inet_dport,
|
|
+ mptcp_seed++);
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ else
|
|
+ tp->mptcp_loc_key = mptcp_v6_get_key(inet6_sk(sk)->saddr.s6_addr32,
|
|
+ sk->sk_v6_daddr.s6_addr32,
|
|
+ isk->inet_sport,
|
|
+ isk->inet_dport,
|
|
+ mptcp_seed++);
|
|
+#endif
|
|
+
|
|
+ mptcp_key_hash(tp->mptcp_ver, tp->mptcp_loc_key, &tp->mptcp_loc_token, NULL);
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_JUMP_LABEL
|
|
+static atomic_t mptcp_needed_deferred;
|
|
+static atomic_t mptcp_wanted;
|
|
+
|
|
+static void mptcp_clear(struct work_struct *work)
|
|
+{
|
|
+ int deferred = atomic_xchg(&mptcp_needed_deferred, 0);
|
|
+ int wanted;
|
|
+
|
|
+ wanted = atomic_add_return(deferred, &mptcp_wanted);
|
|
+ if (wanted > 0)
|
|
+ static_key_enable(&mptcp_static_key);
|
|
+ else
|
|
+ static_key_disable(&mptcp_static_key);
|
|
+}
|
|
+
|
|
+static DECLARE_WORK(mptcp_work, mptcp_clear);
|
|
+#endif
|
|
+
|
|
+static void mptcp_enable_static_key_bh(void)
|
|
+{
|
|
+#ifdef CONFIG_JUMP_LABEL
|
|
+ int wanted;
|
|
+
|
|
+ while (1) {
|
|
+ wanted = atomic_read(&mptcp_wanted);
|
|
+ if (wanted <= 0)
|
|
+ break;
|
|
+ if (atomic_cmpxchg(&mptcp_wanted, wanted, wanted + 1) == wanted)
|
|
+ return;
|
|
+ }
|
|
+ atomic_inc(&mptcp_needed_deferred);
|
|
+ schedule_work(&mptcp_work);
|
|
+#else
|
|
+ static_key_slow_inc(&mptcp_static_key);
|
|
+#endif
|
|
+}
|
|
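For illustration only (not part of the patch): the "take a reference only if the key is already enabled, otherwise defer to process context" pattern of mptcp_enable_static_key_bh(), re-expressed with C11 atomics in user space (the kernel uses atomic_t plus schedule_work() because flipping a jump label may sleep).

#include <stdatomic.h>
#include <stdio.h>

static atomic_int wanted;    /* mirrors mptcp_wanted */
static atomic_int deferred;  /* mirrors mptcp_needed_deferred */

static void enable_from_atomic_context(void)
{
    int w = atomic_load(&wanted);

    while (w > 0) {
        /* fast path: the key is already enabled, just bump the count */
        if (atomic_compare_exchange_weak(&wanted, &w, w + 1))
            return;
    }
    /* slow path: record the request and let deferred work apply it */
    atomic_fetch_add(&deferred, 1);
    /* the kernel would schedule_work(&mptcp_work) here */
}

int main(void)
{
    enable_from_atomic_context();    /* defers: wanted is still 0 */
    atomic_store(&wanted, 1);        /* as if mptcp_clear() consumed the request */
    atomic_store(&deferred, 0);
    enable_from_atomic_context();    /* fast path: wanted becomes 2 */
    printf("wanted=%d deferred=%d\n",
           atomic_load(&wanted), atomic_load(&deferred));
    return 0;
}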
+
|
|
+static void mptcp_enable_static_key(void)
|
|
+{
|
|
+#ifdef CONFIG_JUMP_LABEL
|
|
+ atomic_inc(&mptcp_wanted);
|
|
+ static_key_enable(&mptcp_static_key);
|
|
+#else
|
|
+ static_key_slow_inc(&mptcp_static_key);
|
|
+#endif
|
|
+}
|
|
+
|
|
+void mptcp_disable_static_key(void)
|
|
+{
|
|
+#ifdef CONFIG_JUMP_LABEL
|
|
+ int wanted;
|
|
+
|
|
+ while (1) {
|
|
+ wanted = atomic_read(&mptcp_wanted);
|
|
+ if (wanted <= 1)
|
|
+ break;
|
|
+ if (atomic_cmpxchg(&mptcp_wanted, wanted, wanted - 1) == wanted)
|
|
+ return;
|
|
+ }
|
|
+ atomic_dec(&mptcp_needed_deferred);
|
|
+ schedule_work(&mptcp_work);
|
|
+#else
|
|
+ static_key_slow_dec(&mptcp_static_key);
|
|
+#endif
|
|
+}
|
|
+
|
|
+void mptcp_enable_sock(struct sock *sk)
|
|
+{
|
|
+ if (!sock_flag(sk, SOCK_MPTCP)) {
|
|
+ sock_set_flag(sk, SOCK_MPTCP);
|
|
+ tcp_sk(sk)->mptcp_ver = sysctl_mptcp_version;
|
|
+
|
|
+ /* Necessary here, because MPTCP can be enabled/disabled through
|
|
+ * a setsockopt.
|
|
+ */
|
|
+ if (sk->sk_family == AF_INET)
|
|
+ inet_csk(sk)->icsk_af_ops = &mptcp_v4_specific;
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ else if (mptcp_v6_is_v4_mapped(sk))
|
|
+ inet_csk(sk)->icsk_af_ops = &mptcp_v6_mapped;
|
|
+ else
|
|
+ inet_csk(sk)->icsk_af_ops = &mptcp_v6_specific;
|
|
+#endif
|
|
+
|
|
+ mptcp_enable_static_key();
|
|
+ }
|
|
+}
|
|
+
|
|
+void mptcp_disable_sock(struct sock *sk)
|
|
+{
|
|
+ if (sock_flag(sk, SOCK_MPTCP)) {
|
|
+ sock_reset_flag(sk, SOCK_MPTCP);
|
|
+
|
|
+ /* Necessary here, because MPTCP can be enabled/disabled through
|
|
+ * a setsockopt.
|
|
+ */
|
|
+ if (sk->sk_family == AF_INET)
|
|
+ inet_csk(sk)->icsk_af_ops = &ipv4_specific;
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ else if (mptcp_v6_is_v4_mapped(sk))
|
|
+ inet_csk(sk)->icsk_af_ops = &ipv6_mapped;
|
|
+ else
|
|
+ inet_csk(sk)->icsk_af_ops = &ipv6_specific;
|
|
+#endif
|
|
+
|
|
+ mptcp_disable_static_key();
|
|
+ }
|
|
+}
|
|
+
|
|
+void mptcp_connect_init(struct sock *sk)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+
|
|
+ rcu_read_lock();
|
|
+ local_bh_disable();
|
|
+ spin_lock(&mptcp_tk_hashlock);
|
|
+ do {
|
|
+ mptcp_set_key_sk(sk);
|
|
+ } while (mptcp_reqsk_find_tk(tp->mptcp_loc_token) ||
|
|
+ mptcp_find_token(tp->mptcp_loc_token));
|
|
+
|
|
+ __mptcp_hash_insert(tp, tp->mptcp_loc_token);
|
|
+ spin_unlock(&mptcp_tk_hashlock);
|
|
+ local_bh_enable();
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE);
|
|
+}
|
|
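For illustration only (not part of the patch): the collision-retry pattern used above, in user space. A random 64-bit local key is drawn and a 32-bit token derived from it until the token is unused; a trivial stand-in mixer and a small array replace the kernel's SHA-1 (MPTCP v0) / SHA-256 (MPTCP v1) derivation and its token hash tables, so the hash and names below are purely hypothetical.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>

#define TOKENS_MAX 1024
static uint32_t used_tokens[TOKENS_MAX];
static int used_count;

static bool token_in_use(uint32_t token)
{
    for (int i = 0; i < used_count; i++)
        if (used_tokens[i] == token)
            return true;
    return false;
}

static uint32_t derive_token(uint64_t key)
{
    /* stand-in mixer, NOT the real derivation (SHA-1/SHA-256 in the kernel) */
    key ^= key >> 33;
    key *= 0xff51afd7ed558ccdULL;
    key ^= key >> 33;
    return (uint32_t)key;
}

int main(void)
{
    uint64_t key;
    uint32_t token;

    do {
        key = ((uint64_t)rand() << 32) | (uint32_t)rand();
        token = derive_token(key);
    } while (token_in_use(token) && used_count < TOKENS_MAX);

    used_tokens[used_count++] = token;
    printf("local key %016llx -> token %08x\n",
           (unsigned long long)key, (unsigned)token);
    return 0;
}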
+
|
|
+/**
|
|
+ * This function increments the refcount of the mpcb struct.
|
|
+ * It is the responsibility of the caller to decrement when releasing
|
|
+ * the structure.
|
|
+ */
|
|
+struct sock *mptcp_hash_find(const struct net *net, const u32 token)
|
|
+{
|
|
+ const u32 hash = mptcp_hash_tk(token, &mptcp_tk_htable);
|
|
+ const struct tcp_sock *meta_tp;
|
|
+ struct sock *meta_sk = NULL;
|
|
+ const struct hlist_nulls_node *node;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ local_bh_disable();
|
|
+begin:
|
|
+ hlist_nulls_for_each_entry_rcu(meta_tp, node,
|
|
+ &mptcp_tk_htable.hashtable[hash],
|
|
+ tk_table) {
|
|
+ meta_sk = (struct sock *)meta_tp;
|
|
+ if (token == meta_tp->mptcp_loc_token &&
|
|
+ net_eq(net, sock_net(meta_sk))) {
|
|
+ if (unlikely(!refcount_inc_not_zero(&meta_sk->sk_refcnt)))
|
|
+ goto out;
|
|
+ if (unlikely(token != meta_tp->mptcp_loc_token ||
|
|
+ !net_eq(net, sock_net(meta_sk)))) {
|
|
+ sock_gen_put(meta_sk);
|
|
+ goto begin;
|
|
+ }
|
|
+ goto found;
|
|
+ }
|
|
+ }
|
|
+ /* A TCP-socket is destroyed by RCU. So, it might have been recycled
|
|
+ * and put into another hash-table list. So, after the lookup we may
|
|
+ * end up in a different list. So, we may need to restart.
|
|
+ *
|
|
+ * See also the comment in __inet_lookup_established.
|
|
+ */
|
|
+ if (get_nulls_value(node) != hash)
|
|
+ goto begin;
|
|
+out:
|
|
+ meta_sk = NULL;
|
|
+found:
|
|
+ local_bh_enable();
|
|
+ rcu_read_unlock();
|
|
+ return meta_sk;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(mptcp_hash_find);
|
|
+
|
|
+void mptcp_hash_remove_bh(struct tcp_sock *meta_tp)
|
|
+{
|
|
+ /* remove from the token hashtable */
|
|
+ rcu_read_lock();
|
|
+ local_bh_disable();
|
|
+ spin_lock(&mptcp_tk_hashlock);
|
|
+ hlist_nulls_del_init_rcu(&meta_tp->tk_table);
|
|
+ meta_tp->inside_tk_table = 0;
|
|
+ spin_unlock(&mptcp_tk_hashlock);
|
|
+ local_bh_enable();
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+
|
|
+struct sock *mptcp_select_ack_sock(const struct sock *meta_sk)
|
|
+{
|
|
+ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ struct sock *rttsk = NULL, *lastsk = NULL;
|
|
+ u32 min_time = 0, last_active = 0;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ mptcp_for_each_sub(meta_tp->mpcb, mptcp) {
|
|
+ struct sock *sk = mptcp_to_sock(mptcp);
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ u32 elapsed;
|
|
+
|
|
+ if (!mptcp_sk_can_send_ack(sk) || tp->pf)
|
|
+ continue;
|
|
+
|
|
+ elapsed = keepalive_time_elapsed(tp);
|
|
+
|
|
+ /* We take the one with the lowest RTT within a reasonable
|
|
+ * (meta-RTO)-timeframe
|
|
+ */
|
|
+ if (elapsed < inet_csk(meta_sk)->icsk_rto) {
|
|
+ if (!min_time || tp->srtt_us < min_time) {
|
|
+ min_time = tp->srtt_us;
|
|
+ rttsk = sk;
|
|
+ }
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /* Otherwise, we just take the most recent active */
|
|
+ if (!rttsk && (!last_active || elapsed < last_active)) {
|
|
+ last_active = elapsed;
|
|
+ lastsk = sk;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (rttsk)
|
|
+ return rttsk;
|
|
+
|
|
+ return lastsk;
|
|
+}
|
|
+EXPORT_SYMBOL(mptcp_select_ack_sock);
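The selection above prefers the lowest-RTT subflow among those active within one meta-level RTO and otherwise falls back to the least-idle one. The same two-tier policy reduced to plain C over an array of per-subflow (srtt, idle time) samples; the struct and units are illustrative only:

#include <stddef.h>
#include <stdint.h>

struct subflow_sample {
	uint32_t srtt_us;	/* smoothed RTT */
	uint32_t idle_ms;	/* time since the subflow was last active */
};

/* Return the index of the subflow to ACK on, or -1 if none is usable:
 * 1) lowest RTT among subflows idle for less than @rto_ms,
 * 2) otherwise the most recently active subflow.
 */
static int pick_ack_subflow(const struct subflow_sample *sf, size_t n,
			    uint32_t rto_ms)
{
	int best_rtt = -1, best_recent = -1;
	size_t i;

	for (i = 0; i < n; i++) {
		if (sf[i].idle_ms < rto_ms) {
			if (best_rtt < 0 || sf[i].srtt_us < sf[best_rtt].srtt_us)
				best_rtt = (int)i;
		} else if (best_rtt < 0 &&
			   (best_recent < 0 ||
			    sf[i].idle_ms < sf[best_recent].idle_ms)) {
			best_recent = (int)i;
		}
	}

	return best_rtt >= 0 ? best_rtt : best_recent;
}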
|
|
+
|
|
+static void mptcp_sock_def_error_report(struct sock *sk)
|
|
+{
|
|
+ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+
|
|
+ if (!sock_flag(sk, SOCK_DEAD)) {
|
|
+ if (tp->send_mp_fclose && sk->sk_err == ETIMEDOUT) {
|
|
+ /* Called by the keep alive timer (tcp_write_timeout),
|
|
+ * when the limit of fastclose retransmissions has been
|
|
+ * reached. Send a TCP RST to clear the status of any
|
|
+			 * stateful firewall (typically conntrack) that is
|
|
+ * not aware of mptcp and cannot understand the
|
|
+ * fastclose option.
|
|
+ */
|
|
+ tp->ops->send_active_reset(sk, GFP_ATOMIC);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* record this info that can be used by PM after the sf close */
|
|
+ tp->mptcp->sk_err = sk->sk_err;
|
|
+
|
|
+ if (!tp->tcp_disconnect && mptcp_in_infinite_mapping_weak(mpcb)) {
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+
|
|
+ meta_sk->sk_err = sk->sk_err;
|
|
+ meta_sk->sk_err_soft = sk->sk_err_soft;
|
|
+
|
|
+ if (!sock_flag(meta_sk, SOCK_DEAD))
|
|
+ meta_sk->sk_error_report(meta_sk);
|
|
+
|
|
+ WARN(meta_sk->sk_state == TCP_CLOSE,
|
|
+ "Meta already closed i_rcv %u i_snd %u send_i %u flags %#lx\n",
|
|
+ mpcb->infinite_mapping_rcv, mpcb->infinite_mapping_snd,
|
|
+ mpcb->send_infinite_mapping, meta_sk->sk_flags);
|
|
+
|
|
+ if (meta_sk->sk_state != TCP_CLOSE)
|
|
+ tcp_done(meta_sk);
|
|
+ }
|
|
+
|
|
+ sk->sk_err = 0;
|
|
+ return;
|
|
+}
|
|
+
|
|
+void mptcp_mpcb_put(struct mptcp_cb *mpcb)
|
|
+{
|
|
+ if (refcount_dec_and_test(&mpcb->mpcb_refcnt)) {
|
|
+ mptcp_cleanup_path_manager(mpcb);
|
|
+ mptcp_cleanup_scheduler(mpcb);
|
|
+ kfree(mpcb->master_info);
|
|
+ kmem_cache_free(mptcp_cb_cache, mpcb);
|
|
+ }
|
|
+}
|
|
+EXPORT_SYMBOL(mptcp_mpcb_put);
|
|
+
|
|
+static void mptcp_mpcb_cleanup(struct mptcp_cb *mpcb)
|
|
+{
|
|
+ struct mptcp_tw *mptw;
|
|
+
|
|
+ /* The mpcb is disappearing - we can make the final
|
|
+ * update to the rcv_nxt of the time-wait-sock and remove
|
|
+ * its reference to the mpcb.
|
|
+ */
|
|
+ spin_lock_bh(&mpcb->mpcb_list_lock);
|
|
+ list_for_each_entry_rcu(mptw, &mpcb->tw_list, list) {
|
|
+ list_del_rcu(&mptw->list);
|
|
+ mptw->in_list = 0;
|
|
+ mptcp_mpcb_put(mpcb);
|
|
+ rcu_assign_pointer(mptw->mpcb, NULL);
|
|
+ }
|
|
+ spin_unlock_bh(&mpcb->mpcb_list_lock);
|
|
+
|
|
+ mptcp_mpcb_put(mpcb);
|
|
+}
|
|
+
|
|
+static void mptcp_sock_destruct(struct sock *sk)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+
|
|
+ if (!is_meta_sk(sk)) {
|
|
+ BUG_ON(!hlist_unhashed(&tp->mptcp->cb_list));
|
|
+
|
|
+ kmem_cache_free(mptcp_sock_cache, tp->mptcp);
|
|
+ tp->mptcp = NULL;
|
|
+
|
|
+ /* Taken when mpcb pointer was set */
|
|
+ sock_put(mptcp_meta_sk(sk));
|
|
+ mptcp_mpcb_put(tp->mpcb);
|
|
+ } else {
|
|
+ mptcp_debug("%s destroying meta-sk token %#x\n", __func__,
|
|
+ tcp_sk(sk)->mpcb->mptcp_loc_token);
|
|
+
|
|
+ mptcp_mpcb_cleanup(tp->mpcb);
|
|
+ }
|
|
+
|
|
+ WARN_ON(!static_key_false(&mptcp_static_key));
|
|
+
|
|
+ /* Must be called here, because this will decrement the jump-label. */
|
|
+ inet_sock_destruct(sk);
|
|
+}
|
|
+
|
|
+void mptcp_destroy_sock(struct sock *sk)
|
|
+{
|
|
+ if (is_meta_sk(sk)) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct hlist_node *tmp;
|
|
+
|
|
+ __skb_queue_purge(&tcp_sk(sk)->mpcb->reinject_queue);
|
|
+
|
|
+ /* We have to close all remaining subflows. Normally, they
|
|
+ * should all be about to get closed. But, if the kernel is
|
|
+ * forcing a closure (e.g., tcp_write_err), the subflows might
|
|
+ * not have been closed properly (as we are waiting for the
|
|
+ * DATA_ACK of the DATA_FIN).
|
|
+ */
|
|
+ mptcp_for_each_sub_safe(tcp_sk(sk)->mpcb, mptcp, tmp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+
|
|
+ /* Already did call tcp_close - waiting for graceful
|
|
+ * closure, or if we are retransmitting fast-close on
|
|
+ * the subflow. The reset (or timeout) will kill the
|
|
+ * subflow..
|
|
+ */
|
|
+ if (tcp_sk(sk_it)->closing ||
|
|
+ tcp_sk(sk_it)->send_mp_fclose)
|
|
+ continue;
|
|
+
|
|
+ /* Allow the delayed work first to prevent time-wait state */
|
|
+ if (delayed_work_pending(&tcp_sk(sk_it)->mptcp->work))
|
|
+ continue;
|
|
+
|
|
+ mptcp_sub_close(sk_it, 0);
|
|
+ }
|
|
+ } else {
|
|
+ mptcp_del_sock(sk);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void mptcp_set_state(struct sock *sk)
|
|
+{
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+
|
|
+ /* Meta is not yet established - wake up the application */
|
|
+ if ((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) &&
|
|
+ sk->sk_state == TCP_ESTABLISHED) {
|
|
+ tcp_set_state(meta_sk, TCP_ESTABLISHED);
|
|
+
|
|
+ if (!sock_flag(meta_sk, SOCK_DEAD)) {
|
|
+ meta_sk->sk_state_change(meta_sk);
|
|
+ sk_wake_async(meta_sk, SOCK_WAKE_IO, POLL_OUT);
|
|
+ }
|
|
+
|
|
+ tcp_sk(meta_sk)->lsndtime = tcp_jiffies32;
|
|
+ }
|
|
+
|
|
+ if (sk->sk_state == TCP_CLOSE) {
|
|
+ if (!sock_flag(sk, SOCK_DEAD))
|
|
+ mptcp_sub_close(sk, 0);
|
|
+ }
|
|
+}
|
|
+
|
|
+static int mptcp_set_congestion_control(struct sock *meta_sk, const char *name,
|
|
+ bool load, bool reinit, bool cap_net_admin)
|
|
+{
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ int err, result = 0;
|
|
+
|
|
+ result = __tcp_set_congestion_control(meta_sk, name, load, reinit, cap_net_admin);
|
|
+
|
|
+ tcp_sk(meta_sk)->mpcb->tcp_ca_explicit_set = true;
|
|
+
|
|
+ mptcp_for_each_sub(tcp_sk(meta_sk)->mpcb, mptcp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+
|
|
+ err = __tcp_set_congestion_control(sk_it, name, load, reinit, cap_net_admin);
|
|
+ if (err)
|
|
+ result = err;
|
|
+ }
|
|
+ return result;
|
|
+}
|
|
+
|
|
+static void mptcp_assign_congestion_control(struct sock *sk)
|
|
+{
|
|
+ struct inet_connection_sock *icsk = inet_csk(sk);
|
|
+ struct inet_connection_sock *meta_icsk = inet_csk(mptcp_meta_sk(sk));
|
|
+ const struct tcp_congestion_ops *ca = meta_icsk->icsk_ca_ops;
|
|
+
|
|
+ /* Congestion control is the same as meta. Thus, it has been
|
|
+ * try_module_get'd by tcp_assign_congestion_control.
|
|
+ * Congestion control on meta was not explicitly configured by
|
|
+ * application, leave default or route based.
|
|
+ */
|
|
+ if (icsk->icsk_ca_ops == ca ||
|
|
+ !tcp_sk(mptcp_meta_sk(sk))->mpcb->tcp_ca_explicit_set)
|
|
+ return;
|
|
+
|
|
+ /* Use the same congestion control as set on the meta-sk */
|
|
+ if (!try_module_get(ca->owner)) {
|
|
+ /* This should never happen. The congestion control is linked
|
|
+ * to the meta-socket (through tcp_assign_congestion_control)
|
|
+ * who "holds" the refcnt on the module.
|
|
+ */
|
|
+ WARN(1, "Could not get the congestion control!");
|
|
+ return;
|
|
+ }
|
|
+ module_put(icsk->icsk_ca_ops->owner);
|
|
+ icsk->icsk_ca_ops = ca;
|
|
+
|
|
+ /* Clear out private data before diag gets it and
|
|
+ * the ca has not been initialized.
|
|
+ */
|
|
+ if (ca->get_info)
|
|
+ memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
|
|
+
|
|
+ return;
|
|
+}
|
|
+
|
|
+siphash_key_t mptcp_secret __read_mostly;
|
|
+u32 mptcp_seed = 0;
|
|
+
|
|
+#define SHA256_DIGEST_WORDS (SHA256_DIGEST_SIZE / 4)
|
|
+
|
|
+static void mptcp_key_sha256(const u64 key, u32 *token, u64 *idsn)
|
|
+{
|
|
+ u32 mptcp_hashed_key[SHA256_DIGEST_WORDS];
|
|
+ struct sha256_state state;
|
|
+
|
|
+ sha256_init(&state);
|
|
+ sha256_update(&state, (const u8 *)&key, sizeof(key));
|
|
+ sha256_final(&state, (u8 *)mptcp_hashed_key);
|
|
+
|
|
+ if (token)
|
|
+ *token = mptcp_hashed_key[0];
|
|
+ if (idsn)
|
|
+ *idsn = ntohll(*((__be64 *)&mptcp_hashed_key[6]));
|
|
+}
|
|
+
|
|
+static void mptcp_hmac_sha256(const u8 *key_1, const u8 *key_2, u8 *hash_out,
|
|
+ int arg_num, va_list list)
|
|
+{
|
|
+ u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE];
|
|
+ struct sha256_state state;
|
|
+ int index, msg_length;
|
|
+ int length = 0;
|
|
+ u8 *msg;
|
|
+ int i;
|
|
+
|
|
+ /* Generate key xored with ipad */
|
|
+ memset(input, 0x36, SHA256_BLOCK_SIZE);
|
|
+ for (i = 0; i < 8; i++)
|
|
+ input[i] ^= key_1[i];
|
|
+ for (i = 0; i < 8; i++)
|
|
+ input[i + 8] ^= key_2[i];
|
|
+
|
|
+ index = SHA256_BLOCK_SIZE;
|
|
+ msg_length = 0;
|
|
+ for (i = 0; i < arg_num; i++) {
|
|
+ length = va_arg(list, int);
|
|
+ msg = va_arg(list, u8 *);
|
|
+ BUG_ON(index + length >= sizeof(input)); /* Message is too long */
|
|
+ memcpy(&input[index], msg, length);
|
|
+ index += length;
|
|
+ msg_length += length;
|
|
+ }
|
|
+
|
|
+ sha256_init(&state);
|
|
+ sha256_update(&state, input, SHA256_BLOCK_SIZE + msg_length);
|
|
+ sha256_final(&state, &input[SHA256_BLOCK_SIZE]);
|
|
+
|
|
+ /* Prepare second part of hmac */
|
|
+ memset(input, 0x5C, SHA256_BLOCK_SIZE);
|
|
+ for (i = 0; i < 8; i++)
|
|
+ input[i] ^= key_1[i];
|
|
+ for (i = 0; i < 8; i++)
|
|
+ input[i + 8] ^= key_2[i];
|
|
+
|
|
+ sha256_init(&state);
|
|
+ sha256_update(&state, input, sizeof(input));
|
|
+ sha256_final(&state, hash_out);
|
|
+}
|
|
+
|
|
+static void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn)
|
|
+{
|
|
+ u32 workspace[SHA_WORKSPACE_WORDS];
|
|
+ u32 mptcp_hashed_key[SHA_DIGEST_WORDS];
|
|
+ u8 input[64];
|
|
+ int i;
|
|
+
|
|
+ memset(workspace, 0, sizeof(workspace));
|
|
+
|
|
+ /* Initialize input with appropriate padding */
|
|
+ memset(&input[9], 0, sizeof(input) - 10); /* -10, because the last byte
|
|
+ * is explicitly set too
|
|
+ */
|
|
+ memcpy(input, &key, sizeof(key)); /* Copy key to the msg beginning */
|
|
+ input[8] = 0x80; /* Padding: First bit after message = 1 */
|
|
+ input[63] = 0x40; /* Padding: Length of the message = 64 bits */
|
|
+
|
|
+ sha_init(mptcp_hashed_key);
|
|
+ sha_transform(mptcp_hashed_key, input, workspace);
|
|
+
|
|
+ for (i = 0; i < 5; i++)
|
|
+ mptcp_hashed_key[i] = (__force u32)cpu_to_be32(mptcp_hashed_key[i]);
|
|
+
|
|
+ if (token)
|
|
+ *token = mptcp_hashed_key[0];
|
|
+ if (idsn)
|
|
+ *idsn = ntohll(*((__be64 *)&mptcp_hashed_key[3]));
|
|
+}
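For both hash variants the 32-bit token is the most significant 32 bits of the digest and the 64-bit IDSN its least significant 64 bits (words 3-4 of the SHA-1 digest, words 6-7 of the SHA-256 digest). A small portable sketch of that extraction over an opaque digest buffer, reading both values big-endian for illustration:

#include <stddef.h>
#include <stdint.h>

/* token = first 32 bits of the digest, idsn = last 64 bits. */
static void digest_to_token_idsn(const uint8_t *digest, size_t len,
				 uint32_t *token, uint64_t *idsn)
{
	size_t i;

	*token = 0;
	for (i = 0; i < 4; i++)
		*token = (*token << 8) | digest[i];

	*idsn = 0;
	for (i = len - 8; i < len; i++)
		*idsn = (*idsn << 8) | digest[i];
}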
|
|
+
|
|
+static void mptcp_key_hash(u8 version, u64 key, u32 *token, u64 *idsn)
|
|
+{
|
|
+ if (version == MPTCP_VERSION_0)
|
|
+ mptcp_key_sha1(key, token, idsn);
|
|
+ else if (version >= MPTCP_VERSION_1)
|
|
+ mptcp_key_sha256(key, token, idsn);
|
|
+}
|
|
+
|
|
+static void mptcp_hmac_sha1(const u8 *key_1, const u8 *key_2, u32 *hash_out,
|
|
+ int arg_num, va_list list)
|
|
+{
|
|
+ u32 workspace[SHA_WORKSPACE_WORDS];
|
|
+ u8 input[128]; /* 2 512-bit blocks */
|
|
+ int i;
|
|
+ int index;
|
|
+ int length;
|
|
+ u8 *msg;
|
|
+
|
|
+ memset(workspace, 0, sizeof(workspace));
|
|
+
|
|
+ /* Generate key xored with ipad */
|
|
+ memset(input, 0x36, 64);
|
|
+ for (i = 0; i < 8; i++)
|
|
+ input[i] ^= key_1[i];
|
|
+ for (i = 0; i < 8; i++)
|
|
+ input[i + 8] ^= key_2[i];
|
|
+
|
|
+ index = 64;
|
|
+ for (i = 0; i < arg_num; i++) {
|
|
+ length = va_arg(list, int);
|
|
+ msg = va_arg(list, u8 *);
|
|
+ BUG_ON(index + length > 125); /* Message is too long */
|
|
+ memcpy(&input[index], msg, length);
|
|
+ index += length;
|
|
+ }
|
|
+
|
|
+ input[index] = 0x80; /* Padding: First bit after message = 1 */
|
|
+ memset(&input[index + 1], 0, (126 - index));
|
|
+
|
|
+ /* Padding: Length of the message = 512 + message length (bits) */
|
|
+ input[126] = 0x02;
|
|
+ input[127] = ((index - 64) * 8); /* Message length (bits) */
|
|
+
|
|
+ sha_init(hash_out);
|
|
+ sha_transform(hash_out, input, workspace);
|
|
+ memset(workspace, 0, sizeof(workspace));
|
|
+
|
|
+ sha_transform(hash_out, &input[64], workspace);
|
|
+ memset(workspace, 0, sizeof(workspace));
|
|
+
|
|
+ for (i = 0; i < 5; i++)
|
|
+ hash_out[i] = (__force u32)cpu_to_be32(hash_out[i]);
|
|
+
|
|
+ /* Prepare second part of hmac */
|
|
+ memset(input, 0x5C, 64);
|
|
+ for (i = 0; i < 8; i++)
|
|
+ input[i] ^= key_1[i];
|
|
+ for (i = 0; i < 8; i++)
|
|
+ input[i + 8] ^= key_2[i];
|
|
+
|
|
+ memcpy(&input[64], hash_out, 20);
|
|
+ input[84] = 0x80;
|
|
+ memset(&input[85], 0, 41);
|
|
+
|
|
+ /* Padding: Length of the message = 512 + 160 bits */
|
|
+ input[126] = 0x02;
|
|
+ input[127] = 0xA0;
|
|
+
|
|
+ sha_init(hash_out);
|
|
+ sha_transform(hash_out, input, workspace);
|
|
+ memset(workspace, 0, sizeof(workspace));
|
|
+
|
|
+ sha_transform(hash_out, &input[64], workspace);
|
|
+
|
|
+ for (i = 0; i < 5; i++)
|
|
+ hash_out[i] = (__force u32)cpu_to_be32(hash_out[i]);
|
|
+}
|
|
+
|
|
+void mptcp_hmac(u8 ver, const u8 *key_1, const u8 *key_2, u8 *hash_out,
|
|
+ int arg_num, ...)
|
|
+{
|
|
+ va_list args;
|
|
+
|
|
+ va_start(args, arg_num);
|
|
+ if (ver == MPTCP_VERSION_0)
|
|
+ mptcp_hmac_sha1(key_1, key_2, (u32 *)hash_out, arg_num, args);
|
|
+ else if (ver >= MPTCP_VERSION_1)
|
|
+ mptcp_hmac_sha256(key_1, key_2, hash_out, arg_num, args);
|
|
+ va_end(args);
|
|
+}
|
|
+EXPORT_SYMBOL(mptcp_hmac);
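Both helpers implement the standard two-pass HMAC construction, H((K ^ opad) || H((K ^ ipad) || msg)), where K is the 16-byte concatenation key_1 || key_2 XORed into the first bytes of the 64-byte ipad/opad block (the remaining pad bytes stay 0x36/0x5C). A compact, self-contained sketch of the same structure; hash_fn() is a toy, non-cryptographic stand-in for the SHA transforms, not a kernel API:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define BLOCK_SIZE  64	/* SHA-1/SHA-256 block size */
#define DIGEST_SIZE 32

/* Toy stand-in so the sketch compiles; not a real hash function. */
static void hash_fn(const uint8_t *data, size_t len, uint8_t out[DIGEST_SIZE])
{
	uint64_t acc = 0x6a09e667f3bcc908ULL;
	size_t i;

	for (i = 0; i < len; i++)
		acc = (acc ^ data[i]) * 0x100000001b3ULL;
	for (i = 0; i < DIGEST_SIZE; i++)
		out[i] = (uint8_t)(acc >> ((i % 8) * 8));
}

/* HMAC(key_1 || key_2, msg); assumes msg_len <= DIGEST_SIZE, as in the
 * MPTCP handshake where the message is two 32-bit nonces.
 */
static void hmac_sketch(const uint8_t key_1[8], const uint8_t key_2[8],
			const uint8_t *msg, size_t msg_len,
			uint8_t out[DIGEST_SIZE])
{
	uint8_t block[BLOCK_SIZE + DIGEST_SIZE];
	uint8_t inner[DIGEST_SIZE];
	size_t i;

	memset(block, 0x36, BLOCK_SIZE);		/* ipad */
	for (i = 0; i < 8; i++) {
		block[i]     ^= key_1[i];
		block[i + 8] ^= key_2[i];
	}
	memcpy(block + BLOCK_SIZE, msg, msg_len);
	hash_fn(block, BLOCK_SIZE + msg_len, inner);	/* inner hash */

	memset(block, 0x5C, BLOCK_SIZE);		/* opad */
	for (i = 0; i < 8; i++) {
		block[i]     ^= key_1[i];
		block[i + 8] ^= key_2[i];
	}
	memcpy(block + BLOCK_SIZE, inner, DIGEST_SIZE);
	hash_fn(block, BLOCK_SIZE + DIGEST_SIZE, out);	/* outer hash */
}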
|
|
+
|
|
+static void mptcp_mpcb_inherit_sockopts(struct sock *meta_sk, struct sock *master_sk)
|
|
+{
|
|
+ /* Socket-options handled by sk_clone_lock while creating the meta-sk.
|
|
+ * ======
|
|
+ * SO_SNDBUF, SO_SNDBUFFORCE, SO_RCVBUF, SO_RCVBUFFORCE, SO_RCVLOWAT,
|
|
+ * SO_RCVTIMEO, SO_SNDTIMEO, SO_ATTACH_FILTER, SO_DETACH_FILTER,
|
|
+ * TCP_NODELAY, TCP_CORK
|
|
+ *
|
|
+ * Socket-options handled in this function here
|
|
+ * ======
|
|
+ * TCP_DEFER_ACCEPT
|
|
+ * SO_KEEPALIVE
|
|
+ *
|
|
+ * Socket-options on the todo-list
|
|
+ * ======
|
|
+ * SO_BINDTODEVICE - should probably prevent creation of new subsocks
|
|
+ * across other devices. - what about the api-draft?
|
|
+ * SO_DEBUG
|
|
+ * SO_REUSEADDR - probably we don't care about this
|
|
+ * SO_DONTROUTE, SO_BROADCAST
|
|
+ * SO_OOBINLINE
|
|
+ * SO_LINGER
|
|
+ * SO_TIMESTAMP* - I don't think this is of concern for a SOCK_STREAM
|
|
+ * SO_PASSSEC - I don't think this is of concern for a SOCK_STREAM
|
|
+ * SO_RXQ_OVFL
|
|
+ * TCP_COOKIE_TRANSACTIONS
|
|
+ * TCP_MAXSEG
|
|
+ * TCP_THIN_* - Handled by sk_clone_lock, but we need to support this
|
|
+ * in mptcp_meta_retransmit_timer. AND we need to check
|
|
+ * what is about the subsockets.
|
|
+ * TCP_LINGER2
|
|
+ * TCP_WINDOW_CLAMP
|
|
+ * TCP_USER_TIMEOUT
|
|
+ * TCP_MD5SIG
|
|
+ *
|
|
+ * Socket-options of no concern for the meta-socket (but for the subsocket)
|
|
+ * ======
|
|
+ * SO_PRIORITY
|
|
+ * SO_MARK
|
|
+ * TCP_CONGESTION
|
|
+ * TCP_SYNCNT
|
|
+ * TCP_QUICKACK
|
|
+ */
|
|
+
|
|
+ /* DEFER_ACCEPT should not be set on the meta, as we want to accept new subflows directly */
|
|
+ inet_csk(meta_sk)->icsk_accept_queue.rskq_defer_accept = 0;
|
|
+
|
|
+ /* Keepalives are handled entirely at the MPTCP-layer */
|
|
+ if (sock_flag(meta_sk, SOCK_KEEPOPEN)) {
|
|
+ inet_csk_reset_keepalive_timer(meta_sk,
|
|
+ keepalive_time_when(tcp_sk(meta_sk)));
|
|
+ sock_reset_flag(master_sk, SOCK_KEEPOPEN);
|
|
+ inet_csk_delete_keepalive_timer(master_sk);
|
|
+ }
|
|
+
|
|
+ /* Do not propagate subflow-errors up to the MPTCP-layer */
|
|
+ inet_sk(master_sk)->recverr = 0;
|
|
+}
|
|
+
|
|
+/* Called without holding lock on meta_sk */
|
|
+static void mptcp_sub_inherit_sockopts(const struct sock *meta_sk, struct sock *sub_sk)
|
|
+{
|
|
+ __u8 meta_tos;
|
|
+
|
|
+ /* IP_TOS also goes to the subflow. */
|
|
+ meta_tos = READ_ONCE(inet_sk(meta_sk)->tos);
|
|
+ if (inet_sk(sub_sk)->tos != meta_tos) {
|
|
+ inet_sk(sub_sk)->tos = meta_tos;
|
|
+ sub_sk->sk_priority = meta_sk->sk_priority;
|
|
+ sk_dst_reset(sub_sk);
|
|
+ }
|
|
+
|
|
+ /* IPV6_TCLASS */
|
|
+ if (sub_sk->sk_family == AF_INET6 && meta_sk->sk_family == AF_INET6)
|
|
+ inet6_sk(sub_sk)->tclass = inet6_sk(meta_sk)->tclass;
|
|
+
|
|
+ /* Inherit SO_REUSEADDR */
|
|
+ sub_sk->sk_reuse = meta_sk->sk_reuse;
|
|
+
|
|
+ /* Inherit SO_MARK: can be used for routing or filtering */
|
|
+ sub_sk->sk_mark = meta_sk->sk_mark;
|
|
+
|
|
+ /* Inherit snd/rcv-buffer locks */
|
|
+ sub_sk->sk_userlocks = meta_sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
|
|
+
|
|
+ /* Nagle/Cork is forced off on the subflows. It is handled at the meta-layer */
|
|
+ tcp_sk(sub_sk)->nonagle = TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
|
|
+
|
|
+ /* Keepalives are handled entirely at the MPTCP-layer */
|
|
+ if (sock_flag(sub_sk, SOCK_KEEPOPEN)) {
|
|
+ sock_reset_flag(sub_sk, SOCK_KEEPOPEN);
|
|
+ inet_csk_delete_keepalive_timer(sub_sk);
|
|
+ }
|
|
+
|
|
+ /* Do not propagate subflow-errors up to the MPTCP-layer */
|
|
+ inet_sk(sub_sk)->recverr = 0;
|
|
+}
|
|
+
|
|
+void mptcp_prepare_for_backlog(struct sock *sk, struct sk_buff *skb)
|
|
+{
|
|
+ /* In case of success (in mptcp_backlog_rcv) and error (in kfree_skb) of
|
|
+ * sk_add_backlog, we will decrement the sk refcount.
|
|
+ */
|
|
+ sock_hold(sk);
|
|
+ skb->sk = sk;
|
|
+ skb->destructor = sock_efree;
|
|
+}
|
|
+
|
|
+int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb)
|
|
+{
|
|
+	/* skb->sk may be NULL if we receive a packet immediately after the
|
|
+ * SYN/ACK + MP_CAPABLE.
|
|
+ */
|
|
+ struct sock *sk = skb->sk ? skb->sk : meta_sk;
|
|
+ int ret = 0;
|
|
+
|
|
+ if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) {
|
|
+ kfree_skb(skb);
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ /* Decrement sk refcnt when calling the skb destructor.
|
|
+ * Refcnt is incremented and skb destructor is set in tcp_v{4,6}_rcv via
|
|
+	 * mptcp_prepare_for_backlog() above.
|
|
+ */
|
|
+ skb_orphan(skb);
|
|
+
|
|
+ if (sk->sk_family == AF_INET)
|
|
+ ret = tcp_v4_do_rcv(sk, skb);
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ else
|
|
+ ret = tcp_v6_do_rcv(sk, skb);
|
|
+#endif
|
|
+
|
|
+ sock_put(sk);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static void mptcp_init_buffer_space(struct sock *sk)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ int space;
|
|
+
|
|
+ tcp_init_buffer_space(sk);
|
|
+
|
|
+ if (is_master_tp(tp)) {
|
|
+ meta_tp->rcvq_space.space = meta_tp->rcv_wnd;
|
|
+ tcp_mstamp_refresh(meta_tp);
|
|
+ meta_tp->rcvq_space.time = meta_tp->tcp_mstamp;
|
|
+ meta_tp->rcvq_space.seq = meta_tp->copied_seq;
|
|
+
|
|
+ /* If there is only one subflow, we just use regular TCP
|
|
+ * autotuning. User-locks are handled already by
|
|
+ * tcp_init_buffer_space
|
|
+ */
|
|
+ meta_tp->window_clamp = tp->window_clamp;
|
|
+ meta_tp->rcv_ssthresh = tp->rcv_ssthresh;
|
|
+ meta_sk->sk_rcvbuf = sk->sk_rcvbuf;
|
|
+ meta_sk->sk_sndbuf = sk->sk_sndbuf;
|
|
+
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (meta_sk->sk_userlocks & SOCK_RCVBUF_LOCK)
|
|
+ goto snd_buf;
|
|
+
|
|
+ /* Adding a new subflow to the rcv-buffer space. We make a simple
|
|
+ * addition, to give some space to allow traffic on the new subflow.
|
|
+ * Autotuning will increase it further later on.
|
|
+ */
|
|
+ space = min(meta_sk->sk_rcvbuf + sk->sk_rcvbuf,
|
|
+ sock_net(meta_sk)->ipv4.sysctl_tcp_rmem[2]);
|
|
+ if (space > meta_sk->sk_rcvbuf) {
|
|
+ meta_tp->window_clamp += tp->window_clamp;
|
|
+ meta_tp->rcv_ssthresh += tp->rcv_ssthresh;
|
|
+ meta_sk->sk_rcvbuf = space;
|
|
+ }
|
|
+
|
|
+snd_buf:
|
|
+ if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK)
|
|
+ return;
|
|
+
|
|
+ /* Adding a new subflow to the send-buffer space. We make a simple
|
|
+ * addition, to give some space to allow traffic on the new subflow.
|
|
+ * Autotuning will increase it further later on.
|
|
+ */
|
|
+ space = min(meta_sk->sk_sndbuf + sk->sk_sndbuf,
|
|
+ sock_net(meta_sk)->ipv4.sysctl_tcp_wmem[2]);
|
|
+ if (space > meta_sk->sk_sndbuf) {
|
|
+ meta_sk->sk_sndbuf = space;
|
|
+ meta_sk->sk_write_space(meta_sk);
|
|
+ }
|
|
+}
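Each extra established subflow simply grows the meta receive and send buffers by the subflow's buffer, clamped to tcp_rmem[2]/tcp_wmem[2], and the meta buffer is never shrunk. The clamping rule in isolation (values are illustrative):

#include <stdint.h>

/* Grow @meta_buf by @sub_buf, never beyond @sysctl_max and never below
 * the current value - mirrors the rcvbuf/sndbuf handling above.
 */
static int32_t grow_buffer(int32_t meta_buf, int32_t sub_buf, int32_t sysctl_max)
{
	int32_t space = meta_buf + sub_buf;

	if (space > sysctl_max)
		space = sysctl_max;
	if (space < meta_buf)
		space = meta_buf;	/* only ever grow */
	return space;
}

/* Example: a 4 MiB meta buffer plus a 2 MiB subflow buffer under a
 * 6 MiB sysctl limit yields 6 MiB; any further subflow leaves it there.
 */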
|
|
+
|
|
+struct lock_class_key meta_key;
|
|
+char *meta_key_name = "sk_lock-AF_INET-MPTCP";
|
|
+struct lock_class_key meta_slock_key;
|
|
+char *meta_slock_key_name = "slock-AF_INET-MPTCP";
|
|
+
|
|
+static const struct tcp_sock_ops mptcp_meta_specific = {
|
|
+ .__select_window = __mptcp_select_window,
|
|
+ .select_window = mptcp_select_window,
|
|
+ .select_initial_window = mptcp_select_initial_window,
|
|
+ .init_buffer_space = mptcp_init_buffer_space,
|
|
+ .set_rto = mptcp_tcp_set_rto,
|
|
+ .should_expand_sndbuf = mptcp_should_expand_sndbuf,
|
|
+ .send_fin = mptcp_send_fin,
|
|
+ .write_xmit = mptcp_write_xmit,
|
|
+ .send_active_reset = mptcp_send_active_reset,
|
|
+ .write_wakeup = mptcp_write_wakeup,
|
|
+ .retransmit_timer = mptcp_meta_retransmit_timer,
|
|
+ .time_wait = mptcp_time_wait,
|
|
+ .cleanup_rbuf = mptcp_cleanup_rbuf,
|
|
+ .set_cong_ctrl = mptcp_set_congestion_control,
|
|
+};
|
|
+
|
|
+static const struct tcp_sock_ops mptcp_sub_specific = {
|
|
+ .__select_window = __mptcp_select_window,
|
|
+ .select_window = mptcp_select_window,
|
|
+ .select_initial_window = mptcp_select_initial_window,
|
|
+ .init_buffer_space = mptcp_init_buffer_space,
|
|
+ .set_rto = mptcp_tcp_set_rto,
|
|
+ .should_expand_sndbuf = mptcp_should_expand_sndbuf,
|
|
+ .send_fin = tcp_send_fin,
|
|
+ .write_xmit = tcp_write_xmit,
|
|
+ .send_active_reset = tcp_send_active_reset,
|
|
+ .write_wakeup = tcp_write_wakeup,
|
|
+ .retransmit_timer = mptcp_sub_retransmit_timer,
|
|
+ .time_wait = tcp_time_wait,
|
|
+ .cleanup_rbuf = tcp_cleanup_rbuf,
|
|
+ .set_cong_ctrl = __tcp_set_congestion_control,
|
|
+};
|
|
+
|
|
+void mptcp_initialize_recv_vars(struct tcp_sock *meta_tp, struct mptcp_cb *mpcb,
|
|
+ __u64 remote_key)
|
|
+{
|
|
+ u64 idsn;
|
|
+
|
|
+ mpcb->mptcp_rem_key = remote_key;
|
|
+ mpcb->rem_key_set = 1;
|
|
+ mptcp_key_hash(mpcb->mptcp_ver, mpcb->mptcp_rem_key, &mpcb->mptcp_rem_token, &idsn);
|
|
+
|
|
+ idsn++;
|
|
+ mpcb->rcv_high_order[0] = idsn >> 32;
|
|
+ mpcb->rcv_high_order[1] = mpcb->rcv_high_order[0] + 1;
|
|
+ meta_tp->copied_seq = (u32)idsn;
|
|
+ meta_tp->rcv_nxt = (u32)idsn;
|
|
+ meta_tp->rcv_wup = (u32)idsn;
|
|
+ meta_tp->rcv_right_edge = meta_tp->rcv_wup + meta_tp->rcv_wnd;
|
|
+
|
|
+ meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1;
|
|
+}
|
|
+
|
|
+static int mptcp_alloc_mpcb(struct sock *meta_sk, __u64 remote_key,
|
|
+ int rem_key_set, __u8 mptcp_ver, u32 window)
|
|
+{
|
|
+ struct mptcp_cb *mpcb;
|
|
+ struct sock *master_sk;
|
|
+ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
|
|
+ struct tcp_sock *master_tp, *meta_tp = tcp_sk(meta_sk);
|
|
+ u64 snd_idsn;
|
|
+
|
|
+ dst_release(meta_sk->sk_rx_dst);
|
|
+ meta_sk->sk_rx_dst = NULL;
|
|
+ /* This flag is set to announce sock_lock_init to
|
|
+ * reclassify the lock-class of the master socket.
|
|
+ */
|
|
+ meta_tp->is_master_sk = 1;
|
|
+ master_sk = sk_clone_lock(meta_sk, GFP_ATOMIC | __GFP_ZERO);
|
|
+ meta_tp->is_master_sk = 0;
|
|
+ if (!master_sk)
|
|
+ goto err_alloc_master;
|
|
+
|
|
+ /* Same as in inet_csk_clone_lock - need to init to 0 */
|
|
+ memset(&inet_csk(master_sk)->icsk_accept_queue, 0,
|
|
+ sizeof(inet_csk(master_sk)->icsk_accept_queue));
|
|
+
|
|
+ master_tp = tcp_sk(master_sk);
|
|
+ master_tp->inside_tk_table = 0;
|
|
+
|
|
+ mpcb = kmem_cache_zalloc(mptcp_cb_cache, GFP_ATOMIC);
|
|
+ if (!mpcb)
|
|
+ goto err_alloc_mpcb;
|
|
+
|
|
+ /* Store the mptcp version agreed on initial handshake */
|
|
+ mpcb->mptcp_ver = mptcp_ver;
|
|
+
|
|
+ /* Store the keys and generate the peer's token */
|
|
+ mpcb->mptcp_loc_key = meta_tp->mptcp_loc_key;
|
|
+ mpcb->mptcp_loc_token = meta_tp->mptcp_loc_token;
|
|
+
|
|
+ /* Generate Initial data-sequence-numbers */
|
|
+ mptcp_key_hash(mpcb->mptcp_ver, mpcb->mptcp_loc_key, NULL, &snd_idsn);
|
|
+ snd_idsn++;
|
|
+ mpcb->snd_high_order[0] = snd_idsn >> 32;
|
|
+ mpcb->snd_high_order[1] = mpcb->snd_high_order[0] - 1;
|
|
+
|
|
+ mpcb->meta_sk = meta_sk;
|
|
+ mpcb->master_sk = master_sk;
|
|
+
|
|
+ skb_queue_head_init(&mpcb->reinject_queue);
|
|
+ mutex_init(&mpcb->mpcb_mutex);
|
|
+
|
|
+ /* Init time-wait stuff */
|
|
+ INIT_LIST_HEAD(&mpcb->tw_list);
|
|
+
|
|
+ INIT_HLIST_HEAD(&mpcb->callback_list);
|
|
+ INIT_HLIST_HEAD(&mpcb->conn_list);
|
|
+ spin_lock_init(&mpcb->mpcb_list_lock);
|
|
+
|
|
+ mpcb->orig_sk_rcvbuf = meta_sk->sk_rcvbuf;
|
|
+ mpcb->orig_sk_sndbuf = meta_sk->sk_sndbuf;
|
|
+ mpcb->orig_window_clamp = meta_tp->window_clamp;
|
|
+
|
|
+ /* The meta is directly linked - set refcnt to 1 */
|
|
+ refcount_set(&mpcb->mpcb_refcnt, 1);
|
|
+
|
|
+ if (!meta_tp->inside_tk_table) {
|
|
+ /* Adding the meta_tp in the token hashtable - coming from server-side */
|
|
+ rcu_read_lock();
|
|
+ local_bh_disable();
|
|
+ spin_lock(&mptcp_tk_hashlock);
|
|
+
|
|
+ /* With lockless listeners, we might process two ACKs at the
|
|
+ * same time. With TCP, inet_csk_complete_hashdance takes care
|
|
+ * of this. But, for MPTCP this would be too late if we add
|
|
+ * this MPTCP-socket in the token table (new subflows might
|
|
+	 * come in and match on this socket here).
|
|
+ * So, we need to check if someone else already added the token
|
|
+ * and revert in that case. The other guy won the race...
|
|
+ */
|
|
+ if (mptcp_find_token(mpcb->mptcp_loc_token)) {
|
|
+ spin_unlock(&mptcp_tk_hashlock);
|
|
+ local_bh_enable();
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ goto err_insert_token;
|
|
+ }
|
|
+ __mptcp_hash_insert(meta_tp, mpcb->mptcp_loc_token);
|
|
+
|
|
+ spin_unlock(&mptcp_tk_hashlock);
|
|
+ local_bh_enable();
|
|
+ rcu_read_unlock();
|
|
+ }
|
|
+
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ if (meta_icsk->icsk_af_ops == &mptcp_v6_mapped) {
|
|
+ struct tcp6_sock *master_tp6 = (struct tcp6_sock *)master_sk;
|
|
+ struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk);
|
|
+
|
|
+ inet_sk(master_sk)->pinet6 = &master_tp6->inet6;
|
|
+
|
|
+ newnp = inet6_sk(master_sk);
|
|
+ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
|
|
+
|
|
+ newnp->ipv6_mc_list = NULL;
|
|
+ newnp->ipv6_ac_list = NULL;
|
|
+ newnp->ipv6_fl_list = NULL;
|
|
+ newnp->pktoptions = NULL;
|
|
+ newnp->opt = NULL;
|
|
+
|
|
+ newnp->rxopt.all = 0;
|
|
+ newnp->repflow = 0;
|
|
+ np->rxopt.all = 0;
|
|
+ np->repflow = 0;
|
|
+ } else if (meta_sk->sk_family == AF_INET6) {
|
|
+ struct tcp6_sock *master_tp6 = (struct tcp6_sock *)master_sk;
|
|
+ struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk);
|
|
+ struct ipv6_txoptions *opt;
|
|
+
|
|
+ inet_sk(master_sk)->pinet6 = &master_tp6->inet6;
|
|
+
|
|
+ /* The following heavily inspired from tcp_v6_syn_recv_sock() */
|
|
+ newnp = inet6_sk(master_sk);
|
|
+ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
|
|
+
|
|
+ newnp->ipv6_mc_list = NULL;
|
|
+ newnp->ipv6_ac_list = NULL;
|
|
+ newnp->ipv6_fl_list = NULL;
|
|
+ newnp->pktoptions = NULL;
|
|
+ newnp->opt = NULL;
|
|
+
|
|
+ newnp->rxopt.all = 0;
|
|
+ newnp->repflow = 0;
|
|
+ np->rxopt.all = 0;
|
|
+ np->repflow = 0;
|
|
+
|
|
+ opt = rcu_dereference(np->opt);
|
|
+ if (opt) {
|
|
+ opt = ipv6_dup_options(master_sk, opt);
|
|
+ RCU_INIT_POINTER(newnp->opt, opt);
|
|
+ }
|
|
+ inet_csk(master_sk)->icsk_ext_hdr_len = 0;
|
|
+ if (opt)
|
|
+ inet_csk(master_sk)->icsk_ext_hdr_len = opt->opt_nflen +
|
|
+ opt->opt_flen;
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ meta_tp->mptcp = NULL;
|
|
+
|
|
+ meta_tp->write_seq = (u32)snd_idsn;
|
|
+ meta_tp->snd_sml = meta_tp->write_seq;
|
|
+ meta_tp->snd_una = meta_tp->write_seq;
|
|
+ meta_tp->snd_nxt = meta_tp->write_seq;
|
|
+ meta_tp->pushed_seq = meta_tp->write_seq;
|
|
+ meta_tp->snd_up = meta_tp->write_seq;
|
|
+
|
|
+ if (rem_key_set)
|
|
+ mptcp_initialize_recv_vars(meta_tp, mpcb, remote_key);
|
|
+
|
|
+ meta_tp->snd_wnd = window;
|
|
+ meta_tp->retrans_stamp = 0; /* Set in tcp_connect() */
|
|
+
|
|
+ meta_tp->packets_out = 0;
|
|
+ meta_icsk->icsk_probes_out = 0;
|
|
+
|
|
+ rcu_assign_pointer(inet_sk(meta_sk)->inet_opt, NULL);
|
|
+
|
|
+ /* Set mptcp-pointers */
|
|
+ master_tp->mpcb = mpcb;
|
|
+ master_tp->meta_sk = meta_sk;
|
|
+ meta_tp->mpcb = mpcb;
|
|
+ meta_tp->meta_sk = meta_sk;
|
|
+
|
|
+ /* Initialize the queues */
|
|
+ master_tp->out_of_order_queue = RB_ROOT;
|
|
+ master_sk->tcp_rtx_queue = RB_ROOT;
|
|
+ INIT_LIST_HEAD(&master_tp->tsq_node);
|
|
+ INIT_LIST_HEAD(&master_tp->tsorted_sent_queue);
|
|
+
|
|
+ master_tp->fastopen_req = NULL;
|
|
+
|
|
+ master_sk->sk_tsq_flags = 0;
|
|
+ /* icsk_bind_hash inherited from the meta, but it will be properly set in
|
|
+ * mptcp_create_master_sk. Same operation is done in inet_csk_clone_lock.
|
|
+ */
|
|
+ inet_csk(master_sk)->icsk_bind_hash = NULL;
|
|
+
|
|
+	/* Init the accept_queue structure. We support a queue of 32 pending
+	 * connections; it does not need to be huge, since we only store
+	 * pending subflow creations here.
|
|
+ */
|
|
+ reqsk_queue_alloc(&meta_icsk->icsk_accept_queue);
|
|
+ meta_sk->sk_max_ack_backlog = 32;
|
|
+ meta_sk->sk_ack_backlog = 0;
|
|
+
|
|
+ if (!sock_flag(meta_sk, SOCK_MPTCP)) {
|
|
+ mptcp_enable_static_key_bh();
|
|
+ sock_set_flag(meta_sk, SOCK_MPTCP);
|
|
+ }
|
|
+
|
|
+ /* Redefine function-pointers as the meta-sk is now fully ready */
|
|
+ meta_tp->mpc = 1;
|
|
+ meta_tp->ops = &mptcp_meta_specific;
|
|
+
|
|
+ meta_sk->sk_backlog_rcv = mptcp_backlog_rcv;
|
|
+ meta_sk->sk_destruct = mptcp_sock_destruct;
|
|
+
|
|
+ /* Meta-level retransmit timer */
|
|
+ meta_icsk->icsk_rto *= 2; /* Double of initial - rto */
|
|
+
|
|
+ tcp_init_xmit_timers(master_sk);
|
|
+ /* Has been set for sending out the SYN */
|
|
+ inet_csk_clear_xmit_timer(meta_sk, ICSK_TIME_RETRANS);
|
|
+
|
|
+ mptcp_mpcb_inherit_sockopts(meta_sk, master_sk);
|
|
+
|
|
+ mptcp_init_path_manager(mpcb);
|
|
+ mptcp_init_scheduler(mpcb);
|
|
+
|
|
+ if (!try_module_get(inet_csk(master_sk)->icsk_ca_ops->owner))
|
|
+ tcp_assign_congestion_control(master_sk);
|
|
+
|
|
+ master_tp->saved_syn = NULL;
|
|
+
|
|
+ mptcp_debug("%s: created mpcb with token %#x\n",
|
|
+ __func__, mpcb->mptcp_loc_token);
|
|
+
|
|
+ return 0;
|
|
+
|
|
+err_insert_token:
|
|
+ kmem_cache_free(mptcp_cb_cache, mpcb);
|
|
+
|
|
+err_alloc_mpcb:
|
|
+ inet_sk(master_sk)->inet_opt = NULL;
|
|
+ master_sk->sk_state = TCP_CLOSE;
|
|
+ sock_orphan(master_sk);
|
|
+ bh_unlock_sock(master_sk);
|
|
+ sk_free(master_sk);
|
|
+
|
|
+err_alloc_master:
|
|
+ return -ENOBUFS;
|
|
+}
|
|
+
|
|
+/* Called without holding lock on mpcb */
|
|
+static u8 mptcp_set_new_pathindex(struct mptcp_cb *mpcb)
|
|
+{
|
|
+ int i;
|
|
+
|
|
+ /* Start at 1, because 0 is reserved for the meta-sk */
|
|
+ for (i = 1; i < sizeof(mpcb->path_index_bits) * 8; i++) {
|
|
+ if (!test_and_set_bit(i, &mpcb->path_index_bits))
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (i == sizeof(mpcb->path_index_bits) * 8)
|
|
+ return 0;
|
|
+ return i;
|
|
+}
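Path indices come from a bitmap in which bit 0 is reserved for the meta socket: the first clear bit above 0 is claimed with test_and_set_bit() and released again in mptcp_del_sock(). A user-space sketch of the same allocate/free scheme on a plain unsigned long, without the atomicity of the kernel bit ops:

#include <limits.h>

/* Bit 0 is reserved for the meta socket; returns 0 when the map is full. */
static unsigned int alloc_path_index(unsigned long *map)
{
	unsigned int i;

	for (i = 1; i < sizeof(*map) * CHAR_BIT; i++) {
		if (!(*map & (1UL << i))) {
			*map |= 1UL << i;
			return i;
		}
	}
	return 0;
}

static void free_path_index(unsigned long *map, unsigned int i)
{
	*map &= ~(1UL << i);	/* as done when the subflow is removed */
}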
|
|
+
|
|
+/* May be called without holding the meta-level lock */
|
|
+int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id,
|
|
+ gfp_t flags)
|
|
+{
|
|
+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+
|
|
+ tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, flags);
|
|
+ if (!tp->mptcp)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ tp->mptcp->path_index = mptcp_set_new_pathindex(mpcb);
|
|
+ /* No more space for more subflows? */
|
|
+ if (!tp->mptcp->path_index) {
|
|
+ kmem_cache_free(mptcp_sock_cache, tp->mptcp);
|
|
+ return -EPERM;
|
|
+ }
|
|
+
|
|
+ INIT_HLIST_NODE(&tp->mptcp->cb_list);
|
|
+
|
|
+ tp->mptcp->tp = tp;
|
|
+ tp->mpcb = mpcb;
|
|
+ tp->meta_sk = meta_sk;
|
|
+
|
|
+ if (!sock_flag(sk, SOCK_MPTCP)) {
|
|
+ mptcp_enable_static_key_bh();
|
|
+ sock_set_flag(sk, SOCK_MPTCP);
|
|
+ }
|
|
+
|
|
+ tp->mpc = 1;
|
|
+ tp->ops = &mptcp_sub_specific;
|
|
+
|
|
+ tp->mptcp->loc_id = loc_id;
|
|
+ tp->mptcp->rem_id = rem_id;
|
|
+ if (mpcb->sched_ops->init)
|
|
+ mpcb->sched_ops->init(sk);
|
|
+
|
|
+ /* The corresponding sock_put is in mptcp_sock_destruct(). It cannot be
|
|
+ * included in mptcp_del_sock(), because the mpcb must remain alive
|
|
+ * until the last subsocket is completely destroyed.
|
|
+ */
|
|
+ sock_hold(meta_sk);
|
|
+ refcount_inc(&mpcb->mpcb_refcnt);
|
|
+
|
|
+ spin_lock_bh(&mpcb->mpcb_list_lock);
|
|
+ hlist_add_head_rcu(&tp->mptcp->node, &mpcb->conn_list);
|
|
+ spin_unlock_bh(&mpcb->mpcb_list_lock);
|
|
+
|
|
+ tp->mptcp->attached = 1;
|
|
+
|
|
+ mptcp_sub_inherit_sockopts(meta_sk, sk);
|
|
+ INIT_DELAYED_WORK(&tp->mptcp->work, mptcp_sub_close_wq);
|
|
+
|
|
+ /* Properly inherit CC from the meta-socket */
|
|
+ mptcp_assign_congestion_control(sk);
|
|
+
|
|
+ /* As we successfully allocated the mptcp_tcp_sock, we have to
|
|
+ * change the function-pointers here (for sk_destruct to work correctly)
|
|
+ */
|
|
+ sk->sk_error_report = mptcp_sock_def_error_report;
|
|
+ sk->sk_data_ready = mptcp_data_ready;
|
|
+ sk->sk_write_space = mptcp_write_space;
|
|
+ sk->sk_state_change = mptcp_set_state;
|
|
+ sk->sk_destruct = mptcp_sock_destruct;
|
|
+
|
|
+ if (sk->sk_family == AF_INET)
|
|
+ mptcp_debug("%s: token %#x pi %d, src_addr:%pI4:%d dst_addr:%pI4:%d\n",
|
|
+ __func__ , mpcb->mptcp_loc_token,
|
|
+ tp->mptcp->path_index,
|
|
+ &((struct inet_sock *)tp)->inet_saddr,
|
|
+ ntohs(((struct inet_sock *)tp)->inet_sport),
|
|
+ &((struct inet_sock *)tp)->inet_daddr,
|
|
+ ntohs(((struct inet_sock *)tp)->inet_dport));
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ else
|
|
+ mptcp_debug("%s: token %#x pi %d, src_addr:%pI6:%d dst_addr:%pI6:%d\n",
|
|
+ __func__ , mpcb->mptcp_loc_token,
|
|
+ tp->mptcp->path_index, &inet6_sk(sk)->saddr,
|
|
+ ntohs(((struct inet_sock *)tp)->inet_sport),
|
|
+ &sk->sk_v6_daddr,
|
|
+ ntohs(((struct inet_sock *)tp)->inet_dport));
|
|
+#endif
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+void mptcp_del_sock(struct sock *sk)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct mptcp_cb *mpcb;
|
|
+
|
|
+ if (!tp->mptcp || !tp->mptcp->attached)
|
|
+ return;
|
|
+
|
|
+ mpcb = tp->mpcb;
|
|
+
|
|
+ if (mpcb->sched_ops->release)
|
|
+ mpcb->sched_ops->release(sk);
|
|
+
|
|
+ if (mpcb->pm_ops->delete_subflow)
|
|
+ mpcb->pm_ops->delete_subflow(sk);
|
|
+
|
|
+ mptcp_debug("%s: Removing subsock tok %#x pi:%d state %d is_meta? %d\n",
|
|
+ __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index,
|
|
+ sk->sk_state, is_meta_sk(sk));
|
|
+
|
|
+ spin_lock_bh(&mpcb->mpcb_list_lock);
|
|
+ hlist_del_init_rcu(&tp->mptcp->node);
|
|
+ spin_unlock_bh(&mpcb->mpcb_list_lock);
|
|
+
|
|
+ tp->mptcp->attached = 0;
|
|
+ mpcb->path_index_bits &= ~(1 << tp->mptcp->path_index);
|
|
+
|
|
+ if (!tcp_write_queue_empty(sk) || !tcp_rtx_queue_empty(sk))
|
|
+ mptcp_reinject_data(sk, 0);
|
|
+
|
|
+ if (is_master_tp(tp)) {
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+
|
|
+ if (meta_tp->record_master_info &&
|
|
+ !sock_flag(meta_sk, SOCK_DEAD)) {
|
|
+ mpcb->master_info = kmalloc(sizeof(*mpcb->master_info),
|
|
+ GFP_ATOMIC);
|
|
+
|
|
+ if (mpcb->master_info)
|
|
+ tcp_get_info(sk, mpcb->master_info, true);
|
|
+ }
|
|
+
|
|
+ mpcb->master_sk = NULL;
|
|
+ } else if (tp->mptcp->pre_established) {
|
|
+ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Updates the MPTCP-session based on path-manager information (e.g., addresses,
|
|
+ * low-prio flows,...).
|
|
+ */
|
|
+void mptcp_update_metasocket(const struct sock *meta_sk)
|
|
+{
|
|
+ if (tcp_sk(meta_sk)->mpcb->pm_ops->new_session)
|
|
+ tcp_sk(meta_sk)->mpcb->pm_ops->new_session(meta_sk);
|
|
+}
|
|
+
|
|
+/* Clean up the receive buffer for full frames taken by the user,
|
|
+ * then send an ACK if necessary. COPIED is the number of bytes
|
|
+ * tcp_recvmsg has given to the user so far, it speeds up the
|
|
+ * calculation of whether or not we must ACK for the sake of
|
|
+ * a window update.
|
|
+ * (inspired from tcp_cleanup_rbuf())
|
|
+ */
|
|
+void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied)
|
|
+{
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ bool recheck_rcv_window = false;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ __u32 rcv_window_now = 0;
|
|
+
|
|
+ if (copied > 0 && !(meta_sk->sk_shutdown & RCV_SHUTDOWN)) {
|
|
+ rcv_window_now = tcp_receive_window_now(meta_tp);
|
|
+
|
|
+ /* Optimize, __mptcp_select_window() is not cheap. */
|
|
+ if (2 * rcv_window_now <= meta_tp->window_clamp)
|
|
+ recheck_rcv_window = true;
|
|
+ }
|
|
+
|
|
+ mptcp_for_each_sub(meta_tp->mpcb, mptcp) {
|
|
+ struct sock *sk = mptcp_to_sock(mptcp);
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ const struct inet_connection_sock *icsk = inet_csk(sk);
|
|
+
|
|
+ if (!mptcp_sk_can_send_ack(sk))
|
|
+ continue;
|
|
+
|
|
+ if (!inet_csk_ack_scheduled(sk))
|
|
+ goto second_part;
|
|
+ /* Delayed ACKs frequently hit locked sockets during bulk
|
|
+ * receive.
|
|
+ */
|
|
+ if (icsk->icsk_ack.blocked ||
|
|
+ /* Once-per-two-segments ACK was not sent by tcp_input.c */
|
|
+ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
|
|
+ /* If this read emptied read buffer, we send ACK, if
|
|
+ * connection is not bidirectional, user drained
|
|
+ * receive buffer and there was a small segment
|
|
+ * in queue.
|
|
+ */
|
|
+ (copied > 0 &&
|
|
+ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
|
|
+ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
|
|
+ !icsk->icsk_ack.pingpong)) &&
|
|
+ !atomic_read(&meta_sk->sk_rmem_alloc))) {
|
|
+ tcp_send_ack(sk);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+second_part:
|
|
+ /* This here is the second part of tcp_cleanup_rbuf */
|
|
+ if (recheck_rcv_window) {
|
|
+ __u32 new_window = tp->ops->__select_window(sk);
|
|
+
|
|
+ /* Send ACK now, if this read freed lots of space
|
|
+ * in our buffer. Certainly, new_window is new window.
|
|
+ * We can advertise it now, if it is not less than
|
|
+ * current one.
|
|
+ * "Lots" means "at least twice" here.
|
|
+ */
|
|
+ if (new_window && new_window >= 2 * rcv_window_now)
|
|
+ tcp_send_ack(sk);
|
|
+ }
|
|
+ }
|
|
+}
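As in tcp_cleanup_rbuf(), a pure window update is only worth an immediate ACK when the freshly selected window is at least twice the currently advertised one. That test on its own, as a small helper (names are illustrative):

#include <stdbool.h>
#include <stdint.h>

/* "Lots of space" means the new window at least doubles the advertised one. */
static bool worth_window_update(uint32_t new_window, uint32_t rcv_window_now)
{
	return new_window != 0 && new_window >= 2 * rcv_window_now;
}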
|
|
+
|
|
+static int mptcp_sub_send_fin(struct sock *sk)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct sk_buff *skb = tcp_write_queue_tail(sk);
|
|
+ int mss_now;
|
|
+
|
|
+ /* Optimization, tack on the FIN if we have a queue of
|
|
+ * unsent frames. But be careful about outgoing SACKS
|
|
+ * and IP options.
|
|
+ */
|
|
+ mss_now = tcp_current_mss(sk);
|
|
+
|
|
+ if (tcp_send_head(sk) != NULL) {
|
|
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
|
|
+ TCP_SKB_CB(skb)->end_seq++;
|
|
+ tp->write_seq++;
|
|
+ } else {
|
|
+ skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_ATOMIC);
|
|
+ if (!skb)
|
|
+ return 1;
|
|
+
|
|
+ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
|
|
+ skb_reserve(skb, MAX_TCP_HEADER);
|
|
+ /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
|
|
+ tcp_init_nondata_skb(skb, tp->write_seq,
|
|
+ TCPHDR_ACK | TCPHDR_FIN);
|
|
+ tcp_queue_skb(sk, skb);
|
|
+ }
|
|
+ __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void mptcp_sub_close_doit(struct sock *sk)
|
|
+{
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+
|
|
+ if (sock_flag(sk, SOCK_DEAD))
|
|
+ return;
|
|
+
|
|
+ if (meta_sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) {
|
|
+ tp->closing = 1;
|
|
+ tcp_close(sk, 0);
|
|
+ } else if (tcp_close_state(sk)) {
|
|
+ sk->sk_shutdown |= SEND_SHUTDOWN;
|
|
+ tcp_send_fin(sk);
|
|
+ }
|
|
+}
|
|
+
|
|
+void mptcp_sub_close_wq(struct work_struct *work)
|
|
+{
|
|
+ struct tcp_sock *tp = container_of(work, struct mptcp_tcp_sock, work.work)->tp;
|
|
+ struct sock *sk = (struct sock *)tp;
|
|
+ struct mptcp_cb *mpcb = tp->mpcb;
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+
|
|
+ mutex_lock(&mpcb->mpcb_mutex);
|
|
+ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
|
|
+
|
|
+ mptcp_sub_close_doit(sk);
|
|
+
|
|
+ release_sock(meta_sk);
|
|
+ mutex_unlock(&mpcb->mpcb_mutex);
|
|
+ mptcp_mpcb_put(mpcb);
|
|
+ sock_put(sk);
|
|
+}
|
|
+
|
|
+void mptcp_sub_close(struct sock *sk, unsigned long delay)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct delayed_work *work = &tcp_sk(sk)->mptcp->work;
|
|
+
|
|
+ /* We are already closing - e.g., call from sock_def_error_report upon
|
|
+ * tcp_disconnect in tcp_close.
|
|
+ */
|
|
+ if (tp->closing)
|
|
+ return;
|
|
+
|
|
+ /* Work already scheduled ? */
|
|
+ if (work_pending(&work->work)) {
|
|
+ /* Work present - who will be first ? */
|
|
+ if (jiffies + delay > work->timer.expires)
|
|
+ return;
|
|
+
|
|
+ /* Try canceling - if it fails, work will be executed soon */
|
|
+ if (!cancel_delayed_work(work))
|
|
+ return;
|
|
+ sock_put(sk);
|
|
+ mptcp_mpcb_put(tp->mpcb);
|
|
+ }
|
|
+
|
|
+ if (!delay) {
|
|
+ unsigned char old_state = sk->sk_state;
|
|
+
|
|
+		/* We directly send the FIN, because it may take quite a long time
+		 * until the work-queue gets scheduled...
|
|
+ *
|
|
+ * If mptcp_sub_send_fin returns 1, it failed and thus we reset
|
|
+ * the old state so that tcp_close will finally send the fin
|
|
+ * in user-context.
|
|
+ */
|
|
+ if (!sk->sk_err && old_state != TCP_CLOSE &&
|
|
+ tcp_close_state(sk) && mptcp_sub_send_fin(sk)) {
|
|
+ if (old_state == TCP_ESTABLISHED)
|
|
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
|
|
+ sk->sk_state = old_state;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ sock_hold(sk);
|
|
+ refcount_inc(&tp->mpcb->mpcb_refcnt);
|
|
+ queue_delayed_work(mptcp_wq, work, delay);
|
|
+}
|
|
+
|
|
+void mptcp_sub_force_close(struct sock *sk)
|
|
+{
|
|
+	/* The below tcp_done may have freed the socket, if it is already dead.
|
|
+ * Thus, we are not allowed to access it afterwards. That's why
|
|
+ * we have to store the dead-state in this local variable.
|
|
+ */
|
|
+ int sock_is_dead = sock_flag(sk, SOCK_DEAD);
|
|
+
|
|
+ tcp_sk(sk)->mp_killed = 1;
|
|
+
|
|
+ if (sk->sk_state != TCP_CLOSE)
|
|
+ tcp_done(sk);
|
|
+
|
|
+ if (!sock_is_dead)
|
|
+ mptcp_sub_close(sk, 0);
|
|
+}
|
|
+EXPORT_SYMBOL(mptcp_sub_force_close);
|
|
+
|
|
+/* Update the meta-level send buffer (sk_sndbuf), based on the contributions
|
|
+ * of each subflow
|
|
+ */
|
|
+void mptcp_update_sndbuf(const struct tcp_sock *tp)
|
|
+{
|
|
+ struct sock *meta_sk = tp->meta_sk;
|
|
+ int new_sndbuf = 0, old_sndbuf = meta_sk->sk_sndbuf;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ mptcp_for_each_sub(tp->mpcb, mptcp) {
|
|
+ struct sock *sk = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (!mptcp_sk_can_send(sk))
|
|
+ continue;
|
|
+
|
|
+ new_sndbuf += sk->sk_sndbuf;
|
|
+
|
|
+ if (new_sndbuf > sock_net(meta_sk)->ipv4.sysctl_tcp_wmem[2] ||
|
|
+ new_sndbuf < 0) {
|
|
+ new_sndbuf = sock_net(meta_sk)->ipv4.sysctl_tcp_wmem[2];
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ meta_sk->sk_sndbuf = max(min(new_sndbuf,
|
|
+ sock_net(meta_sk)->ipv4.sysctl_tcp_wmem[2]),
|
|
+ meta_sk->sk_sndbuf);
|
|
+
|
|
+ /* The subflow's call to sk_write_space in tcp_new_space ends up in
|
|
+ * mptcp_write_space.
|
|
+ * It has nothing to do with waking up the application.
|
|
+ * So, we do it here.
|
|
+ */
|
|
+ if (old_sndbuf != meta_sk->sk_sndbuf)
|
|
+ meta_sk->sk_write_space(meta_sk);
|
|
+}
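The meta send buffer is recomputed as the sum of the sendable subflows' buffers, clamped to tcp_wmem[2] (with an overflow guard), and is only ever allowed to grow. The aggregation on its own, over a plain array of per-subflow buffer sizes (illustrative):

/* Sum the subflow send buffers, clamp at @wmem_max (also on overflow),
 * then never shrink below @cur_sndbuf - as in mptcp_update_sndbuf().
 */
static int aggregate_sndbuf(const int *sub_sndbuf, int nr_subflows,
			    int wmem_max, int cur_sndbuf)
{
	int new_sndbuf = 0;
	int i;

	for (i = 0; i < nr_subflows; i++) {
		new_sndbuf += sub_sndbuf[i];
		if (new_sndbuf > wmem_max || new_sndbuf < 0) {
			new_sndbuf = wmem_max;
			break;
		}
	}
	return new_sndbuf > cur_sndbuf ? new_sndbuf : cur_sndbuf;
}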
|
|
+
|
|
+/* Similar to: tcp_close */
|
|
+void mptcp_close(struct sock *meta_sk, long timeout)
|
|
+{
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ struct mptcp_cb *mpcb = meta_tp->mpcb;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct sk_buff *skb;
|
|
+ int data_was_unread = 0;
|
|
+ int state;
|
|
+
|
|
+ mptcp_debug("%s: Close of meta_sk with tok %#x\n",
|
|
+ __func__, mpcb->mptcp_loc_token);
|
|
+
|
|
+ WARN_ON(refcount_inc_not_zero(&mpcb->mpcb_refcnt) == 0);
|
|
+ mutex_lock(&mpcb->mpcb_mutex);
|
|
+ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
|
|
+
|
|
+ if (meta_tp->inside_tk_table)
|
|
+ /* Detach the mpcb from the token hashtable */
|
|
+ mptcp_hash_remove_bh(meta_tp);
|
|
+
|
|
+ meta_sk->sk_shutdown = SHUTDOWN_MASK;
|
|
+ /* We need to flush the recv. buffs. We do this only on the
|
|
+ * descriptor close, not protocol-sourced closes, because the
|
|
+ * reader process may not have drained the data yet!
|
|
+ */
|
|
+ while ((skb = __skb_dequeue(&meta_sk->sk_receive_queue)) != NULL) {
|
|
+ u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
|
|
+
|
|
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
|
|
+ len--;
|
|
+ data_was_unread += len;
|
|
+ __kfree_skb(skb);
|
|
+ }
|
|
+
|
|
+ sk_mem_reclaim(meta_sk);
|
|
+
|
|
+ /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
|
|
+ if (meta_sk->sk_state == TCP_CLOSE) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct hlist_node *tmp;
|
|
+
|
|
+ mptcp_for_each_sub_safe(mpcb, mptcp, tmp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (tcp_sk(sk_it)->send_mp_fclose)
|
|
+ continue;
|
|
+ mptcp_sub_close(sk_it, 0);
|
|
+ }
|
|
+ goto adjudge_to_death;
|
|
+ }
|
|
+
|
|
+ if (data_was_unread) {
|
|
+ /* Unread data was tossed, zap the connection. */
|
|
+ NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPABORTONCLOSE);
|
|
+ tcp_set_state(meta_sk, TCP_CLOSE);
|
|
+ tcp_sk(meta_sk)->ops->send_active_reset(meta_sk,
|
|
+ meta_sk->sk_allocation);
|
|
+ } else if (sock_flag(meta_sk, SOCK_LINGER) && !meta_sk->sk_lingertime) {
|
|
+ /* Check zero linger _after_ checking for unread data. */
|
|
+ meta_sk->sk_prot->disconnect(meta_sk, 0);
|
|
+ NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
|
|
+ } else if (tcp_close_state(meta_sk)) {
|
|
+ mptcp_send_fin(meta_sk);
|
|
+ } else if (meta_tp->snd_una == meta_tp->write_seq) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct hlist_node *tmp;
|
|
+
|
|
+ /* The DATA_FIN has been sent and acknowledged
|
|
+ * (e.g., by sk_shutdown). Close all the other subflows
|
|
+ */
|
|
+ mptcp_for_each_sub_safe(mpcb, mptcp, tmp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+ unsigned long delay = 0;
|
|
+ /* If we are the passive closer, don't trigger
|
|
+			 * the subflow-FIN until the peer has sent its FIN on the
+			 * subflow - thus we add a delay
|
|
+ */
|
|
+ if (mpcb->passive_close &&
|
|
+ sk_it->sk_state == TCP_ESTABLISHED)
|
|
+ delay = inet_csk(sk_it)->icsk_rto << 3;
|
|
+
|
|
+ mptcp_sub_close(sk_it, delay);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ sk_stream_wait_close(meta_sk, timeout);
|
|
+
|
|
+adjudge_to_death:
|
|
+ state = meta_sk->sk_state;
|
|
+ sock_hold(meta_sk);
|
|
+ sock_orphan(meta_sk);
|
|
+
|
|
+ /* socket will be freed after mptcp_close - we have to prevent
|
|
+ * access from the subflows.
|
|
+ */
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+
|
|
+ /* Similar to sock_orphan, but we don't set it DEAD, because
|
|
+ * the callbacks are still set and must be called.
|
|
+ */
|
|
+ write_lock_bh(&sk_it->sk_callback_lock);
|
|
+ sk_set_socket(sk_it, NULL);
|
|
+ sk_it->sk_wq = NULL;
|
|
+ write_unlock_bh(&sk_it->sk_callback_lock);
|
|
+ }
|
|
+
|
|
+ if (mpcb->pm_ops->close_session)
|
|
+ mpcb->pm_ops->close_session(meta_sk);
|
|
+
|
|
+ /* It is the last release_sock in its life. It will remove backlog. */
|
|
+ release_sock(meta_sk);
|
|
+
|
|
+ /* Now socket is owned by kernel and we acquire BH lock
|
|
+ * to finish close. No need to check for user refs.
|
|
+ */
|
|
+ local_bh_disable();
|
|
+ bh_lock_sock(meta_sk);
|
|
+ WARN_ON(sock_owned_by_user(meta_sk));
|
|
+
|
|
+ percpu_counter_inc(meta_sk->sk_prot->orphan_count);
|
|
+
|
|
+ /* Have we already been destroyed by a softirq or backlog? */
|
|
+ if (state != TCP_CLOSE && meta_sk->sk_state == TCP_CLOSE)
|
|
+ goto out;
|
|
+
|
|
+ /* This is a (useful) BSD violating of the RFC. There is a
|
|
+ * problem with TCP as specified in that the other end could
|
|
+ * keep a socket open forever with no application left this end.
|
|
+ * We use a 3 minute timeout (about the same as BSD) then kill
|
|
+ * our end. If they send after that then tough - BUT: long enough
|
|
+ * that we won't make the old 4*rto = almost no time - whoops
|
|
+ * reset mistake.
|
|
+ *
|
|
+ * Nope, it was not mistake. It is really desired behaviour
|
|
+ * f.e. on http servers, when such sockets are useless, but
|
|
+ * consume significant resources. Let's do it with special
|
|
+ * linger2 option. --ANK
|
|
+ */
|
|
+
|
|
+ if (meta_sk->sk_state == TCP_FIN_WAIT2) {
|
|
+ if (meta_tp->linger2 < 0) {
|
|
+ tcp_set_state(meta_sk, TCP_CLOSE);
|
|
+ meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC);
|
|
+ __NET_INC_STATS(sock_net(meta_sk),
|
|
+ LINUX_MIB_TCPABORTONLINGER);
|
|
+ } else {
|
|
+ const int tmo = tcp_fin_time(meta_sk);
|
|
+
|
|
+ if (tmo > TCP_TIMEWAIT_LEN) {
|
|
+ inet_csk_reset_keepalive_timer(meta_sk,
|
|
+ tmo - TCP_TIMEWAIT_LEN);
|
|
+ } else {
|
|
+ meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2,
|
|
+ tmo);
|
|
+ goto out;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ if (meta_sk->sk_state != TCP_CLOSE) {
|
|
+ sk_mem_reclaim(meta_sk);
|
|
+ if (tcp_check_oom(meta_sk, 0)) {
|
|
+ if (net_ratelimit())
|
|
+ pr_info("MPTCP: out of memory: force closing socket\n");
|
|
+ tcp_set_state(meta_sk, TCP_CLOSE);
|
|
+ meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC);
|
|
+ __NET_INC_STATS(sock_net(meta_sk),
|
|
+ LINUX_MIB_TCPABORTONMEMORY);
|
|
+ }
|
|
+ }
|
|
+
|
|
+
|
|
+ if (meta_sk->sk_state == TCP_CLOSE)
|
|
+ inet_csk_destroy_sock(meta_sk);
|
|
+ /* Otherwise, socket is reprieved until protocol close. */
|
|
+
|
|
+out:
|
|
+ bh_unlock_sock(meta_sk);
|
|
+ local_bh_enable();
|
|
+ mutex_unlock(&mpcb->mpcb_mutex);
|
|
+ mptcp_mpcb_put(mpcb);
|
|
+ sock_put(meta_sk); /* Taken by sock_hold */
|
|
+}
|
|
+
|
|
+void mptcp_disconnect(struct sock *meta_sk)
|
|
+{
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct hlist_node *tmp;
|
|
+
|
|
+ __skb_queue_purge(&meta_tp->mpcb->reinject_queue);
|
|
+
|
|
+ if (meta_tp->inside_tk_table)
|
|
+ mptcp_hash_remove_bh(meta_tp);
|
|
+
|
|
+ local_bh_disable();
|
|
+ mptcp_for_each_sub_safe(meta_tp->mpcb, mptcp, tmp) {
|
|
+ struct sock *subsk = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (spin_is_locked(&subsk->sk_lock.slock))
|
|
+ bh_unlock_sock(subsk);
|
|
+
|
|
+ tcp_sk(subsk)->tcp_disconnect = 1;
|
|
+
|
|
+ meta_sk->sk_prot->disconnect(subsk, O_NONBLOCK);
|
|
+
|
|
+ sock_orphan(subsk);
|
|
+
|
|
+ percpu_counter_inc(meta_sk->sk_prot->orphan_count);
|
|
+
|
|
+ inet_csk_destroy_sock(subsk);
|
|
+ }
|
|
+ local_bh_enable();
|
|
+
|
|
+ mptcp_mpcb_cleanup(meta_tp->mpcb);
|
|
+ meta_tp->meta_sk = NULL;
|
|
+
|
|
+ meta_tp->send_mp_fclose = 0;
|
|
+ meta_tp->mpc = 0;
|
|
+ meta_tp->ops = &tcp_specific;
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ if (meta_sk->sk_family == AF_INET6)
|
|
+ meta_sk->sk_backlog_rcv = tcp_v6_do_rcv;
|
|
+ else
|
|
+ meta_sk->sk_backlog_rcv = tcp_v4_do_rcv;
|
|
+#else
|
|
+ meta_sk->sk_backlog_rcv = tcp_v4_do_rcv;
|
|
+#endif
|
|
+ meta_sk->sk_destruct = inet_sock_destruct;
|
|
+}
|
|
+
|
|
+
|
|
+/* Returns True if we should enable MPTCP for that socket. */
|
|
+bool mptcp_doit(struct sock *sk)
|
|
+{
|
|
+ const struct dst_entry *dst = __sk_dst_get(sk);
|
|
+
|
|
+ /* Don't do mptcp over loopback */
|
|
+ if (sk->sk_family == AF_INET &&
|
|
+ (ipv4_is_loopback(inet_sk(sk)->inet_daddr) ||
|
|
+ ipv4_is_loopback(inet_sk(sk)->inet_saddr)))
|
|
+ return false;
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ if (sk->sk_family == AF_INET6 &&
|
|
+ (ipv6_addr_loopback(&sk->sk_v6_daddr) ||
|
|
+ ipv6_addr_loopback(&inet6_sk(sk)->saddr)))
|
|
+ return false;
|
|
+#endif
|
|
+ if (mptcp_v6_is_v4_mapped(sk) &&
|
|
+ ipv4_is_loopback(inet_sk(sk)->inet_saddr))
|
|
+ return false;
|
|
+
|
|
+#ifdef CONFIG_TCP_MD5SIG
|
|
+ /* If TCP_MD5SIG is enabled, do not do MPTCP - there is no Option-Space */
|
|
+ if (tcp_sk(sk)->af_specific->md5_lookup(sk, sk))
|
|
+ return false;
|
|
+#endif
|
|
+
|
|
+ if (dst->dev && (dst->dev->flags & IFF_NOMULTIPATH))
|
|
+ return false;
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key,
|
|
+ int rem_key_set, __u8 mptcp_ver, u32 window)
|
|
+{
|
|
+ struct tcp_sock *master_tp;
|
|
+ struct sock *master_sk;
|
|
+
|
|
+ if (mptcp_alloc_mpcb(meta_sk, remote_key, rem_key_set, mptcp_ver, window))
|
|
+ goto err_alloc_mpcb;
|
|
+
|
|
+ master_sk = tcp_sk(meta_sk)->mpcb->master_sk;
|
|
+ master_tp = tcp_sk(master_sk);
|
|
+
|
|
+ if (mptcp_add_sock(meta_sk, master_sk, 0, 0, GFP_ATOMIC))
|
|
+ goto err_add_sock;
|
|
+
|
|
+ if (__inet_inherit_port(meta_sk, master_sk) < 0)
|
|
+ goto err_add_sock;
|
|
+
|
|
+ meta_sk->sk_prot->unhash(meta_sk);
|
|
+ inet_ehash_nolisten(master_sk, NULL);
|
|
+
|
|
+ master_tp->mptcp->init_rcv_wnd = master_tp->rcv_wnd;
|
|
+
|
|
+ return 0;
|
|
+
|
|
+err_add_sock:
|
|
+ inet_csk_prepare_forced_close(master_sk);
|
|
+ tcp_done(master_sk);
|
|
+
|
|
+err_alloc_mpcb:
|
|
+ return -ENOBUFS;
|
|
+}
|
|
+
|
|
+static int __mptcp_check_req_master(struct sock *child,
|
|
+ const struct mptcp_options_received *mopt,
|
|
+ struct request_sock *req)
|
|
+{
|
|
+ struct tcp_sock *child_tp = tcp_sk(child);
|
|
+ struct sock *meta_sk = child;
|
|
+ struct mptcp_cb *mpcb;
|
|
+ struct mptcp_request_sock *mtreq;
|
|
+
|
|
+ /* Never contained an MP_CAPABLE */
|
|
+ if (!inet_rsk(req)->mptcp_rqsk)
|
|
+ return 1;
|
|
+
|
|
+ mtreq = mptcp_rsk(req);
|
|
+
|
|
+ if (!inet_rsk(req)->saw_mpc) {
|
|
+ /* Fallback to regular TCP, because we saw one SYN without
|
|
+ * MP_CAPABLE. In tcp_check_req we continue the regular path.
|
|
+ * But, the socket has been added to the reqsk_tk_htb, so we
|
|
+ * must still remove it.
|
|
+ */
|
|
+ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
|
|
+ mptcp_reqsk_remove_tk(req);
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
+ /* mopt can be NULL when coming from FAST-OPEN */
|
|
+ if (mopt && mopt->saw_mpc && mtreq->mptcp_ver == MPTCP_VERSION_1) {
|
|
+ mtreq->mptcp_rem_key = mopt->mptcp_sender_key;
|
|
+ mtreq->rem_key_set = 1;
|
|
+ }
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
|
|
+
|
|
+ /* Just set this values to pass them to mptcp_alloc_mpcb */
|
|
+ child_tp->mptcp_loc_key = mtreq->mptcp_loc_key;
|
|
+ child_tp->mptcp_loc_token = mtreq->mptcp_loc_token;
|
|
+
|
|
+ if (mptcp_create_master_sk(meta_sk, mtreq->mptcp_rem_key,
|
|
+ mtreq->rem_key_set, mtreq->mptcp_ver,
|
|
+ child_tp->snd_wnd)) {
|
|
+ inet_csk_prepare_forced_close(meta_sk);
|
|
+ tcp_done(meta_sk);
|
|
+
|
|
+ return -ENOBUFS;
|
|
+ }
|
|
+
|
|
+ child = tcp_sk(child)->mpcb->master_sk;
|
|
+ child_tp = tcp_sk(child);
|
|
+ mpcb = child_tp->mpcb;
|
|
+
|
|
+ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
|
|
+ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
|
|
+
|
|
+ mpcb->dss_csum = mtreq->dss_csum;
|
|
+ mpcb->server_side = 1;
|
|
+
|
|
+ /* Needs to be done here additionally, because when accepting a
|
|
+ * new connection we pass by __reqsk_free and not reqsk_free.
|
|
+ */
|
|
+ mptcp_reqsk_remove_tk(req);
|
|
+
|
|
+ /* Hold when creating the meta-sk in tcp_vX_syn_recv_sock. */
|
|
+ sock_put(meta_sk);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req)
|
|
+{
|
|
+ struct sock *meta_sk = child, *master_sk;
|
|
+ struct sk_buff *skb;
|
|
+ u32 new_mapping;
|
|
+ int ret;
|
|
+
|
|
+ ret = __mptcp_check_req_master(child, NULL, req);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ master_sk = tcp_sk(meta_sk)->mpcb->master_sk;
|
|
+
|
|
+ /* We need to rewind copied_seq as it is set to IDSN + 1 and as we have
|
|
+ * pre-MPTCP data in the receive queue.
|
|
+ */
|
|
+ tcp_sk(meta_sk)->copied_seq -= tcp_sk(master_sk)->rcv_nxt -
|
|
+ tcp_rsk(req)->rcv_isn - 1;
|
|
+
|
|
+ /* Map subflow sequence number to data sequence numbers. We need to map
|
|
+ * these data to [IDSN - len - 1, IDSN[.
|
|
+ */
|
|
+ new_mapping = tcp_sk(meta_sk)->copied_seq - tcp_rsk(req)->rcv_isn - 1;
|
|
+
|
|
+ /* There should be only one skb: the SYN + data. */
|
|
+ skb_queue_walk(&meta_sk->sk_receive_queue, skb) {
|
|
+ TCP_SKB_CB(skb)->seq += new_mapping;
|
|
+ TCP_SKB_CB(skb)->end_seq += new_mapping;
|
|
+ }
|
|
+
|
|
+ /* With fastopen we change the semantics of the relative subflow
|
|
+ * sequence numbers to deal with middleboxes that could add/remove
|
|
+ * multiple bytes in the SYN. We chose to start counting at rcv_nxt - 1
|
|
+ * instead of the regular TCP ISN.
|
|
+ */
|
|
+ tcp_sk(master_sk)->mptcp->rcv_isn = tcp_sk(master_sk)->rcv_nxt - 1;
|
|
+
|
|
+ /* We need to update copied_seq of the master_sk to account for the
|
|
+ * already moved data to the meta receive queue.
|
|
+ */
|
|
+ tcp_sk(master_sk)->copied_seq = tcp_sk(master_sk)->rcv_nxt;
|
|
+
|
|
+ /* Handled by the master_sk */
|
|
+ tcp_sk(meta_sk)->fastopen_rsk = NULL;
|
|
+
|
|
+ return 0;
|
|
+}
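With TCP Fast Open, data received before the MPTCP handshake finished has to be retrofitted into data-sequence space just below the initial meta sequence: copied_seq is rewound by the amount of queued data and every queued skb is shifted by (copied_seq - rcv_isn - 1). A small arithmetic example with made-up sequence values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t meta_copied_seq = 1000;	/* meta copied_seq right after setup */
	uint32_t rcv_isn         = 5000;	/* subflow ISN chosen by the peer */
	uint32_t rcv_nxt         = 5021;	/* ISN + SYN + 20 bytes of TFO data */

	/* Rewind by the amount of pre-MPTCP data already received. */
	uint32_t copied_seq  = meta_copied_seq - (rcv_nxt - rcv_isn - 1);
	/* Offset added to every queued skb's seq/end_seq. */
	uint32_t new_mapping = copied_seq - rcv_isn - 1;

	/* The first queued byte had subflow seq rcv_isn + 1; remapped it
	 * lands on copied_seq (980), so the 20 TFO bytes end exactly where
	 * the post-handshake meta sequence space (1000) begins.
	 */
	printf("copied_seq=%u, first byte maps to %u\n",
	       (unsigned)copied_seq, (unsigned)(rcv_isn + 1 + new_mapping));
	return 0;
}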
|
|
+
|
|
+int mptcp_check_req_master(struct sock *sk, struct sock *child,
|
|
+ struct request_sock *req, const struct sk_buff *skb,
|
|
+ const struct mptcp_options_received *mopt,
|
|
+ int drop, u32 tsoff)
|
|
+{
|
|
+ struct sock *meta_sk = child;
|
|
+ int ret;
|
|
+
|
|
+ ret = __mptcp_check_req_master(child, mopt, req);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ child = tcp_sk(child)->mpcb->master_sk;
|
|
+
|
|
+ sock_rps_save_rxhash(child, skb);
|
|
+
|
|
+ /* drop indicates that we come from tcp_check_req and thus need to
|
|
+ * handle the request-socket fully.
|
|
+ */
|
|
+ if (drop) {
|
|
+ tcp_synack_rtt_meas(child, req);
|
|
+
|
|
+ inet_csk_reqsk_queue_drop(sk, req);
|
|
+ reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
|
|
+ if (!inet_csk_reqsk_queue_add(sk, req, meta_sk)) {
|
|
+ bh_unlock_sock(meta_sk);
|
|
+ /* No sock_put() of the meta needed. The reference has
|
|
+ * already been dropped in __mptcp_check_req_master().
|
|
+ */
|
|
+ sock_put(child);
|
|
+ return -1;
|
|
+ }
|
|
+ } else {
|
|
+ /* Thus, we come from syn-cookies */
|
|
+ refcount_set(&req->rsk_refcnt, 1);
|
|
+ tcp_sk(meta_sk)->tsoffset = tsoff;
|
|
+ if (!inet_csk_reqsk_queue_add(sk, req, meta_sk)) {
|
|
+ bh_unlock_sock(meta_sk);
|
|
+ /* No sock_put() of the meta needed. The reference has
|
|
+ * already been dropped in __mptcp_check_req_master().
|
|
+ */
|
|
+ sock_put(child);
|
|
+ reqsk_put(req);
|
|
+ return -1;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* May be called without holding the meta-level lock */
|
|
+struct sock *mptcp_check_req_child(struct sock *meta_sk,
|
|
+ struct sock *child,
|
|
+ struct request_sock *req,
|
|
+ struct sk_buff *skb,
|
|
+ const struct mptcp_options_received *mopt)
|
|
+{
|
|
+ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
|
|
+ struct tcp_sock *child_tp = tcp_sk(child);
|
|
+ u8 hash_mac_check[SHA256_DIGEST_SIZE];
|
|
+
|
|
+ if (!mopt->join_ack) {
|
|
+ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINACKFAIL);
|
|
+ goto teardown;
|
|
+ }
|
|
+
|
|
+ mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_rem_key,
|
|
+ (u8 *)&mpcb->mptcp_loc_key, hash_mac_check, 2,
|
|
+ 4, (u8 *)&mtreq->mptcp_rem_nonce,
|
|
+ 4, (u8 *)&mtreq->mptcp_loc_nonce);
|
|
+
|
|
+ if (memcmp(hash_mac_check, (char *)&mopt->mptcp_recv_mac, 20)) {
|
|
+ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINACKMAC);
|
|
+ goto teardown;
|
|
+ }
|
|
+
|
|
+ /* Point it to the same struct socket and wq as the meta_sk */
|
|
+ sk_set_socket(child, meta_sk->sk_socket);
|
|
+ child->sk_wq = meta_sk->sk_wq;
|
|
+
|
|
+ if (mptcp_add_sock(meta_sk, child, mtreq->loc_id, mtreq->rem_id, GFP_ATOMIC)) {
|
|
+ /* Has been inherited, but now child_tp->mptcp is NULL */
|
|
+ child_tp->mpc = 0;
|
|
+ child_tp->ops = &tcp_specific;
|
|
+
|
|
+ /* TODO when we support acking the third ack for new subflows,
|
|
+ * we should silently discard this third ack, by returning NULL.
|
|
+ *
|
|
+ * Maybe, at the retransmission we will have enough memory to
|
|
+ * fully add the socket to the meta-sk.
|
|
+ */
|
|
+ goto teardown;
|
|
+ }
|
|
+
|
|
+ /* The child is a clone of the meta socket, we must now reset
|
|
+ * some of the fields
|
|
+ */
|
|
+ child_tp->mptcp->rcv_low_prio = mtreq->rcv_low_prio;
|
|
+
|
|
+ /* We should allow proper increase of the snd/rcv-buffers. Thus, we
|
|
+ * use the original values instead of the bloated up ones from the
|
|
+ * clone.
|
|
+ */
|
|
+ child->sk_sndbuf = mpcb->orig_sk_sndbuf;
|
|
+ child->sk_rcvbuf = mpcb->orig_sk_rcvbuf;
|
|
+
|
|
+ child_tp->mptcp->slave_sk = 1;
|
|
+ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
|
|
+ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
|
|
+ child_tp->mptcp->init_rcv_wnd = req->rsk_rcv_wnd;
|
|
+
|
|
+ child->sk_tsq_flags = 0;
|
|
+
|
|
+ child_tp->packets_out = 0;
|
|
+
|
|
+ tcp_reset_vars(child);
|
|
+
|
|
+ sock_rps_save_rxhash(child, skb);
|
|
+ tcp_synack_rtt_meas(child, req);
|
|
+
|
|
+ if (mpcb->pm_ops->established_subflow)
|
|
+ mpcb->pm_ops->established_subflow(child);
|
|
+
|
|
+ /* Subflows do not use the accept queue, as they
|
|
+ * are attached immediately to the mpcb.
|
|
+ */
|
|
+ inet_csk_reqsk_queue_drop(meta_sk, req);
|
|
+ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req);
|
|
+
|
|
+ /* The refcnt is initialized to 2, because regular TCP will put it
|
|
+ * in the socket's listener queue. However, we do not have a listener-queue.
|
|
+ * So, we need to make sure that this request-sock indeed gets destroyed.
|
|
+ */
|
|
+ reqsk_put(req);
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINACKRX);
|
|
+
|
|
+ if (inet_sk(child)->inet_sport != inet_sk(meta_sk)->inet_sport)
|
|
+ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINALTERNATEPORT);
|
|
+
|
|
+ return child;
|
|
+
|
|
+teardown:
|
|
+ req->rsk_ops->send_reset(meta_sk, skb);
|
|
+
|
|
+ /* Drop this request - sock creation failed. */
|
|
+ inet_csk_reqsk_queue_drop(meta_sk, req);
|
|
+ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req);
|
|
+ inet_csk_prepare_forced_close(child);
|
|
+ tcp_done(child);
|
|
+ bh_unlock_sock(meta_sk);
|
|
+ return meta_sk;
|
|
+}
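
The HMAC check in mptcp_check_req_child() follows the RFC 6824 convention for the MP_JOIN third ACK: the key is the peer's 64-bit key concatenated with the local key, the message is the peer's nonce concatenated with the local nonce, and (for MPTCP v0) the full 160-bit HMAC-SHA1 is compared; v1 moves to SHA-256, which is why the buffers above are SHA256_DIGEST_SIZE. Below is a hedged userspace sketch of that layout using OpenSSL, purely illustrative and not the kernel's mptcp_hmac() implementation; the helper name is made up.

#include <string.h>
#include <stdint.h>
#include <openssl/evp.h>
#include <openssl/hmac.h>

/* Illustrative only: validate a 160-bit MP_JOIN ACK HMAC (MPTCP v0 style).
 * rem_key/loc_key are the 64-bit keys from MP_CAPABLE, rem_nonce/loc_nonce
 * the 32-bit random numbers exchanged in the MP_JOIN SYN and SYN/ACK.
 */
static int join_ack_hmac_ok(uint64_t rem_key, uint64_t loc_key,
			    uint32_t rem_nonce, uint32_t loc_nonce,
			    const uint8_t *recv_mac /* 20 bytes */)
{
	uint8_t key[16], msg[8], mac[20];
	unsigned int mac_len = sizeof(mac);

	memcpy(key, &rem_key, 8);	/* Key-peer || Key-local */
	memcpy(key + 8, &loc_key, 8);
	memcpy(msg, &rem_nonce, 4);	/* R-peer || R-local */
	memcpy(msg + 4, &loc_nonce, 4);

	HMAC(EVP_sha1(), key, sizeof(key), msg, sizeof(msg), mac, &mac_len);
	return memcmp(mac, recv_mac, 20) == 0;
}
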
|
|
+
|
|
+int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw)
|
|
+{
|
|
+ struct mptcp_tw *mptw;
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct mptcp_cb *mpcb = tp->mpcb;
|
|
+
|
|
+ /* A subsocket in tw can only receive data. So, if we are in
|
|
+ * infinite-receive, then we should not reply with a data-ack or act
|
|
+ * upon general MPTCP-signaling. We prevent this by simply not creating
|
|
+ * the mptcp_tw_sock.
|
|
+ */
|
|
+ if (mpcb->infinite_mapping_rcv) {
|
|
+ tw->mptcp_tw = NULL;
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ /* Alloc MPTCP-tw-sock */
|
|
+ mptw = kmem_cache_alloc(mptcp_tw_cache, GFP_ATOMIC);
|
|
+ if (!mptw) {
|
|
+ tw->mptcp_tw = NULL;
|
|
+ return -ENOBUFS;
|
|
+ }
|
|
+
|
|
+ refcount_inc(&mpcb->mpcb_refcnt);
|
|
+
|
|
+ tw->mptcp_tw = mptw;
|
|
+ mptw->loc_key = mpcb->mptcp_loc_key;
|
|
+ mptw->meta_tw = mpcb->in_time_wait;
|
|
+ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp));
|
|
+ if (mptw->meta_tw && mpcb->mptw_state != TCP_TIME_WAIT)
|
|
+ mptw->rcv_nxt++;
|
|
+ rcu_assign_pointer(mptw->mpcb, mpcb);
|
|
+
|
|
+ spin_lock_bh(&mpcb->mpcb_list_lock);
|
|
+ list_add_rcu(&mptw->list, &tp->mpcb->tw_list);
|
|
+ mptw->in_list = 1;
|
|
+ spin_unlock_bh(&mpcb->mpcb_list_lock);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+void mptcp_twsk_destructor(struct tcp_timewait_sock *tw)
|
|
+{
|
|
+ struct mptcp_cb *mpcb;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ local_bh_disable();
|
|
+ mpcb = rcu_dereference(tw->mptcp_tw->mpcb);
|
|
+
|
|
+ /* If we are still holding a ref to the mpcb, we have to remove ourself
|
|
+ * from the list and drop the ref properly.
|
|
+ */
|
|
+ if (mpcb && refcount_inc_not_zero(&mpcb->mpcb_refcnt)) {
|
|
+ spin_lock(&mpcb->mpcb_list_lock);
|
|
+ if (tw->mptcp_tw->in_list) {
|
|
+ list_del_rcu(&tw->mptcp_tw->list);
|
|
+ tw->mptcp_tw->in_list = 0;
|
|
+ /* Put, because we added it to the list */
|
|
+ mptcp_mpcb_put(mpcb);
|
|
+ }
|
|
+ spin_unlock(&mpcb->mpcb_list_lock);
|
|
+
|
|
+ /* Second time, because we increased it above */
|
|
+ mptcp_mpcb_put(mpcb);
|
|
+ }
|
|
+
|
|
+ local_bh_enable();
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ kmem_cache_free(mptcp_tw_cache, tw->mptcp_tw);
|
|
+}
|
|
+
|
|
+/* Updates the rcv_nxt of the time-wait-socks and allows them to ack a
|
|
+ * data-fin.
|
|
+ */
|
|
+void mptcp_time_wait(struct sock *meta_sk, int state, int timeo)
|
|
+{
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ struct mptcp_tw *mptw;
|
|
+
|
|
+ if (mptcp_in_infinite_mapping_weak(meta_tp->mpcb)) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct hlist_node *tmp;
|
|
+
|
|
+ mptcp_for_each_sub_safe(meta_tp->mpcb, mptcp, tmp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (sk_it->sk_state == TCP_CLOSE)
|
|
+ continue;
|
|
+
|
|
+ tcp_sk(sk_it)->ops->time_wait(sk_it, state, timeo);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Used for sockets that go into tw after the meta
|
|
+ * (see mptcp_init_tw_sock())
|
|
+ */
|
|
+ meta_tp->mpcb->in_time_wait = 1;
|
|
+ meta_tp->mpcb->mptw_state = state;
|
|
+
|
|
+ /* Update the time-wait-sock's information */
|
|
+ rcu_read_lock();
|
|
+ local_bh_disable();
|
|
+ list_for_each_entry_rcu(mptw, &meta_tp->mpcb->tw_list, list) {
|
|
+ mptw->meta_tw = 1;
|
|
+ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(meta_tp);
|
|
+
|
|
+ /* We want to ack a DATA_FIN, but are still in FIN_WAIT_2 -
|
|
+ * pretend as if the DATA_FIN has already reached us, that way
|
|
+ * the checks in tcp_timewait_state_process will be good as the
|
|
+ * DATA_FIN comes in.
|
|
+ */
|
|
+ if (state != TCP_TIME_WAIT)
|
|
+ mptw->rcv_nxt++;
|
|
+ }
|
|
+ local_bh_enable();
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ if (meta_sk->sk_state != TCP_CLOSE)
|
|
+ tcp_done(meta_sk);
|
|
+}
|
|
+
|
|
+void mptcp_tsq_flags(struct sock *sk)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+
|
|
+ /* It will be handled as a regular deferred-call */
|
|
+ if (is_meta_sk(sk))
|
|
+ return;
|
|
+
|
|
+ if (hlist_unhashed(&tp->mptcp->cb_list)) {
|
|
+ hlist_add_head(&tp->mptcp->cb_list, &tp->mpcb->callback_list);
|
|
+ /* We need to hold it here, as the sock_hold is not assured
|
|
+ * by the release_sock as it is done in regular TCP.
|
|
+ *
|
|
+ * The subsocket may get inet_csk_destroy'd while it is inside
|
|
+ * the callback_list.
|
|
+ */
|
|
+ sock_hold(sk);
|
|
+ }
|
|
+
|
|
+ if (!test_and_set_bit(MPTCP_SUB_DEFERRED, &meta_sk->sk_tsq_flags))
|
|
+ sock_hold(meta_sk);
|
|
+}
|
|
+
|
|
+void mptcp_tsq_sub_deferred(struct sock *meta_sk)
|
|
+{
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct hlist_node *tmp;
|
|
+
|
|
+ __sock_put(meta_sk);
|
|
+ hlist_for_each_entry_safe(mptcp, tmp, &meta_tp->mpcb->callback_list, cb_list) {
|
|
+ struct tcp_sock *tp = mptcp->tp;
|
|
+ struct sock *sk = (struct sock *)tp;
|
|
+
|
|
+ hlist_del_init(&mptcp->cb_list);
|
|
+ sk->sk_prot->release_cb(sk);
|
|
+ /* Final sock_put (cfr. mptcp_tsq_flags) */
|
|
+ sock_put(sk);
|
|
+ }
|
|
+}
|
|
+
|
|
+/* May be called without holding the meta-level lock */
|
|
+void mptcp_join_reqsk_init(const struct mptcp_cb *mpcb,
|
|
+ const struct request_sock *req,
|
|
+ struct sk_buff *skb)
|
|
+{
|
|
+ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
|
|
+ u8 mptcp_hash_mac[SHA256_DIGEST_SIZE];
|
|
+ struct mptcp_options_received mopt;
|
|
+
|
|
+ mptcp_init_mp_opt(&mopt);
|
|
+ tcp_parse_mptcp_options(skb, &mopt);
|
|
+
|
|
+ mtreq->is_sub = 1;
|
|
+ inet_rsk(req)->mptcp_rqsk = 1;
|
|
+
|
|
+ mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce;
|
|
+
|
|
+ mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_loc_key,
|
|
+ (u8 *)&mpcb->mptcp_rem_key, mptcp_hash_mac, 2,
|
|
+ 4, (u8 *)&mtreq->mptcp_loc_nonce,
|
|
+ 4, (u8 *)&mtreq->mptcp_rem_nonce);
|
|
+ mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac;
|
|
+
|
|
+ mtreq->rem_id = mopt.rem_id;
|
|
+ mtreq->rcv_low_prio = mopt.low_prio;
|
|
+ inet_rsk(req)->saw_mpc = 1;
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(mpcb->meta_sk), MPTCP_MIB_JOINSYNRX);
|
|
+}
|
|
+
|
|
+void mptcp_reqsk_init(struct request_sock *req, const struct sock *sk,
|
|
+ const struct sk_buff *skb, bool want_cookie)
|
|
+{
|
|
+ struct mptcp_options_received mopt;
|
|
+ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
|
|
+
|
|
+ mptcp_init_mp_opt(&mopt);
|
|
+ tcp_parse_mptcp_options(skb, &mopt);
|
|
+
|
|
+ mtreq->dss_csum = mopt.dss_csum;
|
|
+
|
|
+ if (want_cookie) {
|
|
+ if (!mptcp_reqsk_new_cookie(req, sk, &mopt, skb))
|
|
+ /* No key available - back to regular TCP */
|
|
+ inet_rsk(req)->mptcp_rqsk = 0;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ mptcp_reqsk_new_mptcp(req, sk, &mopt, skb);
|
|
+}
|
|
+
|
|
+void mptcp_cookies_reqsk_init(struct request_sock *req,
|
|
+ struct mptcp_options_received *mopt,
|
|
+ struct sk_buff *skb)
|
|
+{
|
|
+ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
|
|
+
|
|
+ /* Absolutely need to always initialize this. */
|
|
+ mtreq->hash_entry.pprev = NULL;
|
|
+
|
|
+ mtreq->mptcp_ver = mopt->mptcp_ver;
|
|
+ mtreq->mptcp_rem_key = mopt->mptcp_sender_key;
|
|
+ mtreq->mptcp_loc_key = mopt->mptcp_receiver_key;
|
|
+ mtreq->rem_key_set = 1;
|
|
+
|
|
+ /* Generate the token */
|
|
+ mptcp_key_hash(mtreq->mptcp_ver, mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL);
|
|
+
|
|
+ rcu_read_lock();
|
|
+ local_bh_disable();
|
|
+ spin_lock(&mptcp_tk_hashlock);
|
|
+
|
|
+ /* Check if the token is still free */
|
|
+ if (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) ||
|
|
+ mptcp_find_token(mtreq->mptcp_loc_token))
|
|
+ goto out;
|
|
+
|
|
+ inet_rsk(req)->saw_mpc = 1;
|
|
+ mtreq->is_sub = 0;
|
|
+ inet_rsk(req)->mptcp_rqsk = 1;
|
|
+ mtreq->dss_csum = mopt->dss_csum;
|
|
+
|
|
+out:
|
|
+ spin_unlock(&mptcp_tk_hashlock);
|
|
+ local_bh_enable();
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+
|
|
+int mptcp_conn_request(struct sock *sk, struct sk_buff *skb)
|
|
+{
|
|
+ struct mptcp_options_received mopt;
|
|
+
|
|
+ mptcp_init_mp_opt(&mopt);
|
|
+ tcp_parse_mptcp_options(skb, &mopt);
|
|
+
|
|
+ if (mopt.is_mp_join)
|
|
+ return mptcp_do_join_short(skb, &mopt, sock_net(sk));
|
|
+ if (mopt.drop_me)
|
|
+ goto drop;
|
|
+
|
|
+ if (!sock_flag(sk, SOCK_MPTCP))
|
|
+ mopt.saw_mpc = 0;
|
|
+
|
|
+ /* If the requested version is higher than what we support, fall back */
|
|
+ if (mopt.saw_mpc && mopt.mptcp_ver > tcp_sk(sk)->mptcp_ver)
|
|
+ mopt.saw_mpc = 0;
|
|
+
|
|
+ if (skb->protocol == htons(ETH_P_IP)) {
|
|
+ if (mopt.saw_mpc) {
|
|
+ if (skb_rtable(skb)->rt_flags &
|
|
+ (RTCF_BROADCAST | RTCF_MULTICAST))
|
|
+ goto drop;
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVE);
|
|
+ return tcp_conn_request(&mptcp_request_sock_ops,
|
|
+ &mptcp_request_sock_ipv4_ops,
|
|
+ sk, skb);
|
|
+ }
|
|
+
|
|
+ return tcp_v4_conn_request(sk, skb);
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ } else {
|
|
+ if (mopt.saw_mpc) {
|
|
+ if (!ipv6_unicast_destination(skb))
|
|
+ goto drop;
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVE);
|
|
+ return tcp_conn_request(&mptcp6_request_sock_ops,
|
|
+ &mptcp_request_sock_ipv6_ops,
|
|
+ sk, skb);
|
|
+ }
|
|
+
|
|
+ return tcp_v6_conn_request(sk, skb);
|
|
+#endif
|
|
+ }
|
|
+drop:
|
|
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+int mptcp_finish_handshake(struct sock *child, struct sk_buff *skb)
|
|
+ __releases(&child->sk_lock.slock)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ /* We don't call tcp_child_process here, because we hold
|
|
+ * already the meta-sk-lock and are sure that it is not owned
|
|
+ * by the user.
|
|
+ */
|
|
+ tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
|
|
+ ret = tcp_rcv_state_process(child, skb);
|
|
+ bh_unlock_sock(child);
|
|
+ sock_put(child);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static void __mptcp_get_info(const struct sock *meta_sk,
|
|
+ struct mptcp_meta_info *info)
|
|
+{
|
|
+ const struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
|
|
+ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ u32 now = tcp_jiffies32;
|
|
+
|
|
+ memset(info, 0, sizeof(*info));
|
|
+
|
|
+ info->mptcpi_state = meta_sk->sk_state;
|
|
+ info->mptcpi_retransmits = meta_icsk->icsk_retransmits;
|
|
+ info->mptcpi_probes = meta_icsk->icsk_probes_out;
|
|
+ info->mptcpi_backoff = meta_icsk->icsk_backoff;
|
|
+
|
|
+ info->mptcpi_rto = jiffies_to_usecs(meta_icsk->icsk_rto);
|
|
+
|
|
+ info->mptcpi_unacked = meta_tp->packets_out;
|
|
+
|
|
+ info->mptcpi_last_data_sent = jiffies_to_msecs(now - meta_tp->lsndtime);
|
|
+ info->mptcpi_last_data_recv = jiffies_to_msecs(now - meta_icsk->icsk_ack.lrcvtime);
|
|
+ info->mptcpi_last_ack_recv = jiffies_to_msecs(now - meta_tp->rcv_tstamp);
|
|
+
|
|
+ info->mptcpi_total_retrans = meta_tp->total_retrans;
|
|
+
|
|
+ info->mptcpi_bytes_acked = meta_tp->bytes_acked;
|
|
+ info->mptcpi_bytes_received = meta_tp->bytes_received;
|
|
+}
|
|
+
|
|
+static void mptcp_get_sub_info(struct sock *sk, struct mptcp_sub_info *info)
|
|
+{
|
|
+ struct inet_sock *inet = inet_sk(sk);
|
|
+
|
|
+ memset(info, 0, sizeof(*info));
|
|
+
|
|
+ if (sk->sk_family == AF_INET) {
|
|
+ info->src_v4.sin_family = AF_INET;
|
|
+ info->src_v4.sin_port = inet->inet_sport;
|
|
+
|
|
+ info->src_v4.sin_addr.s_addr = inet->inet_rcv_saddr;
|
|
+ if (!info->src_v4.sin_addr.s_addr)
|
|
+ info->src_v4.sin_addr.s_addr = inet->inet_saddr;
|
|
+
|
|
+ info->dst_v4.sin_family = AF_INET;
|
|
+ info->dst_v4.sin_port = inet->inet_dport;
|
|
+ info->dst_v4.sin_addr.s_addr = inet->inet_daddr;
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ } else {
|
|
+ struct ipv6_pinfo *np = inet6_sk(sk);
|
|
+
|
|
+ info->src_v6.sin6_family = AF_INET6;
|
|
+ info->src_v6.sin6_port = inet->inet_sport;
|
|
+
|
|
+ if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
|
|
+ info->src_v6.sin6_addr = np->saddr;
|
|
+ else
|
|
+ info->src_v6.sin6_addr = sk->sk_v6_rcv_saddr;
|
|
+
|
|
+ info->dst_v6.sin6_family = AF_INET6;
|
|
+ info->dst_v6.sin6_port = inet->inet_dport;
|
|
+ info->dst_v6.sin6_addr = sk->sk_v6_daddr;
|
|
+#endif
|
|
+ }
|
|
+}
|
|
+
|
|
+int mptcp_get_info(const struct sock *meta_sk, char __user *optval, int optlen)
|
|
+{
|
|
+ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+
|
|
+ struct mptcp_meta_info meta_info;
|
|
+ struct mptcp_info m_info;
|
|
+
|
|
+ unsigned int info_len;
|
|
+
|
|
+ /* Check again with the lock held */
|
|
+ if (!mptcp(meta_tp))
|
|
+ return -EINVAL;
|
|
+
|
|
+ if (copy_from_user(&m_info, optval, optlen))
|
|
+ return -EFAULT;
|
|
+
|
|
+ if (m_info.meta_info) {
|
|
+ unsigned int len;
|
|
+
|
|
+ __mptcp_get_info(meta_sk, &meta_info);
|
|
+
|
|
+ /* Need to set this in case the user thinks that mptcp_meta_info is bigger than ours */
|
|
+ len = min_t(unsigned int, m_info.meta_len, sizeof(meta_info));
|
|
+ m_info.meta_len = len;
|
|
+
|
|
+ if (copy_to_user((void __user *)m_info.meta_info, &meta_info, len))
|
|
+ return -EFAULT;
|
|
+ }
|
|
+
|
|
+ /* Need to set this in case the user thinks that tcp_info is bigger than ours */
|
|
+ info_len = min_t(unsigned int, m_info.tcp_info_len, sizeof(struct tcp_info));
|
|
+ m_info.tcp_info_len = info_len;
|
|
+
|
|
+ if (m_info.initial) {
|
|
+ struct mptcp_cb *mpcb = meta_tp->mpcb;
|
|
+
|
|
+ if (mpcb->master_sk) {
|
|
+ struct tcp_info info;
|
|
+
|
|
+ tcp_get_info(mpcb->master_sk, &info, true);
|
|
+ if (copy_to_user((void __user *)m_info.initial, &info, info_len))
|
|
+ return -EFAULT;
|
|
+ } else if (meta_tp->record_master_info && mpcb->master_info) {
|
|
+ if (copy_to_user((void __user *)m_info.initial, mpcb->master_info, info_len))
|
|
+ return -EFAULT;
|
|
+ } else {
|
|
+ return meta_tp->record_master_info ? -ENOMEM : -EINVAL;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (m_info.subflows) {
|
|
+ unsigned int len, sub_len = 0;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ char __user *ptr;
|
|
+
|
|
+ ptr = (char __user *)m_info.subflows;
|
|
+ len = m_info.sub_len;
|
|
+
|
|
+ mptcp_for_each_sub(meta_tp->mpcb, mptcp) {
|
|
+ struct tcp_info t_info;
|
|
+ unsigned int tmp_len;
|
|
+
|
|
+ tcp_get_info(mptcp_to_sock(mptcp), &t_info, true);
|
|
+
|
|
+ tmp_len = min_t(unsigned int, len, info_len);
|
|
+ len -= tmp_len;
|
|
+
|
|
+ if (copy_to_user(ptr, &t_info, tmp_len))
|
|
+ return -EFAULT;
|
|
+
|
|
+ ptr += tmp_len;
|
|
+ sub_len += tmp_len;
|
|
+
|
|
+ if (len == 0)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ m_info.sub_len = sub_len;
|
|
+ }
|
|
+
|
|
+ if (m_info.subflow_info) {
|
|
+ unsigned int len, sub_info_len, total_sub_info_len = 0;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ char __user *ptr;
|
|
+
|
|
+ ptr = (char __user *)m_info.subflow_info;
|
|
+ len = m_info.total_sub_info_len;
|
|
+
|
|
+ sub_info_len = min_t(unsigned int, m_info.sub_info_len,
|
|
+ sizeof(struct mptcp_sub_info));
|
|
+ m_info.sub_info_len = sub_info_len;
|
|
+
|
|
+ mptcp_for_each_sub(meta_tp->mpcb, mptcp) {
|
|
+ struct mptcp_sub_info m_sub_info;
|
|
+ unsigned int tmp_len;
|
|
+
|
|
+ mptcp_get_sub_info(mptcp_to_sock(mptcp), &m_sub_info);
|
|
+
|
|
+ tmp_len = min_t(unsigned int, len, sub_info_len);
|
|
+ len -= tmp_len;
|
|
+
|
|
+ if (copy_to_user(ptr, &m_sub_info, tmp_len))
|
|
+ return -EFAULT;
|
|
+
|
|
+ ptr += tmp_len;
|
|
+ total_sub_info_len += tmp_len;
|
|
+
|
|
+ if (len == 0)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ m_info.total_sub_info_len = total_sub_info_len;
|
|
+ }
|
|
+
|
|
+ if (copy_to_user(optval, &m_info, optlen))
|
|
+ return -EFAULT;
|
|
+
|
|
+ return 0;
|
|
+}
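
From user space, the natural consumer of mptcp_get_info() is a getsockopt() call that hands the kernel a struct mptcp_info filled with buffer pointers for the meta-level, initial-subflow and per-subflow data. The sketch below is an assumption about how such a caller could look: the option name MPTCP_INFO, the SOL_TCP level, and the exact field types of struct mptcp_info / struct mptcp_meta_info / struct mptcp_sub_info are taken from the out-of-tree MPTCP UAPI headers, which are not part of this hunk (the field names do match the m_info accesses above).

/* Sketch: query MPTCP connection info on socket fd. Assumes the MPTCP
 * UAPI headers defining MPTCP_INFO and the structs are installed, and
 * that the buffer fields are user pointers.
 */
#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

#define MAX_SUBFLOWS 8

static int query_mptcp_info(int fd)
{
	struct mptcp_meta_info meta;
	struct tcp_info initial, sub_tcp[MAX_SUBFLOWS];
	struct mptcp_sub_info sub_addr[MAX_SUBFLOWS];
	struct mptcp_info minfo;
	socklen_t optlen = sizeof(minfo);

	memset(&minfo, 0, sizeof(minfo));
	minfo.tcp_info_len = sizeof(struct tcp_info);
	minfo.meta_len = sizeof(meta);
	minfo.meta_info = &meta;
	minfo.initial = &initial;
	minfo.sub_len = sizeof(sub_tcp);
	minfo.subflows = sub_tcp;
	minfo.sub_info_len = sizeof(struct mptcp_sub_info);
	minfo.total_sub_info_len = sizeof(sub_addr);
	minfo.subflow_info = sub_addr;

	if (getsockopt(fd, SOL_TCP, MPTCP_INFO, &minfo, &optlen) < 0)
		return -1;

	/* sub_len is written back as the number of tcp_info bytes copied */
	printf("meta state %u, %u subflow tcp_info bytes returned\n",
	       meta.mptcpi_state, minfo.sub_len);
	return 0;
}
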
|
|
+
|
|
+void mptcp_clear_sk(struct sock *sk, int size)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+
|
|
+ /* we do not want to clear tk_table field, because of RCU lookups */
|
|
+ sk_prot_clear_nulls(sk, offsetof(struct tcp_sock, tk_table.next));
|
|
+
|
|
+ size -= offsetof(struct tcp_sock, tk_table.pprev);
|
|
+ memset((char *)&tp->tk_table.pprev, 0, size);
|
|
+}
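
The trick used in mptcp_clear_sk(), zeroing a structure while skipping one embedded member that RCU readers may still walk, is plain offsetof() arithmetic around that member. A minimal standalone illustration of the technique, on a hypothetical struct rather than the kernel's tcp_sock:

#include <stdio.h>
#include <string.h>
#include <stddef.h>

struct node { struct node *next; };

struct conn {
	int a, b;
	struct node link;	/* must survive the clear, like tk_table */
	int c, d;
};

/* Zero everything in *c except the embedded 'link' member. */
static void conn_clear(struct conn *c)
{
	memset(c, 0, offsetof(struct conn, link));
	memset((char *)c + offsetof(struct conn, link) + sizeof(c->link), 0,
	       sizeof(*c) - offsetof(struct conn, link) - sizeof(c->link));
}

int main(void)
{
	struct conn c = { 1, 2, { &c.link }, 3, 4 };

	conn_clear(&c);
	/* a and c are zeroed, link.next still points at itself */
	printf("a=%d link=%p c=%d\n", c.a, (void *)c.link.next, c.c);
	return 0;
}
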
|
|
+
|
|
+static const struct snmp_mib mptcp_snmp_list[] = {
|
|
+ SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE),
|
|
+ SNMP_MIB_ITEM("MPCapableSYNTX", MPTCP_MIB_MPCAPABLEACTIVE),
|
|
+ SNMP_MIB_ITEM("MPCapableSYNACKRX", MPTCP_MIB_MPCAPABLEACTIVEACK),
|
|
+ SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK),
|
|
+ SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK),
|
|
+ SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK),
|
|
+ SNMP_MIB_ITEM("MPCapableRetransFallback", MPTCP_MIB_MPCAPABLERETRANSFALLBACK),
|
|
+ SNMP_MIB_ITEM("MPTCPCsumEnabled", MPTCP_MIB_CSUMENABLED),
|
|
+ SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS),
|
|
+ SNMP_MIB_ITEM("MPFailRX", MPTCP_MIB_MPFAILRX),
|
|
+ SNMP_MIB_ITEM("MPCsumFail", MPTCP_MIB_CSUMFAIL),
|
|
+ SNMP_MIB_ITEM("MPFastcloseRX", MPTCP_MIB_FASTCLOSERX),
|
|
+ SNMP_MIB_ITEM("MPFastcloseTX", MPTCP_MIB_FASTCLOSETX),
|
|
+ SNMP_MIB_ITEM("MPFallbackAckSub", MPTCP_MIB_FBACKSUB),
|
|
+ SNMP_MIB_ITEM("MPFallbackAckInit", MPTCP_MIB_FBACKINIT),
|
|
+ SNMP_MIB_ITEM("MPFallbackDataSub", MPTCP_MIB_FBDATASUB),
|
|
+ SNMP_MIB_ITEM("MPFallbackDataInit", MPTCP_MIB_FBDATAINIT),
|
|
+ SNMP_MIB_ITEM("MPRemoveAddrSubDelete", MPTCP_MIB_REMADDRSUB),
|
|
+ SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN),
|
|
+ SNMP_MIB_ITEM("MPJoinAlreadyFallenback", MPTCP_MIB_JOINFALLBACK),
|
|
+ SNMP_MIB_ITEM("MPJoinSynTx", MPTCP_MIB_JOINSYNTX),
|
|
+ SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX),
|
|
+ SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX),
|
|
+ SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC),
|
|
+ SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX),
|
|
+ SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC),
|
|
+ SNMP_MIB_ITEM("MPJoinAckMissing", MPTCP_MIB_JOINACKFAIL),
|
|
+ SNMP_MIB_ITEM("MPJoinAckRTO", MPTCP_MIB_JOINACKRTO),
|
|
+ SNMP_MIB_ITEM("MPJoinAckRexmit", MPTCP_MIB_JOINACKRXMIT),
|
|
+ SNMP_MIB_ITEM("NoDSSInWindow", MPTCP_MIB_NODSSWINDOW),
|
|
+ SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH),
|
|
+ SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX),
|
|
+ SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH),
|
|
+ SNMP_MIB_ITEM("DSSTrimHead", MPTCP_MIB_DSSTRIMHEAD),
|
|
+ SNMP_MIB_ITEM("DSSSplitTail", MPTCP_MIB_DSSSPLITTAIL),
|
|
+ SNMP_MIB_ITEM("DSSPurgeOldSubSegs", MPTCP_MIB_PURGEOLD),
|
|
+ SNMP_MIB_ITEM("AddAddrRx", MPTCP_MIB_ADDADDRRX),
|
|
+ SNMP_MIB_ITEM("AddAddrTx", MPTCP_MIB_ADDADDRTX),
|
|
+ SNMP_MIB_ITEM("RemAddrRx", MPTCP_MIB_REMADDRRX),
|
|
+ SNMP_MIB_ITEM("RemAddrTx", MPTCP_MIB_REMADDRTX),
|
|
+ SNMP_MIB_ITEM("MPJoinAlternatePort", MPTCP_MIB_JOINALTERNATEPORT),
|
|
+ SNMP_MIB_ITEM("MPCurrEstab", MPTCP_MIB_CURRESTAB),
|
|
+ SNMP_MIB_SENTINEL
|
|
+};
|
|
+
|
|
+struct workqueue_struct *mptcp_wq;
|
|
+EXPORT_SYMBOL(mptcp_wq);
|
|
+
|
|
+/* Output /proc/net/mptcp_net/mptcp */
|
|
+static int mptcp_pm_seq_show(struct seq_file *seq, void *v)
|
|
+{
|
|
+ struct tcp_sock *meta_tp;
|
|
+ const struct net *net = seq->private;
|
|
+ unsigned int i, n = 0;
|
|
+
|
|
+ seq_printf(seq, " sl loc_tok rem_tok v6 local_address remote_address st ns tx_queue rx_queue inode");
|
|
+ seq_putc(seq, '\n');
|
|
+
|
|
+ for (i = 0; i <= mptcp_tk_htable.mask; i++) {
|
|
+ struct hlist_nulls_node *node;
|
|
+ rcu_read_lock();
|
|
+ local_bh_disable();
|
|
+ hlist_nulls_for_each_entry_rcu(meta_tp, node,
|
|
+ &mptcp_tk_htable.hashtable[i],
|
|
+ tk_table) {
|
|
+ struct sock *meta_sk = (struct sock *)meta_tp;
|
|
+ struct inet_sock *isk = inet_sk(meta_sk);
|
|
+ struct mptcp_cb *mpcb = meta_tp->mpcb;
|
|
+
|
|
+ if (!mptcp(meta_tp) || !net_eq(net, sock_net(meta_sk)))
|
|
+ continue;
|
|
+
|
|
+ if (!mpcb)
|
|
+ continue;
|
|
+
|
|
+ if (capable(CAP_NET_ADMIN)) {
|
|
+ seq_printf(seq, "%4d: %04X %04X ", n++,
|
|
+ mpcb->mptcp_loc_token,
|
|
+ mpcb->mptcp_rem_token);
|
|
+ } else {
|
|
+ seq_printf(seq, "%4d: %04X %04X ", n++, -1, -1);
|
|
+ }
|
|
+ if (meta_sk->sk_family == AF_INET ||
|
|
+ mptcp_v6_is_v4_mapped(meta_sk)) {
|
|
+ seq_printf(seq, " 0 %08X:%04X %08X:%04X ",
|
|
+ isk->inet_rcv_saddr,
|
|
+ ntohs(isk->inet_sport),
|
|
+ isk->inet_daddr,
|
|
+ ntohs(isk->inet_dport));
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ } else if (meta_sk->sk_family == AF_INET6) {
|
|
+ struct in6_addr *src = &meta_sk->sk_v6_rcv_saddr;
|
|
+ struct in6_addr *dst = &meta_sk->sk_v6_daddr;
|
|
+ seq_printf(seq, " 1 %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X",
|
|
+ src->s6_addr32[0], src->s6_addr32[1],
|
|
+ src->s6_addr32[2], src->s6_addr32[3],
|
|
+ ntohs(isk->inet_sport),
|
|
+ dst->s6_addr32[0], dst->s6_addr32[1],
|
|
+ dst->s6_addr32[2], dst->s6_addr32[3],
|
|
+ ntohs(isk->inet_dport));
|
|
+#endif
|
|
+ }
|
|
+
|
|
+ seq_printf(seq, " %02X %02X %08X:%08X %lu",
|
|
+ meta_sk->sk_state, mptcp_subflow_count(mpcb),
|
|
+ meta_tp->write_seq - meta_tp->snd_una,
|
|
+ max_t(int, meta_tp->rcv_nxt -
|
|
+ meta_tp->copied_seq, 0),
|
|
+ sock_i_ino(meta_sk));
|
|
+ seq_putc(seq, '\n');
|
|
+ }
|
|
+
|
|
+ local_bh_enable();
|
|
+ rcu_read_unlock();
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
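
Given the proc entries registered further down (a "mptcp_net" directory under /proc/net holding "mptcp" and "snmp"), the table printed by this seq handler can be read like any other proc file. A minimal reader, assuming the /proc/net/mptcp_net/mptcp path that the registration code implies:

#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/net/mptcp_net/mptcp", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* one row per MPTCP connection */
	fclose(f);
	return 0;
}
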
|
|
+
|
|
+static int mptcp_snmp_seq_show(struct seq_file *seq, void *v)
|
|
+{
|
|
+ struct net *net = seq->private;
|
|
+ int i;
|
|
+
|
|
+ for (i = 0; mptcp_snmp_list[i].name != NULL; i++)
|
|
+ seq_printf(seq, "%-32s\t%ld\n", mptcp_snmp_list[i].name,
|
|
+ snmp_fold_field(net->mptcp.mptcp_statistics,
|
|
+ mptcp_snmp_list[i].entry));
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int mptcp_pm_init_net(struct net *net)
|
|
+{
|
|
+ net->mptcp.mptcp_statistics = alloc_percpu(struct mptcp_mib);
|
|
+ if (!net->mptcp.mptcp_statistics)
|
|
+ goto out_mptcp_mibs;
|
|
+
|
|
+#ifdef CONFIG_PROC_FS
|
|
+ net->mptcp.proc_net_mptcp = proc_net_mkdir(net, "mptcp_net", net->proc_net);
|
|
+ if (!net->mptcp.proc_net_mptcp)
|
|
+ goto out_proc_net_mptcp;
|
|
+ if (!proc_create_net_single("mptcp", S_IRUGO, net->mptcp.proc_net_mptcp,
|
|
+ mptcp_pm_seq_show, NULL))
|
|
+ goto out_mptcp_net_mptcp;
|
|
+ if (!proc_create_net_single("snmp", S_IRUGO, net->mptcp.proc_net_mptcp,
|
|
+ mptcp_snmp_seq_show, NULL))
|
|
+ goto out_mptcp_net_snmp;
|
|
+#endif
|
|
+
|
|
+ return 0;
|
|
+
|
|
+#ifdef CONFIG_PROC_FS
|
|
+out_mptcp_net_snmp:
|
|
+ remove_proc_entry("mptcp", net->mptcp.proc_net_mptcp);
|
|
+out_mptcp_net_mptcp:
|
|
+ remove_proc_subtree("mptcp_net", net->proc_net);
|
|
+ net->mptcp.proc_net_mptcp = NULL;
|
|
+out_proc_net_mptcp:
|
|
+ free_percpu(net->mptcp.mptcp_statistics);
|
|
+#endif
|
|
+out_mptcp_mibs:
|
|
+ return -ENOMEM;
|
|
+}
|
|
+
|
|
+static void mptcp_pm_exit_net(struct net *net)
|
|
+{
|
|
+ remove_proc_entry("snmp", net->mptcp.proc_net_mptcp);
|
|
+ remove_proc_entry("mptcp", net->mptcp.proc_net_mptcp);
|
|
+ remove_proc_subtree("mptcp_net", net->proc_net);
|
|
+ free_percpu(net->mptcp.mptcp_statistics);
|
|
+}
|
|
+
|
|
+static struct pernet_operations mptcp_pm_proc_ops = {
|
|
+ .init = mptcp_pm_init_net,
|
|
+ .exit = mptcp_pm_exit_net,
|
|
+};
|
|
+
|
|
+static unsigned long mptcp_htable_entries __initdata;
|
|
+
|
|
+static int __init set_mptcp_htable_entries(char *str)
|
|
+{
|
|
+ ssize_t ret;
|
|
+
|
|
+ if (!str)
|
|
+ return 0;
|
|
+
|
|
+ ret = kstrtoul(str, 0, &mptcp_htable_entries);
|
|
+ if (ret)
|
|
+ return 0;
|
|
+
|
|
+ return 1;
|
|
+}
|
|
+__setup("mptcp_htable_entries=", set_mptcp_htable_entries);
|
|
+
|
|
+/* General initialization of mptcp */
|
|
+void __init mptcp_init(void)
|
|
+{
|
|
+ unsigned int i;
|
|
+ struct ctl_table_header *mptcp_sysctl;
|
|
+
|
|
+ mptcp_sock_cache = kmem_cache_create("mptcp_sock",
|
|
+ sizeof(struct mptcp_tcp_sock),
|
|
+ 0, SLAB_HWCACHE_ALIGN,
|
|
+ NULL);
|
|
+ if (!mptcp_sock_cache)
|
|
+ goto mptcp_sock_cache_failed;
|
|
+
|
|
+ mptcp_cb_cache = kmem_cache_create("mptcp_cb", sizeof(struct mptcp_cb),
|
|
+ 0, SLAB_TYPESAFE_BY_RCU|SLAB_HWCACHE_ALIGN,
|
|
+ NULL);
|
|
+ if (!mptcp_cb_cache)
|
|
+ goto mptcp_cb_cache_failed;
|
|
+
|
|
+ mptcp_tw_cache = kmem_cache_create("mptcp_tw", sizeof(struct mptcp_tw),
|
|
+ 0, SLAB_TYPESAFE_BY_RCU|SLAB_HWCACHE_ALIGN,
|
|
+ NULL);
|
|
+ if (!mptcp_tw_cache)
|
|
+ goto mptcp_tw_cache_failed;
|
|
+
|
|
+ get_random_bytes(&mptcp_secret, sizeof(mptcp_secret));
|
|
+
|
|
+ mptcp_wq = alloc_workqueue("mptcp_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8);
|
|
+ if (!mptcp_wq)
|
|
+ goto alloc_workqueue_failed;
|
|
+
|
|
+ mptcp_tk_htable.hashtable =
|
|
+ alloc_large_system_hash("MPTCP tokens",
|
|
+ sizeof(mptcp_tk_htable.hashtable[0]),
|
|
+ mptcp_htable_entries,
|
|
+ 18, /* one slot per 256KB of memory */
|
|
+ 0,
|
|
+ NULL,
|
|
+ &mptcp_tk_htable.mask,
|
|
+ 1024,
|
|
+ mptcp_htable_entries ? 0 : 1024 * 1024);
|
|
+
|
|
+ for (i = 0; i <= mptcp_tk_htable.mask; i++)
|
|
+ INIT_HLIST_NULLS_HEAD(&mptcp_tk_htable.hashtable[i], i);
|
|
+
|
|
+ mptcp_reqsk_tk_htb.hashtable =
|
|
+ alloc_large_system_hash("MPTCP request tokens",
|
|
+ sizeof(mptcp_reqsk_tk_htb.hashtable[0]),
|
|
+ mptcp_htable_entries,
|
|
+ 18, /* one slot per 256KB of memory */
|
|
+ 0,
|
|
+ NULL,
|
|
+ &mptcp_reqsk_tk_htb.mask,
|
|
+ 1024,
|
|
+ mptcp_htable_entries ? 0 : 1024 * 1024);
|
|
+
|
|
+ for (i = 0; i <= mptcp_reqsk_tk_htb.mask; i++)
|
|
+ INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_tk_htb.hashtable[i], i);
|
|
+
|
|
+
|
|
+ spin_lock_init(&mptcp_tk_hashlock);
|
|
+
|
|
+ if (register_pernet_subsys(&mptcp_pm_proc_ops))
|
|
+ goto pernet_failed;
|
|
+
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ if (mptcp_pm_v6_init())
|
|
+ goto mptcp_pm_v6_failed;
|
|
+#endif
|
|
+ if (mptcp_pm_v4_init())
|
|
+ goto mptcp_pm_v4_failed;
|
|
+
|
|
+ mptcp_sysctl = register_net_sysctl(&init_net, "net/mptcp", mptcp_table);
|
|
+ if (!mptcp_sysctl)
|
|
+ goto register_sysctl_failed;
|
|
+
|
|
+ if (mptcp_register_path_manager(&mptcp_pm_default))
|
|
+ goto register_pm_failed;
|
|
+
|
|
+ if (mptcp_register_scheduler(&mptcp_sched_default))
|
|
+ goto register_sched_failed;
|
|
+
|
|
+ pr_info("MPTCP: Unstable branch");
|
|
+
|
|
+ mptcp_init_failed = false;
|
|
+
|
|
+ return;
|
|
+
|
|
+register_sched_failed:
|
|
+ mptcp_unregister_path_manager(&mptcp_pm_default);
|
|
+register_pm_failed:
|
|
+ unregister_net_sysctl_table(mptcp_sysctl);
|
|
+register_sysctl_failed:
|
|
+ mptcp_pm_v4_undo();
|
|
+mptcp_pm_v4_failed:
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ mptcp_pm_v6_undo();
|
|
+mptcp_pm_v6_failed:
|
|
+#endif
|
|
+ unregister_pernet_subsys(&mptcp_pm_proc_ops);
|
|
+pernet_failed:
|
|
+ destroy_workqueue(mptcp_wq);
|
|
+alloc_workqueue_failed:
|
|
+ kmem_cache_destroy(mptcp_tw_cache);
|
|
+mptcp_tw_cache_failed:
|
|
+ kmem_cache_destroy(mptcp_cb_cache);
|
|
+mptcp_cb_cache_failed:
|
|
+ kmem_cache_destroy(mptcp_sock_cache);
|
|
+mptcp_sock_cache_failed:
|
|
+ mptcp_init_failed = true;
|
|
+}
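
As a rough sizing note: when mptcp_htable_entries is left at 0, alloc_large_system_hash() derives the token-table bucket count from system memory using the scale of 18 passed above (one slot per 256 KiB), rounded to a power of two and bounded by the low limit of 1024 and the 1024*1024 maximum. The following back-of-the-envelope sketch only approximates that helper's behaviour for illustration; it is not its actual code.

#include <stdio.h>
#include <stdint.h>

/* Approximate the token-hashtable sizing: one bucket per 2^scale bytes,
 * rounded to a power of two (direction glossed over) and clamped to
 * [low, high].
 */
static uint64_t approx_buckets(uint64_t mem_bytes, unsigned int scale,
			       uint64_t low, uint64_t high)
{
	uint64_t n = mem_bytes >> scale;
	uint64_t pow2 = 1;

	while (pow2 * 2 <= n)
		pow2 *= 2;
	if (pow2 < low)
		pow2 = low;
	if (pow2 > high)
		pow2 = high;
	return pow2;
}

int main(void)
{
	/* e.g. 1 GiB of RAM -> 4096 buckets, 16 GiB -> 65536 buckets */
	printf("%llu\n", (unsigned long long)approx_buckets(1ULL << 30, 18, 1024, 1024 * 1024));
	printf("%llu\n", (unsigned long long)approx_buckets(1ULL << 34, 18, 1024, 1024 * 1024));
	return 0;
}
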
|
|
diff --git a/net/mptcp/mptcp_ecf.c b/net/mptcp/mptcp_ecf.c
|
|
new file mode 100644
|
|
index 000000000000..6b976b2b0c72
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mptcp_ecf.c
|
|
@@ -0,0 +1,195 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+/* MPTCP ECF Scheduler
|
|
+ *
|
|
+ * Algorithm Design:
|
|
+ * Yeon-sup Lim <ylim@cs.umass.edu>
|
|
+ * Don Towsley <towsley@cs.umass.edu>
|
|
+ * Erich M. Nahum <nahum@us.ibm.com>
|
|
+ * Richard J. Gibbens <richard.gibbens@cl.cam.ac.uk>
|
|
+ *
|
|
+ * Initial Implementation:
|
|
+ * Yeon-sup Lim <ylim@cs.umass.edu>
|
|
+ *
|
|
+ * Additional Authors:
|
|
+ * Daniel Weber <weberd@cs.uni-bonn.de>
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU General Public License
|
|
+ * as published by the Free Software Foundation; either version
|
|
+ * 2 of the License, or (at your option) any later version.
|
|
+ */
|
|
+
|
|
+#include <linux/module.h>
|
|
+#include <net/mptcp.h>
|
|
+
|
|
+static unsigned int mptcp_ecf_r_beta __read_mostly = 4; /* beta = 1/r_beta = 0.25 */
|
|
+module_param(mptcp_ecf_r_beta, int, 0644);
|
|
+MODULE_PARM_DESC(mptcp_ecf_r_beta, "beta for ECF");
|
|
+
|
|
+struct ecfsched_priv {
|
|
+ u32 last_rbuf_opti;
|
|
+};
|
|
+
|
|
+struct ecfsched_cb {
|
|
+ u32 switching_margin; /* this is "waiting" in algorithm description */
|
|
+};
|
|
+
|
|
+static struct ecfsched_priv *ecfsched_get_priv(const struct tcp_sock *tp)
|
|
+{
|
|
+ return (struct ecfsched_priv *)&tp->mptcp->mptcp_sched[0];
|
|
+}
|
|
+
|
|
+static struct ecfsched_cb *ecfsched_get_cb(const struct tcp_sock *tp)
|
|
+{
|
|
+ return (struct ecfsched_cb *)&tp->mpcb->mptcp_sched[0];
|
|
+}
|
|
+
|
|
+/* This is the ECF scheduler. This function decides on which flow to send
|
|
+ * a given MSS. If all subflows are found to be busy or the currently best
|
|
+ * subflow is estimated to be slower than waiting for minsk, NULL is returned.
|
|
+ */
|
|
+static struct sock *ecf_get_available_subflow(struct sock *meta_sk,
|
|
+ struct sk_buff *skb,
|
|
+ bool zero_wnd_test)
|
|
+{
|
|
+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ struct sock *bestsk, *minsk = NULL;
|
|
+ struct tcp_sock *besttp;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct ecfsched_cb *ecf_cb = ecfsched_get_cb(tcp_sk(meta_sk));
|
|
+ u32 min_srtt = U32_MAX;
|
|
+ u32 sub_sndbuf = 0;
|
|
+ u32 sub_packets_out = 0;
|
|
+
|
|
+ /* Answer the DATA_FIN on the same subflow */
|
|
+ if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
|
|
+ skb && mptcp_is_data_fin(skb)) {
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ bestsk = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (tcp_sk(bestsk)->mptcp->path_index == mpcb->dfin_path_index &&
|
|
+ mptcp_is_available(bestsk, skb, zero_wnd_test))
|
|
+ return bestsk;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* First, find the overall best (fastest) subflow */
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ bestsk = mptcp_to_sock(mptcp);
|
|
+ besttp = tcp_sk(bestsk);
|
|
+
|
|
+ /* Set of states for which we are allowed to send data */
|
|
+ if (!mptcp_sk_can_send(bestsk))
|
|
+ continue;
|
|
+
|
|
+ /* We do not send data on this subflow unless it is
|
|
+ * fully established, i.e. the 4th ack has been received.
|
|
+ */
|
|
+ if (besttp->mptcp->pre_established)
|
|
+ continue;
|
|
+
|
|
+ sub_sndbuf += bestsk->sk_wmem_queued;
|
|
+ sub_packets_out += besttp->packets_out;
|
|
+
|
|
+ /* record minimal rtt */
|
|
+ if (besttp->srtt_us < min_srtt) {
|
|
+ min_srtt = besttp->srtt_us;
|
|
+ minsk = bestsk;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* find the current best subflow according to the default scheduler */
|
|
+ bestsk = get_available_subflow(meta_sk, skb, zero_wnd_test);
|
|
+
|
|
+ /* if we decided to use a slower flow, we have the option of not using it at all */
|
|
+ if (bestsk && minsk && bestsk != minsk) {
|
|
+ u32 mss = tcp_current_mss(bestsk); /* assuming equal MSS */
|
|
+ u32 sndbuf_meta = meta_sk->sk_wmem_queued;
|
|
+ u32 sndbuf_minus = sub_sndbuf;
|
|
+ u32 sndbuf = 0;
|
|
+
|
|
+ u32 cwnd_f = tcp_sk(minsk)->snd_cwnd;
|
|
+ u32 srtt_f = tcp_sk(minsk)->srtt_us >> 3;
|
|
+ u32 rttvar_f = tcp_sk(minsk)->rttvar_us >> 1;
|
|
+
|
|
+ u32 cwnd_s = tcp_sk(bestsk)->snd_cwnd;
|
|
+ u32 srtt_s = tcp_sk(bestsk)->srtt_us >> 3;
|
|
+ u32 rttvar_s = tcp_sk(bestsk)->rttvar_us >> 1;
|
|
+
|
|
+ u32 delta = max(rttvar_f, rttvar_s);
|
|
+
|
|
+ u32 x_f;
|
|
+ u64 lhs, rhs; /* to avoid overflow, using u64 */
|
|
+
|
|
+ if (tcp_sk(meta_sk)->packets_out > sub_packets_out)
|
|
+ sndbuf_minus += (tcp_sk(meta_sk)->packets_out - sub_packets_out) * mss;
|
|
+
|
|
+ if (sndbuf_meta > sndbuf_minus)
|
|
+ sndbuf = sndbuf_meta - sndbuf_minus;
|
|
+
|
|
+ /* we have something to send.
|
|
+ * at least one transmission over the fastest subflow is required
|
|
+ */
|
|
+ x_f = sndbuf > cwnd_f * mss ? sndbuf : cwnd_f * mss;
|
|
+ lhs = srtt_f * (x_f + cwnd_f * mss);
|
|
+ rhs = cwnd_f * mss * (srtt_s + delta);
|
|
+
|
|
+ if (mptcp_ecf_r_beta * lhs < mptcp_ecf_r_beta * rhs + ecf_cb->switching_margin * rhs) {
|
|
+ u32 x_s = sndbuf > cwnd_s * mss ? sndbuf : cwnd_s * mss;
|
|
+ u64 lhs_s = srtt_s * x_s;
|
|
+ u64 rhs_s = cwnd_s * mss * (2 * srtt_f + delta);
|
|
+
|
|
+ if (lhs_s >= rhs_s) {
|
|
+ /* too much slower than the fastest subflow */
|
|
+ ecf_cb->switching_margin = 1;
|
|
+ return NULL;
|
|
+ }
|
|
+ } else {
|
|
+ /* use slower one */
|
|
+ ecf_cb->switching_margin = 0;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return bestsk;
|
|
+}
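
Stripped of the socket plumbing, the decision above reduces to comparing an estimate of finishing the queued data over the fastest subflow (including waiting for its cwnd to free up) against starting to send part of it over the slower-but-available one now. The standalone restatement below mirrors the two inequalities with the same variable names; the function name and the numbers in main() are hypothetical.

#include <stdio.h>
#include <stdint.h>

/* Returns 1 if it looks worthwhile to wait for the fastest subflow
 * instead of using the slower-but-available one (r_beta = 4, beta = 0.25).
 */
static int ecf_should_wait(uint32_t sndbuf, uint32_t mss,
			   uint32_t cwnd_f, uint32_t srtt_f, uint32_t rttvar_f,
			   uint32_t cwnd_s, uint32_t srtt_s, uint32_t rttvar_s,
			   int *switching_margin)
{
	const uint32_t r_beta = 4;
	uint32_t delta = rttvar_f > rttvar_s ? rttvar_f : rttvar_s;
	uint32_t x_f = sndbuf > cwnd_f * mss ? sndbuf : cwnd_f * mss;
	uint64_t lhs = (uint64_t)srtt_f * (x_f + cwnd_f * mss);
	uint64_t rhs = (uint64_t)cwnd_f * mss * (srtt_s + delta);

	if (r_beta * lhs < r_beta * rhs + (uint64_t)*switching_margin * rhs) {
		uint32_t x_s = sndbuf > cwnd_s * mss ? sndbuf : cwnd_s * mss;
		uint64_t lhs_s = (uint64_t)srtt_s * x_s;
		uint64_t rhs_s = (uint64_t)cwnd_s * mss * (2 * srtt_f + delta);

		if (lhs_s >= rhs_s) {
			*switching_margin = 1;	/* slow flow not worth it: wait */
			return 1;
		}
	} else {
		*switching_margin = 0;		/* use the slower flow */
	}
	return 0;
}

int main(void)
{
	int margin = 0;

	/* Hypothetical: 100 KB queued, fast path ~20 ms RTT, slow path ~200 ms */
	printf("wait=%d\n", ecf_should_wait(100000, 1400, 10, 20000, 2000,
					    10, 200000, 20000, &margin));
	return 0;
}
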
|
|
+
|
|
+static void ecfsched_init(struct sock *sk)
|
|
+{
|
|
+ struct ecfsched_priv *ecf_p = ecfsched_get_priv(tcp_sk(sk));
|
|
+ struct ecfsched_cb *ecf_cb = ecfsched_get_cb(tcp_sk(mptcp_meta_sk(sk)));
|
|
+
|
|
+ ecf_p->last_rbuf_opti = tcp_jiffies32;
|
|
+ ecf_cb->switching_margin = 0;
|
|
+}
|
|
+
|
|
+struct mptcp_sched_ops mptcp_sched_ecf = {
|
|
+ .get_subflow = ecf_get_available_subflow,
|
|
+ .next_segment = mptcp_next_segment,
|
|
+ .init = ecfsched_init,
|
|
+ .name = "ecf",
|
|
+ .owner = THIS_MODULE,
|
|
+};
|
|
+
|
|
+static int __init ecf_register(void)
|
|
+{
|
|
+ BUILD_BUG_ON(sizeof(struct ecfsched_priv) > MPTCP_SCHED_SIZE);
|
|
+ BUILD_BUG_ON(sizeof(struct ecfsched_cb) > MPTCP_SCHED_DATA_SIZE);
|
|
+
|
|
+ if (mptcp_register_scheduler(&mptcp_sched_ecf))
|
|
+ return -1;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void ecf_unregister(void)
|
|
+{
|
|
+ mptcp_unregister_scheduler(&mptcp_sched_ecf);
|
|
+}
|
|
+
|
|
+module_init(ecf_register);
|
|
+module_exit(ecf_unregister);
|
|
+
|
|
+MODULE_AUTHOR("Yeon-sup Lim, Daniel Weber");
|
|
+MODULE_LICENSE("GPL");
|
|
+MODULE_DESCRIPTION("ECF (Earliest Completion First) scheduler for MPTCP, based on default minimum RTT scheduler");
|
|
+MODULE_VERSION("0.95");
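
Once the module is loaded, the scheduler registered above under the name "ecf" is normally selected through the net.mptcp.mptcp_scheduler sysctl exposed by the MPTCP core; that knob lives outside this hunk, so treat the path below as an assumption. A small sketch that switches the default scheduler from C, equivalent to `sysctl net.mptcp.mptcp_scheduler=ecf`:

#include <stdio.h>

int main(void)
{
	/* Assumed sysctl path exposed by the MPTCP core. */
	const char *path = "/proc/sys/net/mptcp/mptcp_scheduler";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fputs("ecf\n", f);
	fclose(f);
	return 0;
}
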
|
|
diff --git a/net/mptcp/mptcp_fullmesh.c b/net/mptcp/mptcp_fullmesh.c
|
|
new file mode 100644
|
|
index 000000000000..5424960256e6
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mptcp_fullmesh.c
|
|
@@ -0,0 +1,1938 @@
|
|
+#include <linux/module.h>
|
|
+#include <linux/proc_fs.h>
|
|
+
|
|
+#include <net/mptcp.h>
|
|
+#include <net/mptcp_v4.h>
|
|
+
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+#include <net/mptcp_v6.h>
|
|
+#include <net/addrconf.h>
|
|
+#endif
|
|
+
|
|
+enum {
|
|
+ MPTCP_EVENT_ADD = 1,
|
|
+ MPTCP_EVENT_DEL,
|
|
+ MPTCP_EVENT_MOD,
|
|
+};
|
|
+
|
|
+#define MPTCP_SUBFLOW_RETRY_DELAY 1000
|
|
+
|
|
+/* Max number of local or remote addresses we can store.
|
|
+ * When changing, see the bitfield below in fullmesh_rem4/6.
|
|
+ */
|
|
+#define MPTCP_MAX_ADDR 8
|
|
+
|
|
+struct fullmesh_rem4 {
|
|
+ u8 rem4_id;
|
|
+ u8 bitfield;
|
|
+ u8 retry_bitfield;
|
|
+ __be16 port;
|
|
+ struct in_addr addr;
|
|
+};
|
|
+
|
|
+struct fullmesh_rem6 {
|
|
+ u8 rem6_id;
|
|
+ u8 bitfield;
|
|
+ u8 retry_bitfield;
|
|
+ __be16 port;
|
|
+ struct in6_addr addr;
|
|
+};
|
|
+
|
|
+struct mptcp_loc_addr {
|
|
+ struct mptcp_loc4 locaddr4[MPTCP_MAX_ADDR];
|
|
+ u8 loc4_bits;
|
|
+ u8 next_v4_index;
|
|
+
|
|
+ struct mptcp_loc6 locaddr6[MPTCP_MAX_ADDR];
|
|
+ u8 loc6_bits;
|
|
+ u8 next_v6_index;
|
|
+ struct rcu_head rcu;
|
|
+};
|
|
+
|
|
+struct mptcp_addr_event {
|
|
+ struct list_head list;
|
|
+ unsigned short family;
|
|
+ u8 code:7,
|
|
+ low_prio:1;
|
|
+ int if_idx;
|
|
+ union inet_addr addr;
|
|
+};
|
|
+
|
|
+struct fullmesh_priv {
|
|
+ /* Worker struct for subflow establishment */
|
|
+ struct work_struct subflow_work;
|
|
+ /* Delayed worker, when the routing-tables are not yet ready. */
|
|
+ struct delayed_work subflow_retry_work;
|
|
+
|
|
+ /* Remote addresses */
|
|
+ struct fullmesh_rem4 remaddr4[MPTCP_MAX_ADDR];
|
|
+ struct fullmesh_rem6 remaddr6[MPTCP_MAX_ADDR];
|
|
+
|
|
+ struct mptcp_cb *mpcb;
|
|
+
|
|
+ u16 remove_addrs; /* Addresses to remove */
|
|
+ u8 announced_addrs_v4; /* IPv4 Addresses we did announce */
|
|
+ u8 announced_addrs_v6; /* IPv6 Addresses we did announce */
|
|
+
|
|
+ u8 add_addr; /* Are we sending an add_addr? */
|
|
+
|
|
+ u8 rem4_bits;
|
|
+ u8 rem6_bits;
|
|
+
|
|
+ /* Have we established the additional subflows for primary pair? */
|
|
+ u8 first_pair:1;
|
|
+};
|
|
+
|
|
+struct mptcp_fm_ns {
|
|
+ struct mptcp_loc_addr __rcu *local;
|
|
+ spinlock_t local_lock; /* Protecting the above pointer */
|
|
+ struct list_head events;
|
|
+ struct delayed_work address_worker;
|
|
+
|
|
+ struct net *net;
|
|
+};
|
|
+
|
|
+static int num_subflows __read_mostly = 1;
|
|
+module_param(num_subflows, int, 0644);
|
|
+MODULE_PARM_DESC(num_subflows, "choose the number of subflows per pair of IP addresses of MPTCP connection");
|
|
+
|
|
+static int create_on_err __read_mostly;
|
|
+module_param(create_on_err, int, 0644);
|
|
+MODULE_PARM_DESC(create_on_err, "recreate the subflow upon a timeout");
|
|
+
|
|
+static struct mptcp_pm_ops full_mesh __read_mostly;
|
|
+
|
|
+static void full_mesh_create_subflows(struct sock *meta_sk);
|
|
+
|
|
+static struct mptcp_fm_ns *fm_get_ns(const struct net *net)
|
|
+{
|
|
+ return (struct mptcp_fm_ns *)net->mptcp.path_managers[MPTCP_PM_FULLMESH];
|
|
+}
|
|
+
|
|
+static struct fullmesh_priv *fullmesh_get_priv(const struct mptcp_cb *mpcb)
|
|
+{
|
|
+ return (struct fullmesh_priv *)&mpcb->mptcp_pm[0];
|
|
+}
|
|
+
|
|
+/* Find the first free index in the bitfield */
|
|
+static int __mptcp_find_free_index(u8 bitfield, u8 base)
|
|
+{
|
|
+ int i;
|
|
+
|
|
+ /* There are anyways no free bits... */
|
|
+ if (bitfield == 0xff)
|
|
+ goto exit;
|
|
+
|
|
+ i = ffs(~(bitfield >> base)) - 1;
|
|
+ if (i < 0)
|
|
+ goto exit;
|
|
+
|
|
+ /* No free bits when starting at base, try from 0 on */
|
|
+ if (i + base >= sizeof(bitfield) * 8)
|
|
+ return __mptcp_find_free_index(bitfield, 0);
|
|
+
|
|
+ return i + base;
|
|
+exit:
|
|
+ return -1;
|
|
+}
|
|
+
|
|
+static int mptcp_find_free_index(u8 bitfield)
|
|
+{
|
|
+ return __mptcp_find_free_index(bitfield, 0);
|
|
+}
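
The two helpers above implement a tiny allocator over an 8-bit bitmap: find the first clear bit at or after a preferred base index, wrapping around to 0 if the top of the byte is exhausted. A standalone demonstration of the same search, using userspace ffs() from <strings.h> instead of the kernel's:

#include <stdio.h>
#include <stdint.h>
#include <strings.h>	/* ffs() */

static int find_free_index(uint8_t bitfield, uint8_t base)
{
	int i;

	if (bitfield == 0xff)
		return -1;		/* no free slot at all */

	i = ffs(~(bitfield >> base) & 0xff) - 1;
	if (i < 0 || i + base >= 8)
		return find_free_index(bitfield, 0);	/* wrap around */

	return i + base;
}

int main(void)
{
	/* bits 0,1,6,7 taken, start looking at index 6 -> wraps and finds 2 */
	printf("%d\n", find_free_index(0xC3, 6));
	/* everything taken */
	printf("%d\n", find_free_index(0xff, 0));
	return 0;
}
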
|
|
+
|
|
+static void mptcp_addv4_raddr(struct mptcp_cb *mpcb,
|
|
+ const struct in_addr *addr,
|
|
+ __be16 port, u8 id)
|
|
+{
|
|
+ int i;
|
|
+ struct fullmesh_rem4 *rem4;
|
|
+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
|
|
+
|
|
+ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
|
|
+ rem4 = &fmp->remaddr4[i];
|
|
+
|
|
+ /* Address is already in the list --- nothing to do */
|
|
+ if (rem4->rem4_id == id &&
|
|
+ rem4->addr.s_addr == addr->s_addr && rem4->port == port)
|
|
+ return;
|
|
+
|
|
+ /* This may be the case when the peer is behind a NAT. It is
|
|
+ * trying to JOIN, thus sending the JOIN with a certain ID.
|
|
+ * However the src_addr of the IP-packet has been changed. We
|
|
+ * update the addr in the list, because this is the address as
|
|
+ * OUR BOX sees it.
|
|
+ */
|
|
+ if (rem4->rem4_id == id && rem4->addr.s_addr != addr->s_addr) {
|
|
+ /* update the address */
|
|
+ mptcp_debug("%s: updating old addr:%pI4 to addr %pI4 with id:%d\n",
|
|
+ __func__, &rem4->addr.s_addr,
|
|
+ &addr->s_addr, id);
|
|
+ rem4->addr.s_addr = addr->s_addr;
|
|
+ rem4->port = port;
|
|
+ mpcb->list_rcvd = 1;
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ i = mptcp_find_free_index(fmp->rem4_bits);
|
|
+ /* Do we already have the maximum number of local/remote addresses? */
|
|
+ if (i < 0) {
|
|
+ mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI4\n",
|
|
+ __func__, MPTCP_MAX_ADDR, &addr->s_addr);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ rem4 = &fmp->remaddr4[i];
|
|
+
|
|
+ /* Address is not known yet, store it */
|
|
+ rem4->addr.s_addr = addr->s_addr;
|
|
+ rem4->port = port;
|
|
+ rem4->bitfield = 0;
|
|
+ rem4->retry_bitfield = 0;
|
|
+ rem4->rem4_id = id;
|
|
+ mpcb->list_rcvd = 1;
|
|
+ fmp->rem4_bits |= (1 << i);
|
|
+
|
|
+ return;
|
|
+}
|
|
+
|
|
+static void mptcp_addv6_raddr(struct mptcp_cb *mpcb,
|
|
+ const struct in6_addr *addr,
|
|
+ __be16 port, u8 id)
|
|
+{
|
|
+ int i;
|
|
+ struct fullmesh_rem6 *rem6;
|
|
+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
|
|
+
|
|
+ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
|
|
+ rem6 = &fmp->remaddr6[i];
|
|
+
|
|
+ /* Address is already in the list --- nothing to do */
|
|
+ if (rem6->rem6_id == id &&
|
|
+ ipv6_addr_equal(&rem6->addr, addr) && rem6->port == port)
|
|
+ return;
|
|
+
|
|
+ /* This may be the case when the peer is behind a NAT. It is
|
|
+ * trying to JOIN, thus sending the JOIN with a certain ID.
|
|
+ * However the src_addr of the IP-packet has been changed. We
|
|
+ * update the addr in the list, because this is the address as
|
|
+ * OUR BOX sees it.
|
|
+ */
|
|
+ if (rem6->rem6_id == id) {
|
|
+ /* update the address */
|
|
+ mptcp_debug("%s: updating old addr: %pI6 to addr %pI6 with id:%d\n",
|
|
+ __func__, &rem6->addr, addr, id);
|
|
+ rem6->addr = *addr;
|
|
+ rem6->port = port;
|
|
+ mpcb->list_rcvd = 1;
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ i = mptcp_find_free_index(fmp->rem6_bits);
|
|
+ /* Do we already have the maximum number of local/remote addresses? */
|
|
+ if (i < 0) {
|
|
+ mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI6\n",
|
|
+ __func__, MPTCP_MAX_ADDR, addr);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ rem6 = &fmp->remaddr6[i];
|
|
+
|
|
+ /* Address is not known yet, store it */
|
|
+ rem6->addr = *addr;
|
|
+ rem6->port = port;
|
|
+ rem6->bitfield = 0;
|
|
+ rem6->retry_bitfield = 0;
|
|
+ rem6->rem6_id = id;
|
|
+ mpcb->list_rcvd = 1;
|
|
+ fmp->rem6_bits |= (1 << i);
|
|
+
|
|
+ return;
|
|
+}
|
|
+
|
|
+static void mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id)
|
|
+{
|
|
+ int i;
|
|
+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
|
|
+
|
|
+ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
|
|
+ if (fmp->remaddr4[i].rem4_id == id) {
|
|
+ /* remove address from bitfield */
|
|
+ fmp->rem4_bits &= ~(1 << i);
|
|
+
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+static void mptcp_v6_rem_raddress(const struct mptcp_cb *mpcb, u8 id)
|
|
+{
|
|
+ int i;
|
|
+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
|
|
+
|
|
+ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
|
|
+ if (fmp->remaddr6[i].rem6_id == id) {
|
|
+ /* remove address from bitfield */
|
|
+ fmp->rem6_bits &= ~(1 << i);
|
|
+
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Sets the bitfield of the remote-address field */
|
|
+static void mptcp_v4_set_init_addr_bit(const struct mptcp_cb *mpcb,
|
|
+ const struct in_addr *addr, u8 index)
|
|
+{
|
|
+ int i;
|
|
+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
|
|
+
|
|
+ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
|
|
+ if (fmp->remaddr4[i].addr.s_addr == addr->s_addr) {
|
|
+ fmp->remaddr4[i].bitfield |= (1 << index);
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Sets the bitfield of the remote-address field */
|
|
+static void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb,
|
|
+ const struct in6_addr *addr, u8 index)
|
|
+{
|
|
+ int i;
|
|
+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
|
|
+
|
|
+ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
|
|
+ if (ipv6_addr_equal(&fmp->remaddr6[i].addr, addr)) {
|
|
+ fmp->remaddr6[i].bitfield |= (1 << index);
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+static void mptcp_set_init_addr_bit(struct mptcp_cb *mpcb,
|
|
+ const union inet_addr *addr,
|
|
+ sa_family_t family, u8 id)
|
|
+{
|
|
+ if (family == AF_INET)
|
|
+ mptcp_v4_set_init_addr_bit(mpcb, &addr->in, id);
|
|
+ else
|
|
+ mptcp_v6_set_init_addr_bit(mpcb, &addr->in6, id);
|
|
+}
|
|
+
|
|
+static void mptcp_v4_subflows(struct sock *meta_sk,
|
|
+ const struct mptcp_loc4 *loc,
|
|
+ struct mptcp_rem4 *rem)
|
|
+{
|
|
+ int i;
|
|
+
|
|
+ for (i = 1; i < num_subflows; i++)
|
|
+ mptcp_init4_subsockets(meta_sk, loc, rem);
|
|
+}
|
|
+
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+static void mptcp_v6_subflows(struct sock *meta_sk,
|
|
+ const struct mptcp_loc6 *loc,
|
|
+ struct mptcp_rem6 *rem)
|
|
+{
|
|
+ int i;
|
|
+
|
|
+ for (i = 1; i < num_subflows; i++)
|
|
+ mptcp_init6_subsockets(meta_sk, loc, rem);
|
|
+}
|
|
+#endif
|
|
+
|
|
+static void retry_subflow_worker(struct work_struct *work)
|
|
+{
|
|
+ struct delayed_work *delayed_work = container_of(work,
|
|
+ struct delayed_work,
|
|
+ work);
|
|
+ struct fullmesh_priv *fmp = container_of(delayed_work,
|
|
+ struct fullmesh_priv,
|
|
+ subflow_retry_work);
|
|
+ struct mptcp_cb *mpcb = fmp->mpcb;
|
|
+ struct sock *meta_sk = mpcb->meta_sk;
|
|
+ struct mptcp_loc_addr *mptcp_local;
|
|
+ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
|
|
+ int iter = 0, i;
|
|
+
|
|
+ /* We need a local (stable) copy of the address-list. Really, it is not
|
|
+ * such a big deal, if the address-list is not 100% up-to-date.
|
|
+ */
|
|
+ rcu_read_lock_bh();
|
|
+ mptcp_local = rcu_dereference_bh(fm_ns->local);
|
|
+ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC);
|
|
+ rcu_read_unlock_bh();
|
|
+
|
|
+ if (!mptcp_local)
|
|
+ return;
|
|
+
|
|
+next_subflow:
|
|
+ if (iter) {
|
|
+ release_sock(meta_sk);
|
|
+ mutex_unlock(&mpcb->mpcb_mutex);
|
|
+
|
|
+ cond_resched();
|
|
+ }
|
|
+ mutex_lock(&mpcb->mpcb_mutex);
|
|
+ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
|
|
+
|
|
+ if (!mptcp(tcp_sk(meta_sk)))
|
|
+ goto exit;
|
|
+
|
|
+ iter++;
|
|
+
|
|
+ if (sock_flag(meta_sk, SOCK_DEAD))
|
|
+ goto exit;
|
|
+
|
|
+ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
|
|
+ struct fullmesh_rem4 *rem = &fmp->remaddr4[i];
|
|
+ /* Do we need to retry establishing a subflow? */
|
|
+ if (rem->retry_bitfield) {
|
|
+ int i = mptcp_find_free_index(~rem->retry_bitfield);
|
|
+ struct mptcp_rem4 rem4;
|
|
+
|
|
+ rem->bitfield |= (1 << i);
|
|
+ rem->retry_bitfield &= ~(1 << i);
|
|
+
|
|
+ rem4.addr = rem->addr;
|
|
+ rem4.port = rem->port;
|
|
+ rem4.rem4_id = rem->rem4_id;
|
|
+
|
|
+ mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i], &rem4);
|
|
+ mptcp_v4_subflows(meta_sk,
|
|
+ &mptcp_local->locaddr4[i],
|
|
+ &rem4);
|
|
+ goto next_subflow;
|
|
+ }
|
|
+ }
|
|
+
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
|
|
+ struct fullmesh_rem6 *rem = &fmp->remaddr6[i];
|
|
+
|
|
+ /* Do we need to retry establishing a subflow? */
|
|
+ if (rem->retry_bitfield) {
|
|
+ int i = mptcp_find_free_index(~rem->retry_bitfield);
|
|
+ struct mptcp_rem6 rem6;
|
|
+
|
|
+ rem->bitfield |= (1 << i);
|
|
+ rem->retry_bitfield &= ~(1 << i);
|
|
+
|
|
+ rem6.addr = rem->addr;
|
|
+ rem6.port = rem->port;
|
|
+ rem6.rem6_id = rem->rem6_id;
|
|
+
|
|
+ mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i], &rem6);
|
|
+ mptcp_v6_subflows(meta_sk,
|
|
+ &mptcp_local->locaddr6[i],
|
|
+ &rem6);
|
|
+ goto next_subflow;
|
|
+ }
|
|
+ }
|
|
+#endif
|
|
+
|
|
+exit:
|
|
+ kfree(mptcp_local);
|
|
+ release_sock(meta_sk);
|
|
+ mutex_unlock(&mpcb->mpcb_mutex);
|
|
+ mptcp_mpcb_put(mpcb);
|
|
+ sock_put(meta_sk);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * Create all new subflows by calling mptcp_initX_subsockets
|
|
+ *
|
|
+ * This function uses a goto next_subflow, to allow releasing the lock between
|
|
+ * new subflows and giving other processes a chance to do some work on the
|
|
+ * socket and potentially finishing the communication.
|
|
+ **/
|
|
+static void create_subflow_worker(struct work_struct *work)
|
|
+{
|
|
+ struct fullmesh_priv *fmp = container_of(work, struct fullmesh_priv,
|
|
+ subflow_work);
|
|
+ struct mptcp_cb *mpcb = fmp->mpcb;
|
|
+ struct sock *meta_sk = mpcb->meta_sk;
|
|
+ struct mptcp_loc_addr *mptcp_local;
|
|
+ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
|
|
+ int iter = 0, retry = 0;
|
|
+ int i;
|
|
+
|
|
+ /* We need a local (stable) copy of the address-list. Really, it is not
|
|
+ * such a big deal, if the address-list is not 100% up-to-date.
|
|
+ */
|
|
+ rcu_read_lock_bh();
|
|
+ mptcp_local = rcu_dereference_bh(fm_ns->local);
|
|
+ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC);
|
|
+ rcu_read_unlock_bh();
|
|
+
|
|
+ if (!mptcp_local)
|
|
+ return;
|
|
+
|
|
+next_subflow:
|
|
+ if (iter) {
|
|
+ release_sock(meta_sk);
|
|
+ mutex_unlock(&mpcb->mpcb_mutex);
|
|
+
|
|
+ cond_resched();
|
|
+ }
|
|
+ mutex_lock(&mpcb->mpcb_mutex);
|
|
+ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
|
|
+
|
|
+ if (sock_flag(meta_sk, SOCK_DEAD) || !mptcp(tcp_sk(meta_sk)))
|
|
+ goto exit;
|
|
+
|
|
+ if (mpcb->master_sk &&
|
|
+ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
|
|
+ goto exit;
|
|
+
|
|
+ /* Create the additional subflows for the first pair */
|
|
+ if (fmp->first_pair == 0 && mpcb->master_sk) {
|
|
+ struct mptcp_loc4 loc;
|
|
+ struct mptcp_rem4 rem;
|
|
+
|
|
+ loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
|
|
+ loc.loc4_id = 0;
|
|
+ loc.low_prio = 0;
|
|
+ loc.if_idx = mpcb->master_sk->sk_bound_dev_if;
|
|
+
|
|
+ rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr;
|
|
+ rem.port = inet_sk(meta_sk)->inet_dport;
|
|
+ rem.rem4_id = 0; /* Default 0 */
|
|
+
|
|
+ mptcp_v4_subflows(meta_sk, &loc, &rem);
|
|
+
|
|
+ fmp->first_pair = 1;
|
|
+ }
|
|
+ iter++;
|
|
+
|
|
+ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
|
|
+ struct fullmesh_rem4 *rem;
|
|
+ u8 remaining_bits;
|
|
+
|
|
+ rem = &fmp->remaddr4[i];
|
|
+ remaining_bits = ~(rem->bitfield) & mptcp_local->loc4_bits;
|
|
+
|
|
+ /* Are there still combinations to handle? */
|
|
+ if (remaining_bits) {
|
|
+ int i = mptcp_find_free_index(~remaining_bits);
|
|
+ struct mptcp_rem4 rem4;
|
|
+
|
|
+ rem->bitfield |= (1 << i);
|
|
+
|
|
+ rem4.addr = rem->addr;
|
|
+ rem4.port = rem->port;
|
|
+ rem4.rem4_id = rem->rem4_id;
|
|
+
|
|
+ /* If a route is not yet available then retry once */
|
|
+ if (mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i],
|
|
+ &rem4) == -ENETUNREACH)
|
|
+ retry = rem->retry_bitfield |= (1 << i);
|
|
+ else
|
|
+ mptcp_v4_subflows(meta_sk,
|
|
+ &mptcp_local->locaddr4[i],
|
|
+ &rem4);
|
|
+ goto next_subflow;
|
|
+ }
|
|
+ }
|
|
+
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ if (fmp->first_pair == 0 && mpcb->master_sk) {
|
|
+ struct mptcp_loc6 loc;
|
|
+ struct mptcp_rem6 rem;
|
|
+
|
|
+ loc.addr = inet6_sk(meta_sk)->saddr;
|
|
+ loc.loc6_id = 0;
|
|
+ loc.low_prio = 0;
|
|
+ loc.if_idx = mpcb->master_sk->sk_bound_dev_if;
|
|
+
|
|
+ rem.addr = meta_sk->sk_v6_daddr;
|
|
+ rem.port = inet_sk(meta_sk)->inet_dport;
|
|
+ rem.rem6_id = 0; /* Default 0 */
|
|
+
|
|
+ mptcp_v6_subflows(meta_sk, &loc, &rem);
|
|
+
|
|
+ fmp->first_pair = 1;
|
|
+ }
|
|
+ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
|
|
+ struct fullmesh_rem6 *rem;
|
|
+ u8 remaining_bits;
|
|
+
|
|
+ rem = &fmp->remaddr6[i];
|
|
+ remaining_bits = ~(rem->bitfield) & mptcp_local->loc6_bits;
|
|
+
|
|
+ /* Are there still combinations to handle? */
|
|
+ if (remaining_bits) {
|
|
+ int i = mptcp_find_free_index(~remaining_bits);
|
|
+ struct mptcp_rem6 rem6;
|
|
+
|
|
+ rem->bitfield |= (1 << i);
|
|
+
|
|
+ rem6.addr = rem->addr;
|
|
+ rem6.port = rem->port;
|
|
+ rem6.rem6_id = rem->rem6_id;
|
|
+
|
|
+ /* If a route is not yet available then retry once */
|
|
+ if (mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i],
|
|
+ &rem6) == -ENETUNREACH)
|
|
+ retry = rem->retry_bitfield |= (1 << i);
|
|
+ else
|
|
+ mptcp_v6_subflows(meta_sk,
|
|
+ &mptcp_local->locaddr6[i],
|
|
+ &rem6);
|
|
+ goto next_subflow;
|
|
+ }
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ if (retry && !delayed_work_pending(&fmp->subflow_retry_work)) {
|
|
+ sock_hold(meta_sk);
|
|
+ refcount_inc(&mpcb->mpcb_refcnt);
|
|
+ queue_delayed_work(mptcp_wq, &fmp->subflow_retry_work,
|
|
+ msecs_to_jiffies(MPTCP_SUBFLOW_RETRY_DELAY));
|
|
+ }
|
|
+
|
|
+exit:
|
|
+ kfree(mptcp_local);
|
|
+ release_sock(meta_sk);
|
|
+ mutex_unlock(&mpcb->mpcb_mutex);
|
|
+ mptcp_mpcb_put(mpcb);
|
|
+ sock_put(meta_sk);
|
|
+}
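
The full-mesh bookkeeping in the loops above boils down to per-remote-address bitmaps: bit i of rem->bitfield means "a subflow from local address i to this remote address exists (or was attempted)", so the combinations still to be created are simply ~bitfield masked with the currently valid local addresses. A compact sketch of that pairing logic, with hypothetical bit patterns:

#include <stdio.h>
#include <stdint.h>

/* For one remote address: which local-address indices still need a subflow?
 * loc_bits: bitmap of currently valid local addresses.
 * used_bits: bitmap of local addresses already paired with this remote.
 */
static void print_missing_pairs(uint8_t loc_bits, uint8_t used_bits)
{
	uint8_t remaining = (uint8_t)~used_bits & loc_bits;
	int i;

	for (i = 0; i < 8; i++)
		if (remaining & (1 << i))
			printf("  need subflow via local address %d\n", i);
}

int main(void)
{
	/* Hypothetical: local addresses 0,1,2 exist; 0 is already connected. */
	print_missing_pairs(0x07, 0x01);
	return 0;
}
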
|
|
+
|
|
+static void announce_remove_addr(u8 addr_id, struct sock *meta_sk)
|
|
+{
|
|
+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
|
|
+ struct sock *sk = mptcp_select_ack_sock(meta_sk);
|
|
+
|
|
+ fmp->remove_addrs |= (1 << addr_id);
|
|
+ mpcb->addr_signal = 1;
|
|
+
|
|
+ if (sk)
|
|
+ tcp_send_ack(sk);
|
|
+}
|
|
+
|
|
+static void update_addr_bitfields(struct sock *meta_sk,
|
|
+ const struct mptcp_loc_addr *mptcp_local)
|
|
+{
|
|
+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
|
|
+ int i;
|
|
+
|
|
+ /* The bits in announced_addrs_* always match with loc*_bits. So, a
|
|
+ * simple & operation unsets the correct bits, because these go from
|
|
+ * announced to non-announced
|
|
+ */
|
|
+ fmp->announced_addrs_v4 &= mptcp_local->loc4_bits;
|
|
+
|
|
+ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
|
|
+ fmp->remaddr4[i].bitfield &= mptcp_local->loc4_bits;
|
|
+ fmp->remaddr4[i].retry_bitfield &= mptcp_local->loc4_bits;
|
|
+ }
|
|
+
|
|
+ fmp->announced_addrs_v6 &= mptcp_local->loc6_bits;
|
|
+
|
|
+ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
|
|
+ fmp->remaddr6[i].bitfield &= mptcp_local->loc6_bits;
|
|
+ fmp->remaddr6[i].retry_bitfield &= mptcp_local->loc6_bits;
|
|
+ }
|
|
+}
|
|
+
|
|
+static int mptcp_find_address(const struct mptcp_loc_addr *mptcp_local,
|
|
+ sa_family_t family, const union inet_addr *addr,
|
|
+ int if_idx)
|
|
+{
|
|
+ int i;
|
|
+ u8 loc_bits;
|
|
+ bool found = false;
|
|
+
|
|
+ if (family == AF_INET)
|
|
+ loc_bits = mptcp_local->loc4_bits;
|
|
+ else
|
|
+ loc_bits = mptcp_local->loc6_bits;
|
|
+
|
|
+ mptcp_for_each_bit_set(loc_bits, i) {
|
|
+ if (family == AF_INET &&
|
|
+ (!if_idx || mptcp_local->locaddr4[i].if_idx == if_idx) &&
|
|
+ mptcp_local->locaddr4[i].addr.s_addr == addr->in.s_addr) {
|
|
+ found = true;
|
|
+ break;
|
|
+ }
|
|
+ if (family == AF_INET6 &&
|
|
+ (!if_idx || mptcp_local->locaddr6[i].if_idx == if_idx) &&
|
|
+ ipv6_addr_equal(&mptcp_local->locaddr6[i].addr,
|
|
+ &addr->in6)) {
|
|
+ found = true;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (!found)
|
|
+ return -1;
|
|
+
|
|
+ return i;
|
|
+}
|
|
+
|
|
+static int mptcp_find_address_transp(const struct mptcp_loc_addr *mptcp_local,
|
|
+ sa_family_t family, int if_idx)
|
|
+{
|
|
+ bool found = false;
|
|
+ u8 loc_bits;
|
|
+ int i;
|
|
+
|
|
+ if (family == AF_INET)
|
|
+ loc_bits = mptcp_local->loc4_bits;
|
|
+ else
|
|
+ loc_bits = mptcp_local->loc6_bits;
|
|
+
|
|
+ mptcp_for_each_bit_set(loc_bits, i) {
|
|
+ if (family == AF_INET &&
|
|
+ (!if_idx || mptcp_local->locaddr4[i].if_idx == if_idx)) {
|
|
+ found = true;
|
|
+ break;
|
|
+ }
|
|
+ if (family == AF_INET6 &&
|
|
+ (!if_idx || mptcp_local->locaddr6[i].if_idx == if_idx)) {
|
|
+ found = true;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (!found)
|
|
+ return -1;
|
|
+
|
|
+ return i;
|
|
+}
|
|
+
|
|
+static void mptcp_address_worker(struct work_struct *work)
|
|
+{
|
|
+ const struct delayed_work *delayed_work = container_of(work,
|
|
+ struct delayed_work,
|
|
+ work);
|
|
+ struct mptcp_fm_ns *fm_ns = container_of(delayed_work,
|
|
+ struct mptcp_fm_ns,
|
|
+ address_worker);
|
|
+ struct net *net = fm_ns->net;
|
|
+ struct mptcp_addr_event *event = NULL;
|
|
+ struct mptcp_loc_addr *mptcp_local, *old;
|
|
+ int i, id = -1; /* id is used in the socket-code on a delete-event */
|
|
+ bool success; /* Used to indicate if we succeeded handling the event */
|
|
+
|
|
+next_event:
|
|
+ success = false;
|
|
+ kfree(event);
|
|
+
|
|
+ /* First, let's dequeue an event from our event-list */
|
|
+ rcu_read_lock_bh();
|
|
+ spin_lock(&fm_ns->local_lock);
|
|
+
|
|
+ event = list_first_entry_or_null(&fm_ns->events,
|
|
+ struct mptcp_addr_event, list);
|
|
+ if (!event) {
|
|
+ spin_unlock(&fm_ns->local_lock);
|
|
+ rcu_read_unlock_bh();
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ list_del(&event->list);
|
|
+
|
|
+ mptcp_local = rcu_dereference_bh(fm_ns->local);
|
|
+
|
|
+ if (event->code == MPTCP_EVENT_DEL) {
|
|
+ id = mptcp_find_address(mptcp_local, event->family,
|
|
+ &event->addr, event->if_idx);
|
|
+
|
|
+ /* Not in the list - so we don't care */
|
|
+ if (id < 0) {
|
|
+ mptcp_debug("%s could not find id\n", __func__);
|
|
+ goto duno;
|
|
+ }
|
|
+
|
|
+ old = mptcp_local;
|
|
+ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local),
|
|
+ GFP_ATOMIC);
|
|
+ if (!mptcp_local)
|
|
+ goto duno;
|
|
+
|
|
+ if (event->family == AF_INET)
|
|
+ mptcp_local->loc4_bits &= ~(1 << id);
|
|
+ else
|
|
+ mptcp_local->loc6_bits &= ~(1 << id);
|
|
+
|
|
+ rcu_assign_pointer(fm_ns->local, mptcp_local);
|
|
+ kfree_rcu(old, rcu);
|
|
+ } else {
|
|
+ int i = mptcp_find_address(mptcp_local, event->family,
|
|
+ &event->addr, event->if_idx);
|
|
+ int j = i;
|
|
+
|
|
+ if (j < 0) {
|
|
+ /* Not in the list, so we have to find an empty slot */
|
|
+ if (event->family == AF_INET)
|
|
+ i = __mptcp_find_free_index(mptcp_local->loc4_bits,
|
|
+ mptcp_local->next_v4_index);
|
|
+ if (event->family == AF_INET6)
|
|
+ i = __mptcp_find_free_index(mptcp_local->loc6_bits,
|
|
+ mptcp_local->next_v6_index);
|
|
+
|
|
+ if (i < 0) {
|
|
+ mptcp_debug("%s no more space\n", __func__);
|
|
+ goto duno;
|
|
+ }
|
|
+
|
|
+ /* It might have been a MOD-event. */
|
|
+ event->code = MPTCP_EVENT_ADD;
|
|
+ } else {
|
|
+ /* Let's check if anything changes */
|
|
+ if (event->family == AF_INET &&
|
|
+ event->low_prio == mptcp_local->locaddr4[i].low_prio)
|
|
+ goto duno;
|
|
+
|
|
+ if (event->family == AF_INET6 &&
|
|
+ event->low_prio == mptcp_local->locaddr6[i].low_prio)
|
|
+ goto duno;
|
|
+ }
|
|
+
|
|
+ old = mptcp_local;
|
|
+ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local),
|
|
+ GFP_ATOMIC);
|
|
+ if (!mptcp_local)
|
|
+ goto duno;
|
|
+
|
|
+ if (event->family == AF_INET) {
|
|
+ mptcp_local->locaddr4[i].addr.s_addr = event->addr.in.s_addr;
|
|
+ mptcp_local->locaddr4[i].loc4_id = i + 1;
|
|
+ mptcp_local->locaddr4[i].low_prio = event->low_prio;
|
|
+ mptcp_local->locaddr4[i].if_idx = event->if_idx;
|
|
+
|
|
+ mptcp_debug("%s updated IP %pI4 on ifidx %u prio %u id %u\n",
|
|
+ __func__, &event->addr.in.s_addr,
|
|
+ event->if_idx, event->low_prio, i + 1);
|
|
+ } else {
|
|
+ mptcp_local->locaddr6[i].addr = event->addr.in6;
|
|
+ mptcp_local->locaddr6[i].loc6_id = i + MPTCP_MAX_ADDR;
|
|
+ mptcp_local->locaddr6[i].low_prio = event->low_prio;
|
|
+ mptcp_local->locaddr6[i].if_idx = event->if_idx;
|
|
+
|
|
+ mptcp_debug("%s updated IP %pI6 on ifidx %u prio %u id %u\n",
|
|
+ __func__, &event->addr.in6,
|
|
+ event->if_idx, event->low_prio, i + MPTCP_MAX_ADDR);
|
|
+ }
|
|
+
|
|
+ if (j < 0) {
|
|
+ if (event->family == AF_INET) {
|
|
+ mptcp_local->loc4_bits |= (1 << i);
|
|
+ mptcp_local->next_v4_index = i + 1;
|
|
+ } else {
|
|
+ mptcp_local->loc6_bits |= (1 << i);
|
|
+ mptcp_local->next_v6_index = i + 1;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ rcu_assign_pointer(fm_ns->local, mptcp_local);
|
|
+ kfree_rcu(old, rcu);
|
|
+ }
|
|
+ success = true;
|
|
+
|
|
+duno:
|
|
+ spin_unlock(&fm_ns->local_lock);
|
|
+ rcu_read_unlock_bh();
|
|
+
|
|
+ if (!success)
|
|
+ goto next_event;
|
|
+
|
|
+ /* Now we iterate over the MPTCP-sockets and apply the event. */
|
|
+ for (i = 0; i <= mptcp_tk_htable.mask; i++) {
|
|
+ const struct hlist_nulls_node *node;
|
|
+ struct tcp_sock *meta_tp;
|
|
+
|
|
+ rcu_read_lock_bh();
|
|
+ hlist_nulls_for_each_entry_rcu(meta_tp, node,
|
|
+ &mptcp_tk_htable.hashtable[i],
|
|
+ tk_table) {
|
|
+ struct sock *meta_sk = (struct sock *)meta_tp, *sk;
|
|
+ bool meta_v4 = meta_sk->sk_family == AF_INET;
|
|
+ struct mptcp_cb *mpcb;
|
|
+
|
|
+ if (sock_net(meta_sk) != net)
|
|
+ continue;
|
|
+
|
|
+ if (meta_v4) {
|
|
+ /* skip IPv6 events if meta is IPv4 */
|
|
+ if (event->family == AF_INET6)
|
|
+ continue;
|
|
+ } else if (event->family == AF_INET && meta_sk->sk_ipv6only) {
|
|
+ /* skip IPv4 events if IPV6_V6ONLY is set */
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (unlikely(!refcount_inc_not_zero(&meta_sk->sk_refcnt)))
|
|
+ continue;
|
|
+
|
|
+ bh_lock_sock(meta_sk);
|
|
+
|
|
+ mpcb = meta_tp->mpcb;
|
|
+ if (!mpcb)
|
|
+ goto next;
|
|
+
|
|
+ if (!mptcp(meta_tp) || !is_meta_sk(meta_sk) ||
|
|
+ mptcp_in_infinite_mapping_weak(mpcb))
|
|
+ goto next;
|
|
+
|
|
+ /* May be that the pm has changed in-between */
|
|
+ if (mpcb->pm_ops != &full_mesh)
|
|
+ goto next;
|
|
+
|
|
+ if (sock_owned_by_user(meta_sk)) {
|
|
+ if (!test_and_set_bit(MPTCP_PATH_MANAGER_DEFERRED,
|
|
+ &meta_sk->sk_tsq_flags))
|
|
+ sock_hold(meta_sk);
|
|
+
|
|
+ goto next;
|
|
+ }
|
|
+
|
|
+ if (event->code == MPTCP_EVENT_ADD) {
|
|
+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
|
|
+
|
|
+ fmp->add_addr++;
|
|
+ mpcb->addr_signal = 1;
|
|
+
|
|
+ sk = mptcp_select_ack_sock(meta_sk);
|
|
+ if (sk)
|
|
+ tcp_send_ack(sk);
|
|
+
|
|
+ full_mesh_create_subflows(meta_sk);
|
|
+ }
|
|
+
|
|
+ if (event->code == MPTCP_EVENT_DEL) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct mptcp_loc_addr *mptcp_local;
|
|
+ struct hlist_node *tmp;
|
|
+ bool found = false;
|
|
+
|
|
+ mptcp_local = rcu_dereference_bh(fm_ns->local);
|
|
+
|
|
+ /* In any case, we need to update our bitfields */
|
|
+ if (id >= 0)
|
|
+ update_addr_bitfields(meta_sk, mptcp_local);
|
|
+
|
|
+ /* Look for the socket and remove it */
|
|
+ mptcp_for_each_sub_safe(mpcb, mptcp, tmp) {
|
|
+ struct sock *sk = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if ((event->family == AF_INET6 &&
|
|
+ (sk->sk_family == AF_INET ||
|
|
+ mptcp_v6_is_v4_mapped(sk))) ||
|
|
+ (event->family == AF_INET &&
|
|
+ (sk->sk_family == AF_INET6 &&
|
|
+ !mptcp_v6_is_v4_mapped(sk))))
|
|
+ continue;
|
|
+
|
|
+ if (event->family == AF_INET &&
|
|
+ (sk->sk_family == AF_INET ||
|
|
+ mptcp_v6_is_v4_mapped(sk)) &&
|
|
+ inet_sk(sk)->inet_saddr != event->addr.in.s_addr)
|
|
+ continue;
|
|
+
|
|
+ if (event->family == AF_INET6 &&
|
|
+ sk->sk_family == AF_INET6 &&
|
|
+ !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6))
|
|
+ continue;
|
|
+
|
|
+ /* Reinject, so that pf = 1 and so we
|
|
+ * won't select this one as the
|
|
+ * ack-sock.
|
|
+ */
|
|
+ mptcp_reinject_data(sk, 0);
|
|
+
|
|
+ /* We announce the removal of this id */
|
|
+ announce_remove_addr(tcp_sk(sk)->mptcp->loc_id, meta_sk);
|
|
+
|
|
+ mptcp_sub_force_close(sk);
|
|
+ found = true;
|
|
+ }
|
|
+
|
|
+ if (found)
|
|
+ goto next;
|
|
+
|
|
+ /* The id may have been given by the event,
|
|
+ * matching on a local address. And it may not
|
|
+ * have matched on one of the above sockets,
|
|
+ * because the client never created a subflow.
|
|
+ * So, we have to finally remove it here.
|
|
+ */
|
|
+ if (id >= 0) {
|
|
+ u8 loc_id = id
|
|
+ + (event->family == AF_INET ? 1 : MPTCP_MAX_ADDR);
|
|
+ announce_remove_addr(loc_id, meta_sk);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (event->code == MPTCP_EVENT_MOD) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ struct sock *sk = mptcp_to_sock(mptcp);
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ if (event->family == AF_INET &&
|
|
+ (sk->sk_family == AF_INET ||
|
|
+ mptcp_v6_is_v4_mapped(sk)) &&
|
|
+ inet_sk(sk)->inet_saddr == event->addr.in.s_addr) {
|
|
+ if (event->low_prio != tp->mptcp->low_prio) {
|
|
+ tp->mptcp->send_mp_prio = 1;
|
|
+ tp->mptcp->low_prio = event->low_prio;
|
|
+
|
|
+ tcp_send_ack(sk);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (event->family == AF_INET6 &&
|
|
+ sk->sk_family == AF_INET6 &&
|
|
+ !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6)) {
|
|
+ if (event->low_prio != tp->mptcp->low_prio) {
|
|
+ tp->mptcp->send_mp_prio = 1;
|
|
+ tp->mptcp->low_prio = event->low_prio;
|
|
+
|
|
+ tcp_send_ack(sk);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+next:
|
|
+ bh_unlock_sock(meta_sk);
|
|
+ sock_put(meta_sk);
|
|
+ }
|
|
+ rcu_read_unlock_bh();
|
|
+ }
|
|
+ goto next_event;
|
|
+}
|
|
+
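Both the DEL and the ADD/MOD branches above update fm_ns->local with the same RCU copy-update pattern: duplicate the table, edit the private copy, publish it with rcu_assign_pointer(), and let kfree_rcu() reclaim the old copy after a grace period. A condensed kernel-style sketch of just that pattern; the struct and function names below are invented for illustration and are not symbols from this patch:

/* Sketch of the RCU copy-update pattern only; "addr_table" and "clear_slot()"
 * are invented names, not symbols from this patch.
 */
#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/types.h>

struct addr_table {
	u8 loc4_bits;
	struct rcu_head rcu;
};

static struct addr_table __rcu *table;
static DEFINE_SPINLOCK(table_lock);

static int clear_slot(int id)
{
	struct addr_table *old, *new;

	spin_lock_bh(&table_lock);
	old = rcu_dereference_protected(table, lockdep_is_held(&table_lock));
	new = kmemdup(old, sizeof(*old), GFP_ATOMIC);
	if (!new) {
		spin_unlock_bh(&table_lock);
		return -ENOMEM;
	}
	new->loc4_bits &= ~(1 << id);	/* edit the private copy          */
	rcu_assign_pointer(table, new);	/* publish it atomically          */
	spin_unlock_bh(&table_lock);
	kfree_rcu(old, rcu);		/* reclaim after the grace period */
	return 0;
}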
|
|
+static struct mptcp_addr_event *lookup_similar_event(const struct net *net,
|
|
+ const struct mptcp_addr_event *event)
|
|
+{
|
|
+ struct mptcp_addr_event *eventq;
|
|
+ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
|
|
+
|
|
+ list_for_each_entry(eventq, &fm_ns->events, list) {
|
|
+ if (eventq->family != event->family)
|
|
+ continue;
|
|
+ if (eventq->if_idx != event->if_idx)
|
|
+ continue;
|
|
+ if (event->family == AF_INET) {
|
|
+ if (eventq->addr.in.s_addr == event->addr.in.s_addr)
|
|
+ return eventq;
|
|
+ } else {
|
|
+ if (ipv6_addr_equal(&eventq->addr.in6, &event->addr.in6))
|
|
+ return eventq;
|
|
+ }
|
|
+ }
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+/* We already hold the net-namespace MPTCP-lock */
|
|
+static void add_pm_event(struct net *net, const struct mptcp_addr_event *event)
|
|
+{
|
|
+ struct mptcp_addr_event *eventq = lookup_similar_event(net, event);
|
|
+ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
|
|
+
|
|
+ if (eventq) {
|
|
+ switch (event->code) {
|
|
+ case MPTCP_EVENT_DEL:
|
|
+ mptcp_debug("%s del old_code %u\n", __func__, eventq->code);
|
|
+ list_del(&eventq->list);
|
|
+ kfree(eventq);
|
|
+ break;
|
|
+ case MPTCP_EVENT_ADD:
|
|
+ mptcp_debug("%s add old_code %u\n", __func__, eventq->code);
|
|
+ eventq->low_prio = event->low_prio;
|
|
+ eventq->code = MPTCP_EVENT_ADD;
|
|
+ return;
|
|
+ case MPTCP_EVENT_MOD:
|
|
+ mptcp_debug("%s mod old_code %u\n", __func__, eventq->code);
|
|
+ eventq->low_prio = event->low_prio;
|
|
+ eventq->code = MPTCP_EVENT_MOD;
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* OK, we have to add the new address to the wait queue */
|
|
+ eventq = kmemdup(event, sizeof(struct mptcp_addr_event), GFP_ATOMIC);
|
|
+ if (!eventq)
|
|
+ return;
|
|
+
|
|
+ list_add_tail(&eventq->list, &fm_ns->events);
|
|
+
|
|
+ /* Queue the address worker if it is not already pending */
|
|
+ if (!delayed_work_pending(&fm_ns->address_worker))
|
|
+ queue_delayed_work(mptcp_wq, &fm_ns->address_worker,
|
|
+ msecs_to_jiffies(500));
|
|
+}
|
|
+
|
|
+static void addr4_event_handler(const struct in_ifaddr *ifa, unsigned long event,
|
|
+ struct net *net)
|
|
+{
|
|
+ const struct net_device *netdev = ifa->ifa_dev->dev;
|
|
+ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
|
|
+ struct mptcp_addr_event mpevent;
|
|
+
|
|
+ if (ifa->ifa_scope > RT_SCOPE_LINK ||
|
|
+ ipv4_is_loopback(ifa->ifa_local))
|
|
+ return;
|
|
+
|
|
+ spin_lock_bh(&fm_ns->local_lock);
|
|
+
|
|
+ mpevent.family = AF_INET;
|
|
+ mpevent.addr.in.s_addr = ifa->ifa_local;
|
|
+ mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0;
|
|
+ mpevent.if_idx = netdev->ifindex;
|
|
+
|
|
+ if (event == NETDEV_DOWN || !netif_running(netdev) ||
|
|
+ (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP))
|
|
+ mpevent.code = MPTCP_EVENT_DEL;
|
|
+ else if (event == NETDEV_UP)
|
|
+ mpevent.code = MPTCP_EVENT_ADD;
|
|
+ else if (event == NETDEV_CHANGE)
|
|
+ mpevent.code = MPTCP_EVENT_MOD;
|
|
+
|
|
+ mptcp_debug("%s created event for %pI4, code %u prio %u idx %u\n", __func__,
|
|
+ &ifa->ifa_local, mpevent.code, mpevent.low_prio, mpevent.if_idx);
|
|
+ add_pm_event(net, &mpevent);
|
|
+
|
|
+ spin_unlock_bh(&fm_ns->local_lock);
|
|
+ return;
|
|
+}
|
|
+
|
|
+/* React on IPv4-addr add/rem-events */
|
|
+static int mptcp_pm_inetaddr_event(struct notifier_block *this,
|
|
+ unsigned long event, void *ptr)
|
|
+{
|
|
+ const struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
|
|
+ struct net *net = dev_net(ifa->ifa_dev->dev);
|
|
+
|
|
+ if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
|
|
+ event == NETDEV_CHANGE))
|
|
+ return NOTIFY_DONE;
|
|
+
|
|
+ addr4_event_handler(ifa, event, net);
|
|
+
|
|
+ return NOTIFY_DONE;
|
|
+}
|
|
+
|
|
+static struct notifier_block mptcp_pm_inetaddr_notifier = {
|
|
+ .notifier_call = mptcp_pm_inetaddr_event,
|
|
+};
|
|
+
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+
|
|
+static int inet6_addr_event(struct notifier_block *this, unsigned long event,
|
|
+ void *ptr);
|
|
+
|
|
+static void addr6_event_handler(const struct inet6_ifaddr *ifa, unsigned long event,
|
|
+ struct net *net)
|
|
+{
|
|
+ const struct net_device *netdev = ifa->idev->dev;
|
|
+ int addr_type = ipv6_addr_type(&ifa->addr);
|
|
+ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
|
|
+ struct mptcp_addr_event mpevent;
|
|
+
|
|
+ if (ifa->scope > RT_SCOPE_LINK ||
|
|
+ addr_type == IPV6_ADDR_ANY ||
|
|
+ (addr_type & IPV6_ADDR_LOOPBACK) ||
|
|
+ (addr_type & IPV6_ADDR_LINKLOCAL))
|
|
+ return;
|
|
+
|
|
+ spin_lock_bh(&fm_ns->local_lock);
|
|
+
|
|
+ mpevent.family = AF_INET6;
|
|
+ mpevent.addr.in6 = ifa->addr;
|
|
+ mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0;
|
|
+ mpevent.if_idx = netdev->ifindex;
|
|
+
|
|
+ if (event == NETDEV_DOWN || !netif_running(netdev) ||
|
|
+ (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP))
|
|
+ mpevent.code = MPTCP_EVENT_DEL;
|
|
+ else if (event == NETDEV_UP)
|
|
+ mpevent.code = MPTCP_EVENT_ADD;
|
|
+ else if (event == NETDEV_CHANGE)
|
|
+ mpevent.code = MPTCP_EVENT_MOD;
|
|
+
|
|
+ mptcp_debug("%s created event for %pI6, code %u prio %u idx %u\n", __func__,
|
|
+ &ifa->addr, mpevent.code, mpevent.low_prio, mpevent.if_idx);
|
|
+ add_pm_event(net, &mpevent);
|
|
+
|
|
+ spin_unlock_bh(&fm_ns->local_lock);
|
|
+ return;
|
|
+}
|
|
+
|
|
+/* React on IPv6-addr add/rem-events */
|
|
+static int inet6_addr_event(struct notifier_block *this, unsigned long event,
|
|
+ void *ptr)
|
|
+{
|
|
+ struct inet6_ifaddr *ifa6 = (struct inet6_ifaddr *)ptr;
|
|
+ struct net *net = dev_net(ifa6->idev->dev);
|
|
+
|
|
+ if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
|
|
+ event == NETDEV_CHANGE))
|
|
+ return NOTIFY_DONE;
|
|
+
|
|
+ addr6_event_handler(ifa6, event, net);
|
|
+
|
|
+ return NOTIFY_DONE;
|
|
+}
|
|
+
|
|
+static struct notifier_block inet6_addr_notifier = {
|
|
+ .notifier_call = inet6_addr_event,
|
|
+};
|
|
+
|
|
+#endif
|
|
+
|
|
+/* React on ifup/down-events */
|
|
+static int netdev_event(struct notifier_block *this, unsigned long event,
|
|
+ void *ptr)
|
|
+{
|
|
+ const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
|
|
+ struct in_device *in_dev;
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ struct inet6_dev *in6_dev;
|
|
+#endif
|
|
+
|
|
+ if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
|
|
+ event == NETDEV_CHANGE))
|
|
+ return NOTIFY_DONE;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ in_dev = __in_dev_get_rtnl(dev);
|
|
+
|
|
+ if (in_dev) {
|
|
+ struct in_ifaddr *ifa;
|
|
+
|
|
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
|
|
+ mptcp_pm_inetaddr_event(NULL, event, ifa);
|
|
+ }
|
|
+ }
|
|
+
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ in6_dev = __in6_dev_get(dev);
|
|
+
|
|
+ if (in6_dev) {
|
|
+ struct inet6_ifaddr *ifa6;
|
|
+ list_for_each_entry(ifa6, &in6_dev->addr_list, if_list)
|
|
+ inet6_addr_event(NULL, event, ifa6);
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ rcu_read_unlock();
|
|
+ return NOTIFY_DONE;
|
|
+}
|
|
+
|
|
+static struct notifier_block mptcp_pm_netdev_notifier = {
|
|
+ .notifier_call = netdev_event,
|
|
+};
|
|
+
|
|
+static void full_mesh_add_raddr(struct mptcp_cb *mpcb,
|
|
+ const union inet_addr *addr,
|
|
+ sa_family_t family, __be16 port, u8 id)
|
|
+{
|
|
+ if (family == AF_INET)
|
|
+ mptcp_addv4_raddr(mpcb, &addr->in, port, id);
|
|
+ else
|
|
+ mptcp_addv6_raddr(mpcb, &addr->in6, port, id);
|
|
+}
|
|
+
|
|
+static void full_mesh_new_session(const struct sock *meta_sk)
|
|
+{
|
|
+ struct mptcp_loc_addr *mptcp_local;
|
|
+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
|
|
+ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
|
|
+ struct tcp_sock *master_tp = tcp_sk(mpcb->master_sk);
|
|
+ int i, index, if_idx = 0;
|
|
+ union inet_addr saddr, daddr;
|
|
+ sa_family_t family = AF_INET;
|
|
+ bool meta_v4 = meta_sk->sk_family == AF_INET;
|
|
+
|
|
+ /* Init local variables necessary for the rest */
|
|
+ if (meta_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(meta_sk)) {
|
|
+ saddr.ip = inet_sk(meta_sk)->inet_saddr;
|
|
+ daddr.ip = inet_sk(meta_sk)->inet_daddr;
|
|
+ if_idx = mpcb->master_sk->sk_bound_dev_if;
|
|
+ family = AF_INET;
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ } else {
|
|
+ saddr.in6 = inet6_sk(meta_sk)->saddr;
|
|
+ daddr.in6 = meta_sk->sk_v6_daddr;
|
|
+ if_idx = mpcb->master_sk->sk_bound_dev_if;
|
|
+ family = AF_INET6;
|
|
+#endif
|
|
+ }
|
|
+
|
|
+ if (inet_sk(meta_sk)->transparent)
|
|
+ if_idx = inet_sk(meta_sk)->rx_dst_ifindex;
|
|
+
|
|
+ rcu_read_lock_bh();
|
|
+ mptcp_local = rcu_dereference(fm_ns->local);
|
|
+
|
|
+ if (inet_sk(meta_sk)->transparent)
|
|
+ index = mptcp_find_address_transp(mptcp_local, family, if_idx);
|
|
+ else
|
|
+ index = mptcp_find_address(mptcp_local, family, &saddr, if_idx);
|
|
+ if (index < 0)
|
|
+ goto fallback;
|
|
+
|
|
+ if (family == AF_INET)
|
|
+ master_tp->mptcp->low_prio = mptcp_local->locaddr4[index].low_prio;
|
|
+ else
|
|
+ master_tp->mptcp->low_prio = mptcp_local->locaddr6[index].low_prio;
|
|
+ master_tp->mptcp->send_mp_prio = master_tp->mptcp->low_prio;
|
|
+
|
|
+ full_mesh_add_raddr(mpcb, &daddr, family, 0, 0);
|
|
+ mptcp_set_init_addr_bit(mpcb, &daddr, family, index);
|
|
+
|
|
+ /* Initialize workqueue-struct */
|
|
+ INIT_WORK(&fmp->subflow_work, create_subflow_worker);
|
|
+ INIT_DELAYED_WORK(&fmp->subflow_retry_work, retry_subflow_worker);
|
|
+ fmp->mpcb = mpcb;
|
|
+
|
|
+ if (!meta_v4 && meta_sk->sk_ipv6only)
|
|
+ goto skip_ipv4;
|
|
+
|
|
+ /* Look for the address among the local addresses */
|
|
+ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
|
|
+ __be32 ifa_address = mptcp_local->locaddr4[i].addr.s_addr;
|
|
+
|
|
+ /* We do not need to announce the initial subflow's address again */
|
|
+ if (family == AF_INET &&
|
|
+ (!if_idx || mptcp_local->locaddr4[i].if_idx == if_idx) &&
|
|
+ saddr.ip == ifa_address)
|
|
+ continue;
|
|
+
|
|
+ fmp->add_addr++;
|
|
+ mpcb->addr_signal = 1;
|
|
+ }
|
|
+
|
|
+skip_ipv4:
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ /* skip IPv6 addresses if meta-socket is IPv4 */
|
|
+ if (meta_v4)
|
|
+ goto skip_ipv6;
|
|
+
|
|
+ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
|
|
+ const struct in6_addr *ifa6 = &mptcp_local->locaddr6[i].addr;
|
|
+
|
|
+ /* We do not need to announce the initial subflow's address again */
|
|
+ if (family == AF_INET6 &&
|
|
+ (!if_idx || mptcp_local->locaddr6[i].if_idx == if_idx) &&
|
|
+ ipv6_addr_equal(&saddr.in6, ifa6))
|
|
+ continue;
|
|
+
|
|
+ fmp->add_addr++;
|
|
+ mpcb->addr_signal = 1;
|
|
+ }
|
|
+
|
|
+skip_ipv6:
|
|
+#endif
|
|
+
|
|
+ rcu_read_unlock_bh();
|
|
+
|
|
+ if (family == AF_INET)
|
|
+ fmp->announced_addrs_v4 |= (1 << index);
|
|
+ else
|
|
+ fmp->announced_addrs_v6 |= (1 << index);
|
|
+
|
|
+ for (i = fmp->add_addr; i && fmp->add_addr; i--)
|
|
+ tcp_send_ack(mpcb->master_sk);
|
|
+
|
|
+ if (master_tp->mptcp->send_mp_prio)
|
|
+ tcp_send_ack(mpcb->master_sk);
|
|
+
|
|
+ return;
|
|
+
|
|
+fallback:
|
|
+ rcu_read_unlock_bh();
|
|
+ mptcp_fallback_default(mpcb);
|
|
+ return;
|
|
+}
|
|
+
|
|
+static void full_mesh_create_subflows(struct sock *meta_sk)
|
|
+{
|
|
+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
|
|
+
|
|
+ if (mptcp_in_infinite_mapping_weak(mpcb) ||
|
|
+ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
|
|
+ return;
|
|
+
|
|
+ if (mpcb->master_sk &&
|
|
+ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
|
|
+ return;
|
|
+
|
|
+ if (!work_pending(&fmp->subflow_work)) {
|
|
+ sock_hold(meta_sk);
|
|
+ refcount_inc(&mpcb->mpcb_refcnt);
|
|
+ queue_work(mptcp_wq, &fmp->subflow_work);
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Called upon release_sock, if the socket was owned by the user during
|
|
+ * a path-management event.
|
|
+ */
|
|
+static void full_mesh_release_sock(struct sock *meta_sk)
|
|
+{
|
|
+ struct mptcp_loc_addr *mptcp_local;
|
|
+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
|
|
+ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
|
|
+ bool meta_v4 = meta_sk->sk_family == AF_INET;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct hlist_node *tmp;
|
|
+ int i;
|
|
+
|
|
+ rcu_read_lock_bh();
|
|
+ mptcp_local = rcu_dereference(fm_ns->local);
|
|
+
|
|
+ if (!meta_v4 && meta_sk->sk_ipv6only)
|
|
+ goto skip_ipv4;
|
|
+
|
|
+ /* First, detect modifications or additions */
|
|
+ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
|
|
+ struct in_addr ifa = mptcp_local->locaddr4[i].addr;
|
|
+ bool found = false;
|
|
+
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ struct sock *sk = mptcp_to_sock(mptcp);
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+
|
|
+ if (sk->sk_family == AF_INET6 &&
|
|
+ !mptcp_v6_is_v4_mapped(sk))
|
|
+ continue;
|
|
+
|
|
+ if (inet_sk(sk)->inet_saddr != ifa.s_addr)
|
|
+ continue;
|
|
+
|
|
+ found = true;
|
|
+
|
|
+ if (mptcp_local->locaddr4[i].low_prio != tp->mptcp->low_prio) {
|
|
+ tp->mptcp->send_mp_prio = 1;
|
|
+ tp->mptcp->low_prio = mptcp_local->locaddr4[i].low_prio;
|
|
+
|
|
+ tcp_send_ack(sk);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (!found) {
|
|
+ struct sock *sk;
|
|
+
|
|
+ fmp->add_addr++;
|
|
+ mpcb->addr_signal = 1;
|
|
+
|
|
+ sk = mptcp_select_ack_sock(meta_sk);
|
|
+ if (sk)
|
|
+ tcp_send_ack(sk);
|
|
+ full_mesh_create_subflows(meta_sk);
|
|
+ }
|
|
+ }
|
|
+
|
|
+skip_ipv4:
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ /* skip IPv6 addresses if meta-socket is IPv4 */
|
|
+ if (meta_v4)
|
|
+ goto removal;
|
|
+
|
|
+ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
|
|
+ struct in6_addr ifa = mptcp_local->locaddr6[i].addr;
|
|
+ bool found = false;
|
|
+
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ struct sock *sk = mptcp_to_sock(mptcp);
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+
|
|
+ if (sk->sk_family == AF_INET ||
|
|
+ mptcp_v6_is_v4_mapped(sk))
|
|
+ continue;
|
|
+
|
|
+ if (!ipv6_addr_equal(&inet6_sk(sk)->saddr, &ifa))
|
|
+ continue;
|
|
+
|
|
+ found = true;
|
|
+
|
|
+ if (mptcp_local->locaddr6[i].low_prio != tp->mptcp->low_prio) {
|
|
+ tp->mptcp->send_mp_prio = 1;
|
|
+ tp->mptcp->low_prio = mptcp_local->locaddr6[i].low_prio;
|
|
+
|
|
+ tcp_send_ack(sk);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (!found) {
|
|
+ struct sock *sk;
|
|
+
|
|
+ fmp->add_addr++;
|
|
+ mpcb->addr_signal = 1;
|
|
+
|
|
+ sk = mptcp_select_ack_sock(meta_sk);
|
|
+ if (sk)
|
|
+ tcp_send_ack(sk);
|
|
+ full_mesh_create_subflows(meta_sk);
|
|
+ }
|
|
+ }
|
|
+
|
|
+removal:
|
|
+#endif
|
|
+
|
|
+ /* Now, detect address-removals */
|
|
+ mptcp_for_each_sub_safe(mpcb, mptcp, tmp) {
|
|
+ struct sock *sk = mptcp_to_sock(mptcp);
|
|
+ bool shall_remove = true;
|
|
+
|
|
+ if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) {
|
|
+ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
|
|
+ if (inet_sk(sk)->inet_saddr == mptcp_local->locaddr4[i].addr.s_addr) {
|
|
+ shall_remove = false;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ } else {
|
|
+ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
|
|
+ if (ipv6_addr_equal(&inet6_sk(sk)->saddr, &mptcp_local->locaddr6[i].addr)) {
|
|
+ shall_remove = false;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (shall_remove) {
|
|
+ /* Reinject, so that pf = 1 and so we
|
|
+ * won't select this one as the
|
|
+ * ack-sock.
|
|
+ */
|
|
+ mptcp_reinject_data(sk, 0);
|
|
+
|
|
+ announce_remove_addr(tcp_sk(sk)->mptcp->loc_id,
|
|
+ meta_sk);
|
|
+
|
|
+ mptcp_sub_force_close(sk);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Just call it optimistically. It actually cannot do any harm */
|
|
+ update_addr_bitfields(meta_sk, mptcp_local);
|
|
+
|
|
+ rcu_read_unlock_bh();
|
|
+}
|
|
+
|
|
+static int full_mesh_get_local_id(const struct sock *meta_sk,
|
|
+ sa_family_t family, union inet_addr *addr,
|
|
+ bool *low_prio)
|
|
+{
|
|
+ struct mptcp_loc_addr *mptcp_local;
|
|
+ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
|
|
+ int index, id = -1;
|
|
+
|
|
+ /* Handle the backup-flows */
|
|
+ rcu_read_lock_bh();
|
|
+ mptcp_local = rcu_dereference(fm_ns->local);
|
|
+
|
|
+ index = mptcp_find_address(mptcp_local, family, addr, 0);
|
|
+
|
|
+ if (index != -1) {
|
|
+ if (family == AF_INET) {
|
|
+ id = mptcp_local->locaddr4[index].loc4_id;
|
|
+ *low_prio = mptcp_local->locaddr4[index].low_prio;
|
|
+ } else {
|
|
+ id = mptcp_local->locaddr6[index].loc6_id;
|
|
+ *low_prio = mptcp_local->locaddr6[index].low_prio;
|
|
+ }
|
|
+ }
|
|
+
|
|
+
|
|
+ rcu_read_unlock_bh();
|
|
+
|
|
+ return id;
|
|
+}
|
|
+
|
|
+static void full_mesh_addr_signal(struct sock *sk, unsigned *size,
|
|
+ struct tcp_out_options *opts,
|
|
+ struct sk_buff *skb)
|
|
+{
|
|
+ const struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct mptcp_cb *mpcb = tp->mpcb;
|
|
+ struct sock *meta_sk = mpcb->meta_sk;
|
|
+ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
|
|
+ struct mptcp_loc_addr *mptcp_local;
|
|
+ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(sk));
|
|
+ int remove_addr_len;
|
|
+ u8 unannouncedv4 = 0, unannouncedv6 = 0;
|
|
+ bool meta_v4 = meta_sk->sk_family == AF_INET;
|
|
+
|
|
+ mpcb->addr_signal = 0;
|
|
+
|
|
+ if (likely(!fmp->add_addr))
|
|
+ goto remove_addr;
|
|
+
|
|
+ rcu_read_lock_bh();
|
|
+ mptcp_local = rcu_dereference(fm_ns->local);
|
|
+
|
|
+ if (!meta_v4 && meta_sk->sk_ipv6only)
|
|
+ goto skip_ipv4;
|
|
+
|
|
+ /* IPv4 */
|
|
+ unannouncedv4 = (~fmp->announced_addrs_v4) & mptcp_local->loc4_bits;
|
|
+ if (unannouncedv4 &&
|
|
+ ((mpcb->mptcp_ver == MPTCP_VERSION_0 &&
|
|
+ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) ||
|
|
+ (mpcb->mptcp_ver >= MPTCP_VERSION_1 &&
|
|
+ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN_VER1))) {
|
|
+ int ind = mptcp_find_free_index(~unannouncedv4);
|
|
+
|
|
+ opts->options |= OPTION_MPTCP;
|
|
+ opts->mptcp_options |= OPTION_ADD_ADDR;
|
|
+ opts->add_addr4.addr_id = mptcp_local->locaddr4[ind].loc4_id;
|
|
+ opts->add_addr4.addr = mptcp_local->locaddr4[ind].addr;
|
|
+ opts->add_addr_v4 = 1;
|
|
+ if (mpcb->mptcp_ver >= MPTCP_VERSION_1) {
|
|
+ u8 mptcp_hash_mac[SHA256_DIGEST_SIZE];
|
|
+
|
|
+ mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_loc_key,
|
|
+ (u8 *)&mpcb->mptcp_rem_key, mptcp_hash_mac, 2,
|
|
+ 1, (u8 *)&mptcp_local->locaddr4[ind].loc4_id,
|
|
+ 4, (u8 *)&opts->add_addr4.addr.s_addr);
|
|
+ opts->add_addr4.trunc_mac = *(u64 *)&mptcp_hash_mac[SHA256_DIGEST_SIZE - sizeof(u64)];
|
|
+ }
|
|
+
|
|
+ if (skb) {
|
|
+ fmp->announced_addrs_v4 |= (1 << ind);
|
|
+ fmp->add_addr--;
|
|
+ }
|
|
+
|
|
+ if (mpcb->mptcp_ver < MPTCP_VERSION_1)
|
|
+ *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN;
|
|
+ if (mpcb->mptcp_ver >= MPTCP_VERSION_1)
|
|
+ *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN_VER1;
|
|
+
|
|
+ goto skip_ipv6;
|
|
+ }
|
|
+
|
|
+ if (meta_v4)
|
|
+ goto skip_ipv6;
|
|
+skip_ipv4:
|
|
+ /* IPv6 */
|
|
+ unannouncedv6 = (~fmp->announced_addrs_v6) & mptcp_local->loc6_bits;
|
|
+ if (unannouncedv6 &&
|
|
+ ((mpcb->mptcp_ver == MPTCP_VERSION_0 &&
|
|
+ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) ||
|
|
+ (mpcb->mptcp_ver >= MPTCP_VERSION_1 &&
|
|
+ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN_VER1))) {
|
|
+ int ind = mptcp_find_free_index(~unannouncedv6);
|
|
+
|
|
+ opts->options |= OPTION_MPTCP;
|
|
+ opts->mptcp_options |= OPTION_ADD_ADDR;
|
|
+ opts->add_addr6.addr_id = mptcp_local->locaddr6[ind].loc6_id;
|
|
+ opts->add_addr6.addr = mptcp_local->locaddr6[ind].addr;
|
|
+ opts->add_addr_v6 = 1;
|
|
+ if (mpcb->mptcp_ver >= MPTCP_VERSION_1) {
|
|
+ u8 mptcp_hash_mac[SHA256_DIGEST_SIZE];
|
|
+
|
|
+ mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_loc_key,
|
|
+ (u8 *)&mpcb->mptcp_rem_key, mptcp_hash_mac, 2,
|
|
+ 1, (u8 *)&mptcp_local->locaddr6[ind].loc6_id,
|
|
+ 16, (u8 *)&opts->add_addr6.addr.s6_addr);
|
|
+ opts->add_addr6.trunc_mac = *(u64 *)&mptcp_hash_mac[SHA256_DIGEST_SIZE - sizeof(u64)];
|
|
+ }
|
|
+
|
|
+ if (skb) {
|
|
+ fmp->announced_addrs_v6 |= (1 << ind);
|
|
+ fmp->add_addr--;
|
|
+ }
|
|
+ if (mpcb->mptcp_ver < MPTCP_VERSION_1)
|
|
+ *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN;
|
|
+ if (mpcb->mptcp_ver >= MPTCP_VERSION_1)
|
|
+ *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN_VER1;
|
|
+ }
|
|
+
|
|
+skip_ipv6:
|
|
+ rcu_read_unlock_bh();
|
|
+
|
|
+ if (!unannouncedv4 && !unannouncedv6 && skb)
|
|
+ fmp->add_addr--;
|
|
+
|
|
+remove_addr:
|
|
+ if (likely(!fmp->remove_addrs))
|
|
+ goto exit;
|
|
+
|
|
+ remove_addr_len = mptcp_sub_len_remove_addr_align(fmp->remove_addrs);
|
|
+ if (MAX_TCP_OPTION_SPACE - *size < remove_addr_len)
|
|
+ goto exit;
|
|
+
|
|
+ opts->options |= OPTION_MPTCP;
|
|
+ opts->mptcp_options |= OPTION_REMOVE_ADDR;
|
|
+ opts->remove_addrs = fmp->remove_addrs;
|
|
+ *size += remove_addr_len;
|
|
+ if (skb)
|
|
+ fmp->remove_addrs = 0;
|
|
+
|
|
+exit:
|
|
+ mpcb->addr_signal = !!(fmp->add_addr || fmp->remove_addrs);
|
|
+}
|
|
+
|
|
+static void full_mesh_rem_raddr(struct mptcp_cb *mpcb, u8 rem_id)
|
|
+{
|
|
+ mptcp_v4_rem_raddress(mpcb, rem_id);
|
|
+ mptcp_v6_rem_raddress(mpcb, rem_id);
|
|
+}
|
|
+
|
|
+static void full_mesh_delete_subflow(struct sock *sk)
|
|
+{
|
|
+ struct fullmesh_priv *fmp = fullmesh_get_priv(tcp_sk(sk)->mpcb);
|
|
+ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(sk));
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+ struct mptcp_loc_addr *mptcp_local;
|
|
+ int index, i;
|
|
+
|
|
+ if (!create_on_err)
|
|
+ return;
|
|
+
|
|
+ if (!mptcp_can_new_subflow(meta_sk))
|
|
+ return;
|
|
+
|
|
+ rcu_read_lock_bh();
|
|
+ mptcp_local = rcu_dereference_bh(fm_ns->local);
|
|
+
|
|
+ if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) {
|
|
+ union inet_addr saddr;
|
|
+
|
|
+ saddr.ip = inet_sk(sk)->inet_saddr;
|
|
+ index = mptcp_find_address(mptcp_local, AF_INET, &saddr,
|
|
+ sk->sk_bound_dev_if);
|
|
+ if (index < 0)
|
|
+ goto out;
|
|
+
|
|
+ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
|
|
+ struct fullmesh_rem4 *rem4 = &fmp->remaddr4[i];
|
|
+
|
|
+ if (rem4->addr.s_addr != sk->sk_daddr)
|
|
+ continue;
|
|
+
|
|
+ if (rem4->port && rem4->port != inet_sk(sk)->inet_dport)
|
|
+ continue;
|
|
+
|
|
+ rem4->bitfield &= ~(1 << index);
|
|
+ }
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ } else {
|
|
+ union inet_addr saddr;
|
|
+
|
|
+ saddr.in6 = inet6_sk(sk)->saddr;
|
|
+ index = mptcp_find_address(mptcp_local, AF_INET6, &saddr,
|
|
+ sk->sk_bound_dev_if);
|
|
+ if (index < 0)
|
|
+ goto out;
|
|
+
|
|
+ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
|
|
+ struct fullmesh_rem6 *rem6 = &fmp->remaddr6[i];
|
|
+
|
|
+ if (!ipv6_addr_equal(&rem6->addr, &sk->sk_v6_daddr))
|
|
+ continue;
|
|
+
|
|
+ if (rem6->port && rem6->port != inet_sk(sk)->inet_dport)
|
|
+ continue;
|
|
+
|
|
+ rem6->bitfield &= ~(1 << index);
|
|
+ }
|
|
+#endif
|
|
+ }
|
|
+
|
|
+out:
|
|
+ rcu_read_unlock_bh();
|
|
+
|
|
+ /* re-schedule the creation of failed subflows */
|
|
+ if (tcp_sk(sk)->mptcp->sk_err == ETIMEDOUT || sk->sk_err == ETIMEDOUT)
|
|
+ full_mesh_create_subflows(meta_sk);
|
|
+}
|
|
+
|
|
+/* Output /proc/net/mptcp_fullmesh */
|
|
+static int mptcp_fm_seq_show(struct seq_file *seq, void *v)
|
|
+{
|
|
+ const struct net *net = seq->private;
|
|
+ struct mptcp_loc_addr *mptcp_local;
|
|
+ const struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
|
|
+ int i;
|
|
+
|
|
+ seq_printf(seq, "Index, Address-ID, Backup, IP-address, if-idx\n");
|
|
+
|
|
+ rcu_read_lock_bh();
|
|
+ mptcp_local = rcu_dereference(fm_ns->local);
|
|
+
|
|
+ seq_printf(seq, "IPv4, next v4-index: %u\n", mptcp_local->next_v4_index);
|
|
+
|
|
+ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
|
|
+ struct mptcp_loc4 *loc4 = &mptcp_local->locaddr4[i];
|
|
+
|
|
+ seq_printf(seq, "%u, %u, %u, %pI4, %u\n", i, loc4->loc4_id,
|
|
+ loc4->low_prio, &loc4->addr, loc4->if_idx);
|
|
+ }
|
|
+
|
|
+ seq_printf(seq, "IPv6, next v6-index: %u\n", mptcp_local->next_v6_index);
|
|
+
|
|
+ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
|
|
+ struct mptcp_loc6 *loc6 = &mptcp_local->locaddr6[i];
|
|
+
|
|
+ seq_printf(seq, "%u, %u, %u, %pI6, %u\n", i, loc6->loc6_id,
|
|
+ loc6->low_prio, &loc6->addr, loc6->if_idx);
|
|
+ }
|
|
+ rcu_read_unlock_bh();
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
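Given the seq_printf() calls above, the resulting /proc/net/mptcp_fullmesh output has roughly the following shape (all addresses, IDs and interface indexes below are invented):

Index, Address-ID, Backup, IP-address, if-idx
IPv4, next v4-index: 2
0, 1, 0, 10.1.2.3, 4
1, 2, 1, 192.0.2.7, 5
IPv6, next v6-index: 1
0, 8, 0, 2001:0db8:0000:0000:0000:0000:0000:0001, 4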
|
|
+static int mptcp_fm_init_net(struct net *net)
|
|
+{
|
|
+ struct mptcp_loc_addr *mptcp_local;
|
|
+ struct mptcp_fm_ns *fm_ns;
|
|
+ int err = 0;
|
|
+
|
|
+ fm_ns = kzalloc(sizeof(*fm_ns), GFP_KERNEL);
|
|
+ if (!fm_ns)
|
|
+ return -ENOBUFS;
|
|
+
|
|
+ mptcp_local = kzalloc(sizeof(*mptcp_local), GFP_KERNEL);
|
|
+ if (!mptcp_local) {
|
|
+ err = -ENOBUFS;
|
|
+ goto err_mptcp_local;
|
|
+ }
|
|
+
|
|
+ if (!proc_create_net_single("mptcp_fullmesh", S_IRUGO, net->proc_net,
|
|
+ mptcp_fm_seq_show, NULL)) {
|
|
+ err = -ENOMEM;
|
|
+ goto err_seq_fops;
|
|
+ }
|
|
+
|
|
+ mptcp_local->next_v4_index = 1;
|
|
+
|
|
+ rcu_assign_pointer(fm_ns->local, mptcp_local);
|
|
+ INIT_DELAYED_WORK(&fm_ns->address_worker, mptcp_address_worker);
|
|
+ INIT_LIST_HEAD(&fm_ns->events);
|
|
+ spin_lock_init(&fm_ns->local_lock);
|
|
+ fm_ns->net = net;
|
|
+ net->mptcp.path_managers[MPTCP_PM_FULLMESH] = fm_ns;
|
|
+
|
|
+ return 0;
|
|
+err_seq_fops:
|
|
+ kfree(mptcp_local);
|
|
+err_mptcp_local:
|
|
+ kfree(fm_ns);
|
|
+ return err;
|
|
+}
|
|
+
|
|
+static void mptcp_fm_exit_net(struct net *net)
|
|
+{
|
|
+ struct mptcp_addr_event *eventq, *tmp;
|
|
+ struct mptcp_fm_ns *fm_ns;
|
|
+ struct mptcp_loc_addr *mptcp_local;
|
|
+
|
|
+ fm_ns = fm_get_ns(net);
|
|
+ cancel_delayed_work_sync(&fm_ns->address_worker);
|
|
+
|
|
+ rcu_read_lock_bh();
|
|
+
|
|
+ mptcp_local = rcu_dereference_bh(fm_ns->local);
|
|
+ kfree_rcu(mptcp_local, rcu);
|
|
+
|
|
+ spin_lock(&fm_ns->local_lock);
|
|
+ list_for_each_entry_safe(eventq, tmp, &fm_ns->events, list) {
|
|
+ list_del(&eventq->list);
|
|
+ kfree(eventq);
|
|
+ }
|
|
+ spin_unlock(&fm_ns->local_lock);
|
|
+
|
|
+ rcu_read_unlock_bh();
|
|
+
|
|
+ remove_proc_entry("mptcp_fullmesh", net->proc_net);
|
|
+
|
|
+ kfree(fm_ns);
|
|
+}
|
|
+
|
|
+static struct pernet_operations full_mesh_net_ops = {
|
|
+ .init = mptcp_fm_init_net,
|
|
+ .exit = mptcp_fm_exit_net,
|
|
+};
|
|
+
|
|
+static struct mptcp_pm_ops full_mesh __read_mostly = {
|
|
+ .new_session = full_mesh_new_session,
|
|
+ .release_sock = full_mesh_release_sock,
|
|
+ .fully_established = full_mesh_create_subflows,
|
|
+ .new_remote_address = full_mesh_create_subflows,
|
|
+ .get_local_id = full_mesh_get_local_id,
|
|
+ .addr_signal = full_mesh_addr_signal,
|
|
+ .add_raddr = full_mesh_add_raddr,
|
|
+ .rem_raddr = full_mesh_rem_raddr,
|
|
+ .delete_subflow = full_mesh_delete_subflow,
|
|
+ .name = "fullmesh",
|
|
+ .owner = THIS_MODULE,
|
|
+};
|
|
+
|
|
+/* General initialization of MPTCP_PM */
|
|
+static int __init full_mesh_register(void)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ BUILD_BUG_ON(sizeof(struct fullmesh_priv) > MPTCP_PM_SIZE);
|
|
+
|
|
+ ret = register_pernet_subsys(&full_mesh_net_ops);
|
|
+ if (ret)
|
|
+ goto out;
|
|
+
|
|
+ ret = register_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
|
|
+ if (ret)
|
|
+ goto err_reg_inetaddr;
|
|
+ ret = register_netdevice_notifier(&mptcp_pm_netdev_notifier);
|
|
+ if (ret)
|
|
+ goto err_reg_netdev;
|
|
+
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ ret = register_inet6addr_notifier(&inet6_addr_notifier);
|
|
+ if (ret)
|
|
+ goto err_reg_inet6addr;
|
|
+#endif
|
|
+
|
|
+ ret = mptcp_register_path_manager(&full_mesh);
|
|
+ if (ret)
|
|
+ goto err_reg_pm;
|
|
+
|
|
+out:
|
|
+ return ret;
|
|
+
|
|
+
|
|
+err_reg_pm:
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ unregister_inet6addr_notifier(&inet6_addr_notifier);
|
|
+err_reg_inet6addr:
|
|
+#endif
|
|
+ unregister_netdevice_notifier(&mptcp_pm_netdev_notifier);
|
|
+err_reg_netdev:
|
|
+ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
|
|
+err_reg_inetaddr:
|
|
+ unregister_pernet_subsys(&full_mesh_net_ops);
|
|
+ goto out;
|
|
+}
|
|
+
|
|
+static void full_mesh_unregister(void)
|
|
+{
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ unregister_inet6addr_notifier(&inet6_addr_notifier);
|
|
+#endif
|
|
+ unregister_netdevice_notifier(&mptcp_pm_netdev_notifier);
|
|
+ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
|
|
+ unregister_pernet_subsys(&full_mesh_net_ops);
|
|
+ mptcp_unregister_path_manager(&full_mesh);
|
|
+}
|
|
+
|
|
+module_init(full_mesh_register);
|
|
+module_exit(full_mesh_unregister);
|
|
+
|
|
+MODULE_AUTHOR("Christoph Paasch");
|
|
+MODULE_LICENSE("GPL");
|
|
+MODULE_DESCRIPTION("Full-Mesh MPTCP");
|
|
+MODULE_VERSION("0.88");
|
|
diff --git a/net/mptcp/mptcp_input.c b/net/mptcp/mptcp_input.c
|
|
new file mode 100644
|
|
index 000000000000..ae9cc7209613
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mptcp_input.c
|
|
@@ -0,0 +1,2546 @@
|
|
+/*
|
|
+ * MPTCP implementation - Receiving side
|
|
+ *
|
|
+ * Initial Design & Implementation:
|
|
+ * Sébastien Barré <sebastien.barre@uclouvain.be>
|
|
+ *
|
|
+ * Current Maintainer & Author:
|
|
+ * Christoph Paasch <christoph.paasch@uclouvain.be>
|
|
+ *
|
|
+ * Additional authors:
|
|
+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
|
|
+ * Gregory Detal <gregory.detal@uclouvain.be>
|
|
+ * Fabien Duchêne <fabien.duchene@uclouvain.be>
|
|
+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
|
|
+ * Lavkesh Lahngir <lavkesh51@gmail.com>
|
|
+ * Andreas Ripke <ripke@neclab.eu>
|
|
+ * Vlad Dogaru <vlad.dogaru@intel.com>
|
|
+ * Octavian Purdila <octavian.purdila@intel.com>
|
|
+ * John Ronan <jronan@tssg.org>
|
|
+ * Catalin Nicutar <catalin.nicutar@gmail.com>
|
|
+ * Brandon Heller <brandonh@stanford.edu>
|
|
+ *
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU General Public License
|
|
+ * as published by the Free Software Foundation; either version
|
|
+ * 2 of the License, or (at your option) any later version.
|
|
+ */
|
|
+
|
|
+#include <asm/unaligned.h>
|
|
+
|
|
+#include <net/mptcp.h>
|
|
+#include <net/mptcp_v4.h>
|
|
+#include <net/mptcp_v6.h>
|
|
+
|
|
+#include <linux/kconfig.h>
|
|
+
|
|
+/* is seq1 < seq2 ? */
|
|
+static inline bool before64(const u64 seq1, const u64 seq2)
|
|
+{
|
|
+ return (s64)(seq1 - seq2) < 0;
|
|
+}
|
|
+
|
|
+/* is seq1 > seq2 ? */
|
|
+#define after64(seq1, seq2) before64(seq2, seq1)
|
|
+
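before64()/after64() use the usual wrap-safe comparison trick: subtract in unsigned arithmetic and look at the sign of the result, so the ordering stays correct even when the 64-bit data-sequence space wraps. A stand-alone user-space illustration (not part of the patch):

/* Illustration only, not part of the patch: the signed-difference test in
 * before64() stays correct across a 64-bit sequence-number wrap.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool before64(uint64_t seq1, uint64_t seq2)
{
	return (int64_t)(seq1 - seq2) < 0;	/* same test as the kernel helper */
}

int main(void)
{
	uint64_t a = 0xfffffffffffffff0ULL;	/* just below the wrap point */
	uint64_t b = 0x10ULL;			/* just after the wrap point */

	/* Prints "1": a is still "before" b although a > b numerically. */
	printf("%d\n", before64(a, b));
	return 0;
}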
|
|
+static inline void mptcp_become_fully_estab(struct sock *sk)
|
|
+{
|
|
+ tcp_sk(sk)->mptcp->fully_established = 1;
|
|
+
|
|
+ if (is_master_tp(tcp_sk(sk)) &&
|
|
+ tcp_sk(sk)->mpcb->pm_ops->fully_established)
|
|
+ tcp_sk(sk)->mpcb->pm_ops->fully_established(mptcp_meta_sk(sk));
|
|
+}
|
|
+
|
|
+/* Similar to tcp_tso_acked without any memory accounting */
|
|
+static inline int mptcp_tso_acked_reinject(const struct sock *meta_sk,
|
|
+ struct sk_buff *skb)
|
|
+{
|
|
+ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ u32 packets_acked, len, delta_truesize;
|
|
+
|
|
+ BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una));
|
|
+
|
|
+ packets_acked = tcp_skb_pcount(skb);
|
|
+
|
|
+ if (skb_unclone(skb, GFP_ATOMIC))
|
|
+ return 0;
|
|
+
|
|
+ len = meta_tp->snd_una - TCP_SKB_CB(skb)->seq;
|
|
+ delta_truesize = __pskb_trim_head(skb, len);
|
|
+
|
|
+ TCP_SKB_CB(skb)->seq += len;
|
|
+ skb->ip_summed = CHECKSUM_PARTIAL;
|
|
+
|
|
+ if (delta_truesize)
|
|
+ skb->truesize -= delta_truesize;
|
|
+
|
|
+ /* Any change of skb->len requires recalculation of tso factor. */
|
|
+ if (tcp_skb_pcount(skb) > 1)
|
|
+ tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
|
|
+ packets_acked -= tcp_skb_pcount(skb);
|
|
+
|
|
+ if (packets_acked) {
|
|
+ BUG_ON(tcp_skb_pcount(skb) == 0);
|
|
+ BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
|
|
+ }
|
|
+
|
|
+ return packets_acked;
|
|
+}
|
|
+
|
|
+/* Cleans the meta-socket retransmission queue and the reinject-queue. */
|
|
+static void mptcp_clean_rtx_queue(struct sock *meta_sk, u32 prior_snd_una)
|
|
+{
|
|
+ struct sk_buff *skb, *tmp, *next;
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ struct mptcp_cb *mpcb = meta_tp->mpcb;
|
|
+ bool fully_acked = true;
|
|
+ bool acked = false;
|
|
+ u32 acked_pcount;
|
|
+
|
|
+ for (skb = skb_rb_first(&meta_sk->tcp_rtx_queue); skb; skb = next) {
|
|
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
|
|
+
|
|
+ tcp_ack_tstamp(meta_sk, skb, prior_snd_una);
|
|
+
|
|
+ if (after(scb->end_seq, meta_tp->snd_una)) {
|
|
+ if (tcp_skb_pcount(skb) == 1 ||
|
|
+ !after(meta_tp->snd_una, scb->seq))
|
|
+ break;
|
|
+
|
|
+ acked_pcount = tcp_tso_acked(meta_sk, skb);
|
|
+ if (!acked_pcount)
|
|
+ break;
|
|
+ fully_acked = false;
|
|
+ } else {
|
|
+ acked_pcount = tcp_skb_pcount(skb);
|
|
+ }
|
|
+
|
|
+ acked = true;
|
|
+ meta_tp->packets_out -= acked_pcount;
|
|
+ meta_tp->retrans_stamp = 0;
|
|
+
|
|
+ if (!fully_acked)
|
|
+ break;
|
|
+
|
|
+ next = skb_rb_next(skb);
|
|
+
|
|
+ if (mptcp_is_data_fin(skb)) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct hlist_node *tmp;
|
|
+
|
|
+ /* DATA_FIN has been acknowledged - now we can close
|
|
+ * the subflows
|
|
+ */
|
|
+ mptcp_for_each_sub_safe(mpcb, mptcp, tmp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+ unsigned long delay = 0;
|
|
+
|
|
+ /* If we are the passive closer, don't trigger
|
|
+ * subflow-fin until the subflow has been finned
|
|
+ * by the peer - thus we add a delay.
|
|
+ */
|
|
+ if (mpcb->passive_close &&
|
|
+ sk_it->sk_state == TCP_ESTABLISHED)
|
|
+ delay = inet_csk(sk_it)->icsk_rto << 3;
|
|
+
|
|
+ mptcp_sub_close(sk_it, delay);
|
|
+ }
|
|
+ }
|
|
+ tcp_rtx_queue_unlink_and_free(skb, meta_sk);
|
|
+ }
|
|
+ /* Remove acknowledged data from the reinject queue */
|
|
+ skb_queue_walk_safe(&mpcb->reinject_queue, skb, tmp) {
|
|
+ if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) {
|
|
+ if (tcp_skb_pcount(skb) == 1 ||
|
|
+ !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq))
|
|
+ break;
|
|
+
|
|
+ mptcp_tso_acked_reinject(meta_sk, skb);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ __skb_unlink(skb, &mpcb->reinject_queue);
|
|
+ __kfree_skb(skb);
|
|
+ }
|
|
+
|
|
+ if (likely(between(meta_tp->snd_up, prior_snd_una, meta_tp->snd_una)))
|
|
+ meta_tp->snd_up = meta_tp->snd_una;
|
|
+
|
|
+ if (acked) {
|
|
+ tcp_rearm_rto(meta_sk);
|
|
+ /* Normally this is done in tcp_try_undo_loss - but MPTCP
|
|
+ * does not call this function.
|
|
+ */
|
|
+ inet_csk(meta_sk)->icsk_retransmits = 0;
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Inspired by tcp_rcv_state_process */
|
|
+/* Returns 0 if processing the packet can continue
|
|
+ * -1 if connection was closed with an active reset
|
|
+ * 1 if connection was closed and processing should stop.
|
|
+ */
|
|
+static int mptcp_rcv_state_process(struct sock *meta_sk, struct sock *sk,
|
|
+ const struct sk_buff *skb, u32 data_seq,
|
|
+ u16 data_len)
|
|
+{
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
|
|
+ const struct tcphdr *th = tcp_hdr(skb);
|
|
+
|
|
+ /* State-machine handling if FIN has been enqueued and it has
|
|
+ * been acked (snd_una == write_seq) - it's important that this
|
|
+ * is done after sk_wmem_free_skb because otherwise
|
|
+ * sk_forward_alloc is wrong upon inet_csk_destroy_sock()
|
|
+ */
|
|
+ switch (meta_sk->sk_state) {
|
|
+ case TCP_FIN_WAIT1: {
|
|
+ struct dst_entry *dst;
|
|
+ int tmo;
|
|
+
|
|
+ if (meta_tp->snd_una != meta_tp->write_seq)
|
|
+ break;
|
|
+
|
|
+ tcp_set_state(meta_sk, TCP_FIN_WAIT2);
|
|
+ meta_sk->sk_shutdown |= SEND_SHUTDOWN;
|
|
+
|
|
+ dst = __sk_dst_get(sk);
|
|
+ if (dst)
|
|
+ dst_confirm(dst);
|
|
+
|
|
+ if (!sock_flag(meta_sk, SOCK_DEAD)) {
|
|
+ /* Wake up lingering close() */
|
|
+ meta_sk->sk_state_change(meta_sk);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (meta_tp->linger2 < 0 ||
|
|
+ (data_len &&
|
|
+ after(data_seq + data_len - (mptcp_is_data_fin2(skb, tp) ? 1 : 0),
|
|
+ meta_tp->rcv_nxt))) {
|
|
+ mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
|
|
+ tcp_done(meta_sk);
|
|
+ NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ tmo = tcp_fin_time(meta_sk);
|
|
+ if (tmo > TCP_TIMEWAIT_LEN) {
|
|
+ inet_csk_reset_keepalive_timer(meta_sk, tmo - TCP_TIMEWAIT_LEN);
|
|
+ } else if (mptcp_is_data_fin2(skb, tp) || sock_owned_by_user(meta_sk)) {
|
|
+ /* Bad case. We could lose such FIN otherwise.
|
|
+ * It is not a big problem, but it looks confusing
|
|
+ * and is not so rare an event. We can still lose it now,
|
|
+ * if it spins in bh_lock_sock(), but it is really
|
|
+ * marginal case.
|
|
+ */
|
|
+ inet_csk_reset_keepalive_timer(meta_sk, tmo);
|
|
+ } else {
|
|
+ meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2, tmo);
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ case TCP_CLOSING:
|
|
+ case TCP_LAST_ACK:
|
|
+ if (meta_tp->snd_una == meta_tp->write_seq) {
|
|
+ tcp_done(meta_sk);
|
|
+ return 1;
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ /* step 7: process the segment text */
|
|
+ switch (meta_sk->sk_state) {
|
|
+ case TCP_FIN_WAIT1:
|
|
+ case TCP_FIN_WAIT2:
|
|
+ /* RFC 793 says to queue data in these states,
|
|
+ * RFC 1122 says we MUST send a reset.
|
|
+ * BSD 4.4 also does reset.
|
|
+ */
|
|
+ if (meta_sk->sk_shutdown & RCV_SHUTDOWN) {
|
|
+ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
|
|
+ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
|
|
+ !mptcp_is_data_fin2(skb, tp)) {
|
|
+ NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
|
|
+ mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
|
|
+ tcp_reset(meta_sk);
|
|
+ return -1;
|
|
+ }
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * @return:
|
|
+ * i) 1: Everything's fine.
|
|
+ * ii) -1: A reset has been sent on the subflow - csum-failure
|
|
+ * iii) 0: csum-failure but no reset sent, because it's the last subflow.
|
|
+ * Last packet should not be destroyed by the caller because it has
|
|
+ * been done here.
|
|
+ */
|
|
+static int mptcp_verif_dss_csum(struct sock *sk)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct sk_buff *tmp, *tmp1, *last = NULL;
|
|
+ __wsum csum_tcp = 0; /* cumulative checksum of pld + mptcp-header */
|
|
+ int ans = 1, overflowed = 0, offset = 0, dss_csum_added = 0;
|
|
+ int iter = 0;
|
|
+ u32 next_seq, offset_seq;
|
|
+
|
|
+ skb_queue_walk_safe(&sk->sk_receive_queue, tmp, tmp1) {
|
|
+ unsigned int csum_len;
|
|
+
|
|
+ /* init next seq in first round */
|
|
+ if (!iter)
|
|
+ next_seq = TCP_SKB_CB(tmp)->seq;
|
|
+ offset_seq = next_seq - TCP_SKB_CB(tmp)->seq;
|
|
+
|
|
+ if (before(tp->mptcp->map_subseq + tp->mptcp->map_data_len, TCP_SKB_CB(tmp)->end_seq))
|
|
+ /* Mapping ends in the middle of the packet -
|
|
+ * csum only these bytes
|
|
+ */
|
|
+ csum_len = tp->mptcp->map_subseq + tp->mptcp->map_data_len - TCP_SKB_CB(tmp)->seq;
|
|
+ else
|
|
+ csum_len = tmp->len;
|
|
+
|
|
+ csum_len -= offset_seq;
|
|
+ offset = 0;
|
|
+ if (overflowed) {
|
|
+ char first_word[4];
|
|
+ first_word[0] = 0;
|
|
+ first_word[1] = 0;
|
|
+ first_word[2] = 0;
|
|
+ first_word[3] = *(tmp->data + offset_seq);
|
|
+ csum_tcp = csum_partial(first_word, 4, csum_tcp);
|
|
+ offset = 1;
|
|
+ csum_len--;
|
|
+ overflowed = 0;
|
|
+ }
|
|
+
|
|
+ csum_tcp = skb_checksum(tmp, offset + offset_seq, csum_len,
|
|
+ csum_tcp);
|
|
+
|
|
+ /* Was the length odd? Then we have to merge the next byte
|
|
+ * correctly (see above)
|
|
+ */
|
|
+ if (csum_len != (csum_len & (~1)))
|
|
+ overflowed = 1;
|
|
+
|
|
+ if (mptcp_is_data_seq(tmp) && !dss_csum_added) {
|
|
+ __be32 data_seq = htonl((u32)(tp->mptcp->map_data_seq >> 32));
|
|
+
|
|
+ /* If a 64-bit dss is present, we increase the offset
|
|
+ * by 4 bytes, as the high-order 64-bits will be added
|
|
+ * in the final csum_partial-call.
|
|
+ */
|
|
+ u32 offset = skb_transport_offset(tmp) +
|
|
+ TCP_SKB_CB(tmp)->dss_off;
|
|
+ if (TCP_SKB_CB(tmp)->mptcp_flags & MPTCPHDR_SEQ64_SET)
|
|
+ offset += 4;
|
|
+
|
|
+ csum_tcp = skb_checksum(tmp, offset,
|
|
+ MPTCP_SUB_LEN_SEQ_CSUM,
|
|
+ csum_tcp);
|
|
+
|
|
+ csum_tcp = csum_partial(&data_seq,
|
|
+ sizeof(data_seq), csum_tcp);
|
|
+
|
|
+ dss_csum_added = 1; /* Just do it once */
|
|
+ } else if (mptcp_is_data_mpcapable(tmp) && !dss_csum_added) {
|
|
+ u32 offset = skb_transport_offset(tmp) + TCP_SKB_CB(tmp)->dss_off;
|
|
+ __be64 data_seq = htonll(tp->mptcp->map_data_seq);
|
|
+ __be32 rel_seq = htonl(tp->mptcp->map_subseq - tp->mptcp->rcv_isn);
|
|
+
|
|
+ csum_tcp = csum_partial(&data_seq, sizeof(data_seq), csum_tcp);
|
|
+ csum_tcp = csum_partial(&rel_seq, sizeof(rel_seq), csum_tcp);
|
|
+
|
|
+ csum_tcp = skb_checksum(tmp, offset, 4, csum_tcp);
|
|
+
|
|
+ dss_csum_added = 1;
|
|
+ }
|
|
+ last = tmp;
|
|
+ iter++;
|
|
+
|
|
+ if (!skb_queue_is_last(&sk->sk_receive_queue, tmp) &&
|
|
+ !before(TCP_SKB_CB(tmp1)->seq,
|
|
+ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
|
|
+ break;
|
|
+ next_seq = TCP_SKB_CB(tmp)->end_seq;
|
|
+ }
|
|
+
|
|
+ /* Now, checksum must be 0 */
|
|
+ if (unlikely(csum_fold(csum_tcp))) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct sock *sk_it = NULL;
|
|
+
|
|
+ pr_debug("%s csum is wrong: %#x tcp-seq %u dss_csum_added %d overflowed %d iterations %d\n",
|
|
+ __func__, csum_fold(csum_tcp), TCP_SKB_CB(last)->seq,
|
|
+ dss_csum_added, overflowed, iter);
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CSUMFAIL);
|
|
+ tp->mptcp->send_mp_fail = 1;
|
|
+
|
|
+ /* map_data_seq is the data-seq number of the
|
|
+ * mapping we are currently checking
|
|
+ */
|
|
+ tp->mpcb->csum_cutoff_seq = tp->mptcp->map_data_seq;
|
|
+
|
|
+ /* Search for another subflow that is fully established */
|
|
+ mptcp_for_each_sub(tp->mpcb, mptcp) {
|
|
+ sk_it = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (sk_it != sk &&
|
|
+ tcp_sk(sk_it)->mptcp->fully_established)
|
|
+ break;
|
|
+
|
|
+ sk_it = NULL;
|
|
+ }
|
|
+
|
|
+ if (sk_it) {
|
|
+ mptcp_send_reset(sk);
|
|
+ ans = -1;
|
|
+ } else {
|
|
+ tp->mpcb->send_infinite_mapping = 1;
|
|
+
|
|
+ /* Need to purge the rcv-queue as it's no longer valid */
|
|
+ while ((tmp = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
|
|
+ tp->copied_seq = TCP_SKB_CB(tmp)->end_seq;
|
|
+ kfree_skb(tmp);
|
|
+ }
|
|
+
|
|
+ mptcp_fallback_close(tp->mpcb, sk);
|
|
+
|
|
+ ans = 0;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return ans;
|
|
+}
|
|
+
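The DSS checksum checked above is the standard 16-bit one's-complement sum, accumulated over the payload plus the DSS pseudo-header, so a correct mapping folds to zero at the receiver. A minimal user-space illustration of that folding property; it leaves out the DSS pseudo-header and the byte-order handling that skb_checksum()/csum_partial() take care of in the kernel:

/* Illustration only, not part of the patch: a correct one's-complement
 * checksum folds to zero when the receiver re-adds the checksum value the
 * sender appended.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t csum_add(uint32_t sum, const uint8_t *p, size_t len)
{
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)p[i] << 8 | p[i + 1];
	if (len & 1)
		sum += (uint32_t)p[len - 1] << 8;	/* pad the odd byte */
	return sum;
}

static uint16_t fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint8_t payload[] = { 0xde, 0xad, 0xbe, 0xef, 0x01 };
	uint32_t sum = csum_add(0, payload, sizeof(payload));
	uint16_t check = fold(sum);			/* sender's checksum value */
	uint8_t trailer[2] = { check >> 8, check & 0xff };

	sum = csum_add(sum, trailer, sizeof(trailer));	/* receiver adds it back   */
	printf("%u\n", fold(sum));			/* prints 0                */
	return 0;
}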
|
|
+static inline void mptcp_prepare_skb(struct sk_buff *skb,
|
|
+ const struct sock *sk)
|
|
+{
|
|
+ const struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
|
|
+ u32 inc = 0, end_seq = tcb->end_seq;
|
|
+
|
|
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
|
|
+ end_seq--;
|
|
+ /* If skb is the end of this mapping (end is always at mapping-boundary
|
|
+ * thanks to the splitting/trimming), then we need to increase
|
|
+ * data-end-seq by 1 if this here is a data-fin.
|
|
+ *
|
|
+ * We need to do -1 because end_seq includes the subflow-FIN.
|
|
+ */
|
|
+ if (tp->mptcp->map_data_fin &&
|
|
+ end_seq == tp->mptcp->map_subseq + tp->mptcp->map_data_len) {
|
|
+ inc = 1;
|
|
+
|
|
+ /* We manually set the fin-flag if it is a data-fin. For easy
|
|
+ * processing in tcp_recvmsg.
|
|
+ */
|
|
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
|
|
+ } else {
|
|
+ /* We may have a subflow-fin with data but without data-fin */
|
|
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_FIN;
|
|
+ }
|
|
+
|
|
+ /* Adapt data-seqs to the packet itself. We effectively transform the
|
|
+ * dss-mapping to a per-packet granularity. This is necessary to
|
|
+ * correctly handle overlapping mappings coming from different
|
|
+ * subflows. Otherwise it would be a complete mess.
|
|
+ */
|
|
+ tcb->seq = ((u32)tp->mptcp->map_data_seq) + tcb->seq - tp->mptcp->map_subseq;
|
|
+ tcb->end_seq = tcb->seq + skb->len + inc;
|
|
+}
|
|
+
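The remapping at the end of mptcp_prepare_skb() is plain offset arithmetic: the skb's offset within the subflow-level mapping is carried over into the data-level (meta) sequence space. A small worked example with invented numbers:

/* Illustration only, not part of the patch: mapping a subflow-level skb into
 * the data-level sequence space, as done by mptcp_prepare_skb() above.
 * All numbers are invented.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t map_data_seq = 1000;	/* data-level start of the DSS mapping */
	uint32_t map_subseq = 5000;	/* subflow-level start of the mapping  */
	uint32_t skb_seq = 5300;	/* subflow seq of this particular skb  */
	uint32_t skb_len = 200;

	uint32_t data_seq = (uint32_t)map_data_seq + skb_seq - map_subseq;

	/* Prints "1300 1500": the skb carries meta bytes 1300..1499. */
	printf("%u %u\n", data_seq, data_seq + skb_len);
	return 0;
}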
|
|
+static inline void mptcp_reset_mapping(struct tcp_sock *tp, u32 old_copied_seq)
|
|
+{
|
|
+ tp->mptcp->map_data_len = 0;
|
|
+ tp->mptcp->map_data_seq = 0;
|
|
+ tp->mptcp->map_subseq = 0;
|
|
+ tp->mptcp->map_data_fin = 0;
|
|
+ tp->mptcp->mapping_present = 0;
|
|
+
|
|
+ /* In infinite mapping receiver mode, we have to advance the implied
|
|
+ * data-sequence number when we progress the subflow's data.
|
|
+ */
|
|
+ if (tp->mpcb->infinite_mapping_rcv)
|
|
+ tp->mpcb->infinite_rcv_seq += (tp->copied_seq - old_copied_seq);
|
|
+}
|
|
+
|
|
+/* The DSS-mapping received on the sk only covers the second half of the skb
|
|
+ * (cut at seq). We trim the head from the skb.
|
|
+ * Data will be freed upon kfree().
|
|
+ *
|
|
+ * Inspired by tcp_trim_head().
|
|
+ */
|
|
+static void mptcp_skb_trim_head(struct sk_buff *skb, struct sock *sk, u32 seq)
|
|
+{
|
|
+ int len = seq - TCP_SKB_CB(skb)->seq;
|
|
+ u32 new_seq = TCP_SKB_CB(skb)->seq + len;
|
|
+ u32 delta_truesize;
|
|
+
|
|
+ delta_truesize = __pskb_trim_head(skb, len);
|
|
+
|
|
+ TCP_SKB_CB(skb)->seq = new_seq;
|
|
+
|
|
+ if (delta_truesize) {
|
|
+ skb->truesize -= delta_truesize;
|
|
+ atomic_sub(delta_truesize, &sk->sk_rmem_alloc);
|
|
+ sk_mem_uncharge(sk, delta_truesize);
|
|
+ }
|
|
+}
|
|
+
|
|
+/* The DSS-mapping received on the sk only covers the first half of the skb
|
|
+ * (cut at seq). We create a second skb (@return), and queue it in the rcv-queue
|
|
+ * as further packets may resolve the mapping of the second half of data.
|
|
+ *
|
|
+ * Inspired by tcp_fragment().
|
|
+ */
|
|
+static int mptcp_skb_split_tail(struct sk_buff *skb, struct sock *sk, u32 seq)
|
|
+{
|
|
+ struct sk_buff *buff;
|
|
+ int nsize;
|
|
+ int nlen, len;
|
|
+ u8 flags;
|
|
+
|
|
+ len = seq - TCP_SKB_CB(skb)->seq;
|
|
+ nsize = skb_headlen(skb) - len + tcp_sk(sk)->tcp_header_len;
|
|
+ if (nsize < 0)
|
|
+ nsize = 0;
|
|
+
|
|
+ /* Get a new skb... force flag on. */
|
|
+ buff = alloc_skb(nsize, GFP_ATOMIC);
|
|
+ if (buff == NULL)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ skb_reserve(buff, tcp_sk(sk)->tcp_header_len);
|
|
+ skb_reset_transport_header(buff);
|
|
+
|
|
+ flags = TCP_SKB_CB(skb)->tcp_flags;
|
|
+ TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN);
|
|
+ TCP_SKB_CB(buff)->tcp_flags = flags;
|
|
+
|
|
+ /* We absolutely need to call skb_set_owner_r before refreshing the
|
|
+ * truesize of buff, otherwise the moved data will be accounted twice.
|
|
+ */
|
|
+ skb_set_owner_r(buff, sk);
|
|
+ nlen = skb->len - len - nsize;
|
|
+ buff->truesize += nlen;
|
|
+ skb->truesize -= nlen;
|
|
+
|
|
+ /* Correct the sequence numbers. */
|
|
+ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
|
|
+ TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
|
|
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
|
|
+
|
|
+ skb_split(skb, buff, len);
|
|
+
|
|
+ __skb_queue_after(&sk->sk_receive_queue, skb, buff);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
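The two helpers above are used by the mapping-validation code elsewhere in this patch when an incoming skb does not line up with the current DSS mapping: bytes before the mapping start are trimmed off the head, and bytes beyond the mapping end are split into a follow-up skb. A simplified sketch of that decision, ignoring the wrap-safe before()/after() comparisons the kernel actually uses:

/* Illustration only, not part of the patch: deciding between trimming the
 * head and splitting the tail of an skb that overlaps the current DSS
 * mapping.  Wrap-around handling (before()/after()) is omitted here.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t map_subseq = 1000, map_data_len = 500;	/* mapping: 1000..1499 */
	uint32_t skb_seq = 900, skb_end_seq = 1600;	/* skb:     900..1599  */

	if (skb_seq < map_subseq)			/* head not covered    */
		printf("trim %u bytes off the head\n", map_subseq - skb_seq);
	if (skb_end_seq > map_subseq + map_data_len)	/* tail not covered    */
		printf("split the tail at subflow seq %u\n",
		       map_subseq + map_data_len);
	return 0;
}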
|
|
+/* @return: 0 everything is fine. Just continue processing
|
|
+ * 1 subflow is broken - stop everything
|
|
+ * -1 this packet was broken - continue with the next one.
|
|
+ */
|
|
+static int mptcp_prevalidate_skb(struct sock *sk, struct sk_buff *skb)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct mptcp_cb *mpcb = tp->mpcb;
|
|
+
|
|
+ /* If we are in infinite mode, the subflow-fin is in fact a data-fin. */
|
|
+ if (!skb->len && (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
|
|
+ !mptcp_is_data_fin(skb) && !mpcb->infinite_mapping_rcv) {
|
|
+ /* Remove a pure subflow-fin from the queue and increase
|
|
+ * copied_seq.
|
|
+ */
|
|
+ tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
|
|
+ __skb_unlink(skb, &sk->sk_receive_queue);
|
|
+ __kfree_skb(skb);
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ /* If we are not yet fully established and do not know the mapping for
|
|
+	 * this segment, this path has to fall back to infinite or be torn down.
|
|
+ */
|
|
+ if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) &&
|
|
+ !mptcp_is_data_mpcapable(skb) &&
|
|
+ !tp->mptcp->mapping_present && !mpcb->infinite_mapping_rcv) {
|
|
+ pr_debug("%s %#x will fallback - pi %d from %pS, seq %u mptcp-flags %#x\n",
|
|
+ __func__, mpcb->mptcp_loc_token,
|
|
+ tp->mptcp->path_index, __builtin_return_address(0),
|
|
+ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->mptcp_flags);
|
|
+
|
|
+ if (!is_master_tp(tp)) {
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBDATASUB);
|
|
+ mptcp_send_reset(sk);
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBDATAINIT);
|
|
+
|
|
+ mpcb->infinite_mapping_snd = 1;
|
|
+ mpcb->infinite_mapping_rcv = 1;
|
|
+ mpcb->infinite_rcv_seq = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp));
|
|
+
|
|
+ mptcp_fallback_close(mpcb, sk);
|
|
+
|
|
+		/* We do a seamless fallback and should not send an infinite mapping. */
|
|
+ mpcb->send_infinite_mapping = 0;
|
|
+ tp->mptcp->fully_established = 1;
|
|
+ }
|
|
+
|
|
+ /* Receiver-side becomes fully established when a whole rcv-window has
|
|
+	 * been received without the need to fall back due to the previous
|
|
+ * condition.
|
|
+ */
|
|
+ if (!tp->mptcp->fully_established) {
|
|
+ tp->mptcp->init_rcv_wnd -= skb->len;
|
|
+ if (tp->mptcp->init_rcv_wnd < 0)
|
|
+ mptcp_become_fully_estab(sk);
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void mptcp_restart_sending(struct sock *meta_sk)
|
|
+{
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ struct mptcp_cb *mpcb = meta_tp->mpcb;
|
|
+ struct sk_buff *wq_head, *skb, *tmp;
|
|
+
|
|
+ skb = tcp_rtx_queue_head(meta_sk);
|
|
+
|
|
+ /* We resend everything that has not been acknowledged, thus we need
|
|
+ * to move it from the rtx-tree to the write-queue.
|
|
+ */
|
|
+ wq_head = tcp_write_queue_head(meta_sk);
|
|
+
|
|
+ skb_rbtree_walk_from_safe(skb, tmp) {
|
|
+ list_del(&skb->tcp_tsorted_anchor);
|
|
+ tcp_rtx_queue_unlink(skb, meta_sk);
|
|
+ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
|
|
+
|
|
+ if (wq_head)
|
|
+ __skb_queue_before(&meta_sk->sk_write_queue, wq_head, skb);
|
|
+ else
|
|
+ tcp_add_write_queue_tail(meta_sk, skb);
|
|
+ }
|
|
+
|
|
+ /* We artificially restart the whole send-queue. Thus,
|
|
+ * it is as if no packets are in flight
|
|
+ */
|
|
+ meta_tp->packets_out = 0;
|
|
+
|
|
+ /* If the snd_nxt already wrapped around, we have to
|
|
+ * undo the wrapping, as we are restarting from snd_una
|
|
+ * on.
|
|
+ */
|
|
+ if (meta_tp->snd_nxt < meta_tp->snd_una) {
|
|
+ mpcb->snd_high_order[mpcb->snd_hiseq_index] -= 2;
|
|
+ mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
|
|
+ }
|
|
+ meta_tp->snd_nxt = meta_tp->snd_una;
|
|
+
|
|
+ /* Trigger a sending on the meta. */
|
|
+ mptcp_push_pending_frames(meta_sk);
|
|
+}
|
|
+
|
|
+/* @return: 0 everything is fine. Just continue processing
|
|
+ * 1 subflow is broken stop everything
|
|
+ * -1 this packet was broken - continue with the next one.
|
|
+ */
|
|
+static int mptcp_detect_mapping(struct sock *sk, struct sk_buff *skb)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
|
|
+ struct mptcp_cb *mpcb = tp->mpcb;
|
|
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
|
|
+ u32 *ptr;
|
|
+ u32 data_seq, sub_seq, data_len, tcp_end_seq;
|
|
+ bool set_infinite_rcv = false;
|
|
+
|
|
+ /* If we are in infinite-mapping-mode, the subflow is guaranteed to be
|
|
+ * in-order at the data-level. Thus data-seq-numbers can be inferred
|
|
+ * from what is expected at the data-level.
|
|
+ */
|
|
+ if (mpcb->infinite_mapping_rcv) {
|
|
+ /* copied_seq may be bigger than tcb->seq (e.g., when the peer
|
|
+ * retransmits data that actually has already been acknowledged with
|
|
+		 * newer data, if it did not receive our acks). Thus, we need
|
|
+ * to account for this overlap as well.
|
|
+ */
|
|
+ tp->mptcp->map_data_seq = mpcb->infinite_rcv_seq - (tp->copied_seq - tcb->seq);
|
|
+ tp->mptcp->map_subseq = tcb->seq;
|
|
+ tp->mptcp->map_data_len = skb->len;
|
|
+ tp->mptcp->map_data_fin = !!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN);
|
|
+ tp->mptcp->mapping_present = 1;
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ if (!tp->mptcp->mapping_present && mptcp_is_data_mpcapable(skb)) {
|
|
+ __u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off);
|
|
+
|
|
+ sub_seq = 1 + tp->mptcp->rcv_isn;
|
|
+ data_seq = meta_tp->rcv_nxt;
|
|
+ data_len = get_unaligned_be16(ptr);
|
|
+ } else if (!mptcp_is_data_seq(skb)) {
|
|
+ /* No mapping here?
|
|
+ * Exit - it is either already set or still on its way
|
|
+ */
|
|
+ if (!tp->mptcp->mapping_present &&
|
|
+ tp->rcv_nxt - tp->copied_seq > 65536) {
|
|
+ /* Too many packets without a mapping,
|
|
+ * this subflow is broken
|
|
+ */
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
|
|
+ mptcp_send_reset(sk);
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+ } else {
|
|
+ /* Well, then the DSS-mapping is there. So, read it! */
|
|
+ ptr = mptcp_skb_set_data_seq(skb, &data_seq, mpcb);
|
|
+ ptr++;
|
|
+ sub_seq = get_unaligned_be32(ptr) + tp->mptcp->rcv_isn;
|
|
+ ptr++;
|
|
+ data_len = get_unaligned_be16(ptr);
|
|
+ }
|
|
+
|
|
+ /* If it's an empty skb with DATA_FIN, sub_seq must get fixed.
|
|
+ * The draft sets it to 0, but we really would like to have the
|
|
+	 * real value, to simplify the handling later in this
|
|
+ * function.
|
|
+ */
|
|
+ if (mptcp_is_data_fin(skb) && skb->len == 0)
|
|
+ sub_seq = TCP_SKB_CB(skb)->seq;
|
|
+
|
|
+ /* If there is already a mapping - we check if it maps with the current
|
|
+ * one. If not - we reset.
|
|
+ */
|
|
+ if (tp->mptcp->mapping_present &&
|
|
+ (data_seq != (u32)tp->mptcp->map_data_seq ||
|
|
+ sub_seq != tp->mptcp->map_subseq ||
|
|
+ data_len != tp->mptcp->map_data_len + tp->mptcp->map_data_fin ||
|
|
+ mptcp_is_data_fin(skb) != tp->mptcp->map_data_fin)) {
|
|
+ /* Mapping in packet is different from what we want */
|
|
+ pr_debug("%s Mappings do not match!\n", __func__);
|
|
+ pr_debug("%s dseq %u mdseq %u, sseq %u msseq %u dlen %u mdlen %u dfin %d mdfin %d\n",
|
|
+ __func__, data_seq, (u32)tp->mptcp->map_data_seq,
|
|
+ sub_seq, tp->mptcp->map_subseq, data_len,
|
|
+ tp->mptcp->map_data_len, mptcp_is_data_fin(skb),
|
|
+ tp->mptcp->map_data_fin);
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DSSNOMATCH);
|
|
+ mptcp_send_reset(sk);
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
+ /* If the previous check was good, the current mapping is valid and we exit. */
|
|
+ if (tp->mptcp->mapping_present)
|
|
+ return 0;
|
|
+
|
|
+ /* Mapping not yet set on this subflow - we set it here! */
|
|
+
|
|
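+	/* A DSS option carrying a data_len of 0 announces the fallback to an
+	 * infinite mapping: from now on this subflow carries the remaining
+	 * data stream in-order, without per-segment DSS mappings.
+	 */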
+ if (!data_len) {
|
|
+ mpcb->infinite_mapping_rcv = 1;
|
|
+ mpcb->send_infinite_mapping = 1;
|
|
+ tp->mptcp->fully_established = 1;
|
|
+		/* We need to repeat mp_fail's until the sender fell
|
|
+ * back to infinite-mapping - here we stop repeating it.
|
|
+ */
|
|
+ tp->mptcp->send_mp_fail = 0;
|
|
+
|
|
+ /* We have to fixup data_len - it must be the same as skb->len */
|
|
+ data_len = skb->len + (mptcp_is_data_fin(skb) ? 1 : 0);
|
|
+ sub_seq = tcb->seq;
|
|
+
|
|
+ mptcp_restart_sending(tp->meta_sk);
|
|
+
|
|
+ mptcp_fallback_close(mpcb, sk);
|
|
+
|
|
+ /* data_seq and so on are set correctly */
|
|
+
|
|
+ /* At this point, the meta-ofo-queue has to be emptied,
|
|
+ * as the following data is guaranteed to be in-order at
|
|
+ * the data and subflow-level
|
|
+ */
|
|
+ skb_rbtree_purge(&meta_tp->out_of_order_queue);
|
|
+
|
|
+ set_infinite_rcv = true;
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_INFINITEMAPRX);
|
|
+ }
|
|
+
|
|
+ /* We are sending mp-fail's and thus are in fallback mode.
|
|
+ * Ignore packets which do not announce the fallback and still
|
|
+ * want to provide a mapping.
|
|
+ */
|
|
+ if (tp->mptcp->send_mp_fail) {
|
|
+ tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
|
|
+ __skb_unlink(skb, &sk->sk_receive_queue);
|
|
+ __kfree_skb(skb);
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ /* FIN increased the mapping-length by 1 */
|
|
+ if (mptcp_is_data_fin(skb))
|
|
+ data_len--;
|
|
+
|
|
+	/* The subflow-sequences of the packet must
|
|
+ * (at least partially) be part of the DSS-mapping's
|
|
+ * subflow-sequence-space.
|
|
+ *
|
|
+ * Basically the mapping is not valid, if either of the
|
|
+ * following conditions is true:
|
|
+ *
|
|
+ * 1. It's not a data_fin and
|
|
+ * MPTCP-sub_seq >= TCP-end_seq
|
|
+ *
|
|
+ * 2. It's a data_fin and TCP-end_seq > TCP-seq and
|
|
+ * MPTCP-sub_seq >= TCP-end_seq
|
|
+ *
|
|
+ * The previous two can be merged into:
|
|
+ * TCP-end_seq > TCP-seq and MPTCP-sub_seq >= TCP-end_seq
|
|
+ * Because if it's not a data-fin, TCP-end_seq > TCP-seq
|
|
+ *
|
|
+ * 3. It's a data_fin and skb->len == 0 and
|
|
+ * MPTCP-sub_seq > TCP-end_seq
|
|
+ *
|
|
+ * 4. It's not a data_fin and TCP-end_seq > TCP-seq and
|
|
+ * MPTCP-sub_seq + MPTCP-data_len <= TCP-seq
|
|
+ */
|
|
+
|
|
+ /* subflow-fin is not part of the mapping - ignore it here ! */
|
|
+ tcp_end_seq = tcb->end_seq;
|
|
+ if (tcb->tcp_flags & TCPHDR_FIN)
|
|
+ tcp_end_seq--;
|
|
+ if ((!before(sub_seq, tcb->end_seq) && after(tcp_end_seq, tcb->seq)) ||
|
|
+ (mptcp_is_data_fin(skb) && skb->len == 0 && after(sub_seq, tcb->end_seq)) ||
|
|
+ (!after(sub_seq + data_len, tcb->seq) && after(tcp_end_seq, tcb->seq))) {
|
|
+		/* The subflow-sequences of the packet differ from what is in the
|
|
+ * packet's dss-mapping. The peer is misbehaving - reset
|
|
+ */
|
|
+ pr_debug("%s Packet's mapping does not map to the DSS sub_seq %u end_seq %u, tcp_end_seq %u seq %u dfin %u len %u data_len %u copied_seq %u\n",
|
|
+ __func__, sub_seq, tcb->end_seq, tcp_end_seq,
|
|
+ tcb->seq, mptcp_is_data_fin(skb),
|
|
+ skb->len, data_len, tp->copied_seq);
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DSSTCPMISMATCH);
|
|
+ mptcp_send_reset(sk);
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
+	/* Did the DSS carry 64-bit sequence numbers? */
|
|
+ if (!(tcb->mptcp_flags & MPTCPHDR_SEQ64_SET)) {
|
|
+ /* Wrapped around? */
|
|
+ if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) {
|
|
+ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq);
|
|
+ } else {
|
|
+ /* Else, access the default high-order bits */
|
|
+ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, data_seq);
|
|
+ }
|
|
+ } else {
|
|
+ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, (tcb->mptcp_flags & MPTCPHDR_SEQ64_INDEX) ? 1 : 0, data_seq);
|
|
+
|
|
+ if (unlikely(tcb->mptcp_flags & MPTCPHDR_SEQ64_OFO)) {
|
|
+ /* We make sure that the data_seq is invalid.
|
|
+ * It will be dropped later.
|
|
+ */
|
|
+ tp->mptcp->map_data_seq += 0xFFFFFFFF;
|
|
+ tp->mptcp->map_data_seq += 0xFFFFFFFF;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (set_infinite_rcv)
|
|
+ mpcb->infinite_rcv_seq = tp->mptcp->map_data_seq;
|
|
+
|
|
+ tp->mptcp->map_data_len = data_len;
|
|
+ tp->mptcp->map_subseq = sub_seq;
|
|
+ tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0;
|
|
+ tp->mptcp->mapping_present = 1;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* Similar to tcp_sequence(...) */
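+/* Returns true when the DSS mapping [data_seq, end_data_seq] overlaps the
+ * meta-level receive window: the mapping must not end before rcv_wup and
+ * must not start beyond rcv_nxt + receive window, all compared in the
+ * 64-bit data-sequence space (a possible wrap of rcv_wup is handled below).
+ */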
|
|
+static inline bool mptcp_sequence(const struct tcp_sock *meta_tp,
|
|
+ u64 data_seq, u64 end_data_seq)
|
|
+{
|
|
+ const struct mptcp_cb *mpcb = meta_tp->mpcb;
|
|
+ u64 rcv_wup64;
|
|
+
|
|
+ /* Wrap-around? */
|
|
+ if (meta_tp->rcv_wup > meta_tp->rcv_nxt) {
|
|
+ rcv_wup64 = ((u64)(mpcb->rcv_high_order[mpcb->rcv_hiseq_index] - 1) << 32) |
|
|
+ meta_tp->rcv_wup;
|
|
+ } else {
|
|
+ rcv_wup64 = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
|
|
+ meta_tp->rcv_wup);
|
|
+ }
|
|
+
|
|
+ return !before64(end_data_seq, rcv_wup64) &&
|
|
+ !after64(data_seq, mptcp_get_rcv_nxt_64(meta_tp) + tcp_receive_window_now(meta_tp));
|
|
+}
|
|
+
|
|
+/* @return: 0 everything is fine. Just continue processing
|
|
+ * -1 this packet was broken - continue with the next one.
|
|
+ */
|
|
+static int mptcp_validate_mapping(struct sock *sk, struct sk_buff *skb)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct sk_buff *tmp, *tmp1;
|
|
+ u32 tcp_end_seq;
|
|
+
|
|
+ if (!tp->mptcp->mapping_present)
|
|
+ return 0;
|
|
+
|
|
+ /* either, the new skb gave us the mapping and the first segment
|
|
+ * in the sub-rcv-queue has to be trimmed ...
|
|
+ */
|
|
+ tmp = skb_peek(&sk->sk_receive_queue);
|
|
+ if (before(TCP_SKB_CB(tmp)->seq, tp->mptcp->map_subseq) &&
|
|
+ after(TCP_SKB_CB(tmp)->end_seq, tp->mptcp->map_subseq)) {
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DSSTRIMHEAD);
|
|
+ mptcp_skb_trim_head(tmp, sk, tp->mptcp->map_subseq);
|
|
+ }
|
|
+
|
|
+ /* ... or the new skb (tail) has to be split at the end. */
|
|
+ tcp_end_seq = TCP_SKB_CB(skb)->end_seq;
|
|
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
|
|
+ tcp_end_seq--;
|
|
+ if (after(tcp_end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) {
|
|
+ u32 seq = tp->mptcp->map_subseq + tp->mptcp->map_data_len;
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DSSSPLITTAIL);
|
|
+ if (mptcp_skb_split_tail(skb, sk, seq)) { /* Allocation failed */
|
|
+			/* TODO: maybe handle this better here.
|
|
+ * We now just force meta-retransmission.
|
|
+ */
|
|
+ tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
|
|
+ __skb_unlink(skb, &sk->sk_receive_queue);
|
|
+ __kfree_skb(skb);
|
|
+ return -1;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Now, remove old sk_buff's from the receive-queue.
|
|
+ * This may happen if the mapping has been lost for these segments and
|
|
+ * the next mapping has already been received.
|
|
+ */
|
|
+ if (before(TCP_SKB_CB(skb_peek(&sk->sk_receive_queue))->seq, tp->mptcp->map_subseq)) {
|
|
+ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
|
|
+ if (!before(TCP_SKB_CB(tmp1)->seq, tp->mptcp->map_subseq))
|
|
+ break;
|
|
+
|
|
+ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
|
|
+ __skb_unlink(tmp1, &sk->sk_receive_queue);
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PURGEOLD);
|
|
+			/* Impossible that we could free skb here, because its
|
|
+ * mapping is known to be valid from previous checks
|
|
+ */
|
|
+ __kfree_skb(tmp1);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* @return: 0 everything is fine. Just continue processing
|
|
+ * 1 subflow is broken stop everything
|
|
+ * -1 this mapping has been put in the meta-receive-queue
|
|
+ * -2 this mapping has been eaten by the application
|
|
+ */
|
|
+static int mptcp_queue_skb(struct sock *sk)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+ struct mptcp_cb *mpcb = tp->mpcb;
|
|
+ struct sk_buff *tmp, *tmp1;
|
|
+ u64 rcv_nxt64 = mptcp_get_rcv_nxt_64(meta_tp);
|
|
+ u32 old_copied_seq = tp->copied_seq;
|
|
+ bool data_queued = false;
|
|
+
|
|
+ /* Have we not yet received the full mapping? */
|
|
+ if (!tp->mptcp->mapping_present ||
|
|
+ before(tp->rcv_nxt, tp->mptcp->map_subseq + tp->mptcp->map_data_len))
|
|
+ return 0;
|
|
+
|
|
+ /* Is this an overlapping mapping? rcv_nxt >= end_data_seq
|
|
+ * OR
|
|
+ * This mapping is out of window
|
|
+ */
|
|
+ if (!before64(rcv_nxt64, tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin) ||
|
|
+ !mptcp_sequence(meta_tp, tp->mptcp->map_data_seq,
|
|
+ tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin)) {
|
|
+ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
|
|
+ __skb_unlink(tmp1, &sk->sk_receive_queue);
|
|
+ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
|
|
+ __kfree_skb(tmp1);
|
|
+
|
|
+ if (!skb_queue_empty(&sk->sk_receive_queue) &&
|
|
+ !before(TCP_SKB_CB(tmp)->seq,
|
|
+ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ mptcp_reset_mapping(tp, old_copied_seq);
|
|
+
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ /* Record it, because we want to send our data_fin on the same path */
|
|
+ if (tp->mptcp->map_data_fin) {
|
|
+ mpcb->dfin_path_index = tp->mptcp->path_index;
|
|
+ mpcb->dfin_combined = !!(sk->sk_shutdown & RCV_SHUTDOWN);
|
|
+ }
|
|
+
|
|
+ /* Verify the checksum */
|
|
+ if (mpcb->dss_csum && !mpcb->infinite_mapping_rcv) {
|
|
+ int ret = mptcp_verif_dss_csum(sk);
|
|
+
|
|
+ if (ret <= 0) {
|
|
+ mptcp_reset_mapping(tp, old_copied_seq);
|
|
+ return 1;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (before64(rcv_nxt64, tp->mptcp->map_data_seq)) {
|
|
+ /* Seg's have to go to the meta-ofo-queue */
|
|
+ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
|
|
+ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
|
|
+ mptcp_prepare_skb(tmp1, sk);
|
|
+ __skb_unlink(tmp1, &sk->sk_receive_queue);
|
|
+ /* MUST be done here, because fragstolen may be true later.
|
|
+ * Then, kfree_skb_partial will not account the memory.
|
|
+ */
|
|
+ skb_orphan(tmp1);
|
|
+
|
|
+ if (!mpcb->in_time_wait) /* In time-wait, do not receive data */
|
|
+ tcp_data_queue_ofo(meta_sk, tmp1);
|
|
+ else
|
|
+ __kfree_skb(tmp1);
|
|
+
|
|
+ if (!skb_queue_empty(&sk->sk_receive_queue) &&
|
|
+ !before(TCP_SKB_CB(tmp)->seq,
|
|
+ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
|
|
+ break;
|
|
+ }
|
|
+
|
|
+		/* Quick ACK if more than 3/4 of the receive window is filled */
|
|
+ if (after64(tp->mptcp->map_data_seq,
|
|
+ rcv_nxt64 + 3 * (tcp_receive_window_now(meta_tp) >> 2)))
|
|
+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
|
|
+
|
|
+ } else {
|
|
+ /* Ready for the meta-rcv-queue */
|
|
+ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
|
|
+ int eaten = 0;
|
|
+ bool fragstolen = false;
|
|
+ u32 old_rcv_nxt = meta_tp->rcv_nxt;
|
|
+
|
|
+ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
|
|
+ mptcp_prepare_skb(tmp1, sk);
|
|
+ __skb_unlink(tmp1, &sk->sk_receive_queue);
|
|
+ /* MUST be done here, because fragstolen may be true.
|
|
+ * Then, kfree_skb_partial will not account the memory.
|
|
+ */
|
|
+ skb_orphan(tmp1);
|
|
+
|
|
+ /* This segment has already been received */
|
|
+ if (!after(TCP_SKB_CB(tmp1)->end_seq, meta_tp->rcv_nxt)) {
|
|
+ __kfree_skb(tmp1);
|
|
+ goto next;
|
|
+ }
|
|
+
|
|
+ if (mpcb->in_time_wait) /* In time-wait, do not receive data */
|
|
+ eaten = 1;
|
|
+
|
|
+ if (!eaten)
|
|
+ eaten = tcp_queue_rcv(meta_sk, tmp1, &fragstolen);
|
|
+
|
|
+ meta_tp->rcv_nxt = TCP_SKB_CB(tmp1)->end_seq;
|
|
+
|
|
+ if (TCP_SKB_CB(tmp1)->tcp_flags & TCPHDR_FIN)
|
|
+ mptcp_fin(meta_sk);
|
|
+
|
|
+ /* Check if this fills a gap in the ofo queue */
|
|
+ if (!RB_EMPTY_ROOT(&meta_tp->out_of_order_queue))
|
|
+ tcp_ofo_queue(meta_sk);
|
|
+
|
|
+ mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
|
|
+
|
|
+ if (eaten)
|
|
+ kfree_skb_partial(tmp1, fragstolen);
|
|
+
|
|
+ data_queued = true;
|
|
+next:
|
|
+ if (!skb_queue_empty(&sk->sk_receive_queue) &&
|
|
+ !before(TCP_SKB_CB(tmp)->seq,
|
|
+ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ inet_csk(meta_sk)->icsk_ack.lrcvtime = tcp_jiffies32;
|
|
+ mptcp_reset_mapping(tp, old_copied_seq);
|
|
+
|
|
+ return data_queued ? -1 : -2;
|
|
+}
|
|
+
|
|
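+/* Subflow data-ready handler: walk the subflow's receive queue, detect and
+ * validate the DSS mappings and push every complete mapping up to the
+ * meta-level. Wakes the meta-socket up if data has been queued there.
+ */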
+void mptcp_data_ready(struct sock *sk)
|
|
+{
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+ struct sk_buff *skb, *tmp;
|
|
+ int queued = 0;
|
|
+
|
|
+ tcp_mstamp_refresh(tcp_sk(meta_sk));
|
|
+
|
|
+ /* restart before the check, because mptcp_fin might have changed the
|
|
+ * state.
|
|
+ */
|
|
+restart:
|
|
+ /* If the meta cannot receive data, there is no point in pushing data.
|
|
+ * If we are in time-wait, we may still be waiting for the final FIN.
|
|
+ * So, we should proceed with the processing.
|
|
+ */
|
|
+ if (!mptcp_sk_can_recv(meta_sk) && !tcp_sk(sk)->mpcb->in_time_wait) {
|
|
+ skb_queue_purge(&sk->sk_receive_queue);
|
|
+ tcp_sk(sk)->copied_seq = tcp_sk(sk)->rcv_nxt;
|
|
+ goto exit;
|
|
+ }
|
|
+
|
|
+ /* Iterate over all segments, detect their mapping (if we don't have
|
|
+ * one yet), validate them and push everything one level higher.
|
|
+ */
|
|
+ skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) {
|
|
+ int ret;
|
|
+ /* Pre-validation - e.g., early fallback */
|
|
+ ret = mptcp_prevalidate_skb(sk, skb);
|
|
+ if (ret < 0)
|
|
+ goto restart;
|
|
+ else if (ret > 0)
|
|
+ break;
|
|
+
|
|
+ /* Set the current mapping */
|
|
+ ret = mptcp_detect_mapping(sk, skb);
|
|
+ if (ret < 0)
|
|
+ goto restart;
|
|
+ else if (ret > 0)
|
|
+ break;
|
|
+
|
|
+ /* Validation */
|
|
+ if (mptcp_validate_mapping(sk, skb) < 0)
|
|
+ goto restart;
|
|
+
|
|
+ /* Push a level higher */
|
|
+ ret = mptcp_queue_skb(sk);
|
|
+ if (ret < 0) {
|
|
+ if (ret == -1)
|
|
+ queued = ret;
|
|
+ goto restart;
|
|
+ } else if (ret == 0) {
|
|
+ continue;
|
|
+ } else { /* ret == 1 */
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+exit:
|
|
+ if (tcp_sk(sk)->close_it && sk->sk_state == TCP_FIN_WAIT2) {
|
|
+ tcp_send_ack(sk);
|
|
+ tcp_sk(sk)->ops->time_wait(sk, TCP_TIME_WAIT, 0);
|
|
+ }
|
|
+
|
|
+ if (queued == -1 && !sock_flag(meta_sk, SOCK_DEAD))
|
|
+ meta_sk->sk_data_ready(meta_sk);
|
|
+}
|
|
+
|
|
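+/* Walk the TCP options of the skb and return a pointer to the MP_JOIN
+ * option, or NULL if none is present.
+ */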
+struct mp_join *mptcp_find_join(const struct sk_buff *skb)
|
|
+{
|
|
+ const struct tcphdr *th = tcp_hdr(skb);
|
|
+ unsigned char *ptr;
|
|
+ int length = (th->doff * 4) - sizeof(struct tcphdr);
|
|
+
|
|
+ /* Jump through the options to check whether JOIN is there */
|
|
+ ptr = (unsigned char *)(th + 1);
|
|
+ while (length > 0) {
|
|
+ int opcode = *ptr++;
|
|
+ int opsize;
|
|
+
|
|
+ switch (opcode) {
|
|
+ case TCPOPT_EOL:
|
|
+ return NULL;
|
|
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
|
|
+ length--;
|
|
+ continue;
|
|
+ default:
|
|
+ opsize = *ptr++;
|
|
+ if (opsize < 2) /* "silly options" */
|
|
+ return NULL;
|
|
+ if (opsize > length)
|
|
+ return NULL; /* don't parse partial options */
|
|
+ if (opcode == TCPOPT_MPTCP &&
|
|
+ ((struct mptcp_option *)(ptr - 2))->sub == MPTCP_SUB_JOIN) {
|
|
+ return (struct mp_join *)(ptr - 2);
|
|
+ }
|
|
+ ptr += opsize - 2;
|
|
+ length -= opsize;
|
|
+ }
|
|
+ }
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
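+/* Look up the meta-socket owning the token announced in a SYN + MP_JOIN and
+ * hand the skb to that meta-socket's receive path, so that the new subflow
+ * request can be created there.
+ * Returns 1 if the skb was processed, 0 if no MP_JOIN option is present and
+ * -1 on error.
+ */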
+int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw)
|
|
+{
|
|
+ struct sock *meta_sk;
|
|
+ u32 token;
|
|
+ bool meta_v4;
|
|
+ struct mp_join *join_opt = mptcp_find_join(skb);
|
|
+ if (!join_opt)
|
|
+ return 0;
|
|
+
|
|
+ /* MPTCP structures were not initialized, so return error */
|
|
+ if (mptcp_init_failed)
|
|
+ return -1;
|
|
+
|
|
+ token = join_opt->u.syn.token;
|
|
+ meta_sk = mptcp_hash_find(dev_net(skb_dst(skb)->dev), token);
|
|
+ if (!meta_sk) {
|
|
+ MPTCP_INC_STATS(dev_net(skb_dst(skb)->dev), MPTCP_MIB_JOINNOTOKEN);
|
|
+ mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ meta_v4 = meta_sk->sk_family == AF_INET;
|
|
+ if (meta_v4) {
|
|
+ if (skb->protocol == htons(ETH_P_IPV6)) {
|
|
+ mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n");
|
|
+ sock_put(meta_sk); /* Taken by mptcp_hash_find */
|
|
+ return -1;
|
|
+ }
|
|
+ } else if (skb->protocol == htons(ETH_P_IP) && meta_sk->sk_ipv6only) {
|
|
+ mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n");
|
|
+ sock_put(meta_sk); /* Taken by mptcp_hash_find */
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ /* Coming from time-wait-sock processing in tcp_v4_rcv.
|
|
+ * We have to deschedule it before continuing, because otherwise
|
|
+ * mptcp_v4_do_rcv will hit again on it inside tcp_v4_hnd_req.
|
|
+ */
|
|
+ if (tw)
|
|
+ inet_twsk_deschedule_put(tw);
|
|
+
|
|
+ /* OK, this is a new syn/join, let's create a new open request and
|
|
+ * send syn+ack
|
|
+ */
|
|
+ if (skb->protocol == htons(ETH_P_IP)) {
|
|
+ tcp_v4_do_rcv(meta_sk, skb);
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ } else {
|
|
+ tcp_v6_do_rcv(meta_sk, skb);
|
|
+#endif /* CONFIG_IPV6 */
|
|
+ }
|
|
+ sock_put(meta_sk); /* Taken by mptcp_hash_find */
|
|
+ return 1;
|
|
+}
|
|
+
|
|
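+/* Like mptcp_lookup_join(), but driven by an already parsed MP_JOIN token in
+ * @mopt: find the meta-socket and let it process the skb. An extra reference
+ * is taken on the skb because the calling context frees it afterwards.
+ */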
+int mptcp_do_join_short(struct sk_buff *skb,
|
|
+ const struct mptcp_options_received *mopt,
|
|
+ struct net *net)
|
|
+{
|
|
+ struct sock *meta_sk;
|
|
+ u32 token;
|
|
+ bool meta_v4;
|
|
+
|
|
+ token = mopt->mptcp_rem_token;
|
|
+ meta_sk = mptcp_hash_find(net, token);
|
|
+ if (!meta_sk) {
|
|
+ MPTCP_INC_STATS(dev_net(skb_dst(skb)->dev), MPTCP_MIB_JOINNOTOKEN);
|
|
+ mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ meta_v4 = meta_sk->sk_family == AF_INET;
|
|
+ if (meta_v4) {
|
|
+ if (skb->protocol == htons(ETH_P_IPV6)) {
|
|
+ mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n");
|
|
+ sock_put(meta_sk); /* Taken by mptcp_hash_find */
|
|
+ return -1;
|
|
+ }
|
|
+ } else if (skb->protocol == htons(ETH_P_IP) && meta_sk->sk_ipv6only) {
|
|
+ mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n");
|
|
+ sock_put(meta_sk); /* Taken by mptcp_hash_find */
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ /* OK, this is a new syn/join, let's create a new open request and
|
|
+ * send syn+ack
|
|
+ */
|
|
+
|
|
+ /* mptcp_v4_do_rcv tries to free the skb - we prevent this, as
|
|
+ * the skb will finally be freed by tcp_v4_do_rcv (where we are
|
|
+ * coming from)
|
|
+ */
|
|
+ skb_get(skb);
|
|
+ if (skb->protocol == htons(ETH_P_IP)) {
|
|
+ tcp_v4_do_rcv(meta_sk, skb);
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ } else { /* IPv6 */
|
|
+ tcp_v6_do_rcv(meta_sk, skb);
|
|
+#endif /* CONFIG_IPV6 */
|
|
+ }
|
|
+
|
|
+ sock_put(meta_sk); /* Taken by mptcp_hash_find */
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * Equivalent of tcp_fin() for MPTCP
|
|
+ * Can be called only when the FIN is validly part
|
|
+ * of the data seqnum space. Not before when we get holes.
|
|
+ * of the data seqnum space - not before, while there are still holes.
|
|
+void mptcp_fin(struct sock *meta_sk)
|
|
+{
|
|
+ struct sock *sk = NULL;
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ struct mptcp_cb *mpcb = meta_tp->mpcb;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ unsigned char state;
|
|
+
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (tcp_sk(sk_it)->mptcp->path_index == mpcb->dfin_path_index) {
|
|
+ sk = sk_it;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (!sk || sk->sk_state == TCP_CLOSE)
|
|
+ sk = mptcp_select_ack_sock(meta_sk);
|
|
+
|
|
+ inet_csk_schedule_ack(sk);
|
|
+
|
|
+ if (!mpcb->in_time_wait) {
|
|
+ meta_sk->sk_shutdown |= RCV_SHUTDOWN;
|
|
+ sock_set_flag(meta_sk, SOCK_DONE);
|
|
+ state = meta_sk->sk_state;
|
|
+ } else {
|
|
+ state = mpcb->mptw_state;
|
|
+ }
|
|
+
|
|
+ switch (state) {
|
|
+ case TCP_SYN_RECV:
|
|
+ case TCP_ESTABLISHED:
|
|
+ /* Move to CLOSE_WAIT */
|
|
+ tcp_set_state(meta_sk, TCP_CLOSE_WAIT);
|
|
+ inet_csk(sk)->icsk_ack.pingpong = 1;
|
|
+ break;
|
|
+
|
|
+ case TCP_CLOSE_WAIT:
|
|
+ case TCP_CLOSING:
|
|
+ /* Received a retransmission of the FIN, do
|
|
+ * nothing.
|
|
+ */
|
|
+ break;
|
|
+ case TCP_LAST_ACK:
|
|
+ /* RFC793: Remain in the LAST-ACK state. */
|
|
+ break;
|
|
+
|
|
+ case TCP_FIN_WAIT1:
|
|
+ /* This case occurs when a simultaneous close
|
|
+ * happens, we must ack the received FIN and
|
|
+ * enter the CLOSING state.
|
|
+ */
|
|
+ tcp_send_ack(sk);
|
|
+ tcp_set_state(meta_sk, TCP_CLOSING);
|
|
+ break;
|
|
+ case TCP_FIN_WAIT2:
|
|
+ /* Received a FIN -- send ACK and enter TIME_WAIT. */
|
|
+ tcp_send_ack(sk);
|
|
+ meta_tp->ops->time_wait(meta_sk, TCP_TIME_WAIT, 0);
|
|
+ break;
|
|
+ default:
|
|
+ /* Only TCP_LISTEN and TCP_CLOSE are left, in these
|
|
+ * cases we should never reach this piece of code.
|
|
+ */
|
|
+ pr_err("%s: Impossible, meta_sk->sk_state=%d\n", __func__,
|
|
+ meta_sk->sk_state);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+	/* It _is_ possible that we have something out-of-order _after_ FIN.
|
|
+ * Probably, we should reset in this case. For now drop them.
|
|
+ */
|
|
+ skb_rbtree_purge(&meta_tp->out_of_order_queue);
|
|
+ sk_mem_reclaim(meta_sk);
|
|
+
|
|
+ if (!sock_flag(meta_sk, SOCK_DEAD)) {
|
|
+ meta_sk->sk_state_change(meta_sk);
|
|
+
|
|
+ /* Do not send POLL_HUP for half duplex close. */
|
|
+ if (meta_sk->sk_shutdown == SHUTDOWN_MASK ||
|
|
+ meta_sk->sk_state == TCP_CLOSE)
|
|
+ sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_HUP);
|
|
+ else
|
|
+ sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_IN);
|
|
+ }
|
|
+
|
|
+ return;
|
|
+}
|
|
+
|
|
+/* Similar to tcp_xmit_retransmit_queue */
|
|
+static void mptcp_xmit_retransmit_queue(struct sock *meta_sk)
|
|
+{
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ struct sk_buff *skb, *rtx_head;
|
|
+
|
|
+ if (!meta_tp->packets_out)
|
|
+ return;
|
|
+
|
|
+ skb = rtx_head = tcp_rtx_queue_head(meta_sk);
|
|
+ skb_rbtree_walk_from(skb) {
|
|
+ if (mptcp_retransmit_skb(meta_sk, skb))
|
|
+ return;
|
|
+
|
|
+ if (skb == rtx_head)
|
|
+ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
|
|
+ inet_csk(meta_sk)->icsk_rto,
|
|
+ TCP_RTO_MAX);
|
|
+ }
|
|
+}
|
|
+
|
|
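+/* Advance the meta-level snd_una to the received DATA_ACK and account the
+ * newly acknowledged bytes. The meta-socket must be held by the caller
+ * (asserted via sock_owned_by_me()).
+ */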
+static void mptcp_snd_una_update(struct tcp_sock *meta_tp, u32 data_ack)
|
|
+{
|
|
+ u32 delta = data_ack - meta_tp->snd_una;
|
|
+
|
|
+ sock_owned_by_me((struct sock *)meta_tp);
|
|
+ meta_tp->bytes_acked += delta;
|
|
+ meta_tp->snd_una = data_ack;
|
|
+}
|
|
+
|
|
+static void mptcp_stop_subflow_chronos(struct sock *meta_sk,
|
|
+ const enum tcp_chrono type)
|
|
+{
|
|
+ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+
|
|
+ tcp_chrono_stop(sk_it, type);
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Handle the DATA_ACK */
|
|
+static bool mptcp_process_data_ack(struct sock *sk, const struct sk_buff *skb)
|
|
+{
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
|
|
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
|
|
+ u32 prior_snd_una = meta_tp->snd_una;
|
|
+ int prior_packets;
|
|
+ u32 nwin, data_ack, data_seq;
|
|
+ u16 data_len = 0;
|
|
+
|
|
+ /* A valid packet came in - subflow is operational again */
|
|
+ tp->pf = 0;
|
|
+
|
|
+ /* Even if there is no data-ack, we stop retransmitting.
|
|
+ * Except if this is a SYN/ACK. Then it is just a retransmission
|
|
+ */
|
|
+ if (tp->mptcp->pre_established && !tcp_hdr(skb)->syn) {
|
|
+ tp->mptcp->pre_established = 0;
|
|
+ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
|
|
+
|
|
+ if (meta_tp->mpcb->pm_ops->established_subflow)
|
|
+ meta_tp->mpcb->pm_ops->established_subflow(sk);
|
|
+ }
|
|
+
|
|
+ /* If we are in infinite mapping mode, rx_opt.data_ack has been
|
|
+ * set by mptcp_clean_rtx_infinite.
|
|
+ */
|
|
+ if (!(tcb->mptcp_flags & MPTCPHDR_ACK) && !tp->mpcb->infinite_mapping_snd)
|
|
+ return false;
|
|
+
|
|
+ if (unlikely(!tp->mptcp->fully_established) &&
|
|
+ tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq)
|
|
+ /* As soon as a subflow-data-ack (not acking syn, thus snt_isn + 1)
|
|
+ * includes a data-ack, we are fully established
|
|
+ */
|
|
+ mptcp_become_fully_estab(sk);
|
|
+
|
|
+ /* After we did the subflow-only processing (stopping timer and marking
|
|
+ * subflow as established), check if we can proceed with MPTCP-level
|
|
+ * processing.
|
|
+ */
|
|
+ if (meta_sk->sk_state == TCP_CLOSE)
|
|
+ return false;
|
|
+
|
|
+ /* Get the data_seq */
|
|
+ if (mptcp_is_data_seq(skb)) {
|
|
+ data_seq = tp->mptcp->rx_opt.data_seq;
|
|
+ data_len = tp->mptcp->rx_opt.data_len;
|
|
+ } else {
|
|
+ data_seq = meta_tp->snd_wl1;
|
|
+ }
|
|
+
|
|
+ data_ack = tp->mptcp->rx_opt.data_ack;
|
|
+
|
|
+ /* If the ack is older than previous acks
|
|
+ * then we can probably ignore it.
|
|
+ */
|
|
+ if (before(data_ack, prior_snd_una))
|
|
+ goto exit;
|
|
+
|
|
+ /* If the ack includes data we haven't sent yet, discard
|
|
+ * this segment (RFC793 Section 3.9).
|
|
+ */
|
|
+ if (after(data_ack, meta_tp->snd_nxt))
|
|
+ goto exit;
|
|
+
|
|
+ /* First valid DATA_ACK, we can stop sending the special MP_CAPABLE */
|
|
+ tp->mpcb->send_mptcpv1_mpcapable = 0;
|
|
+
|
|
+ /*** Now, update the window - inspired by tcp_ack_update_window ***/
|
|
+ nwin = ntohs(tcp_hdr(skb)->window);
|
|
+
|
|
+ if (likely(!tcp_hdr(skb)->syn))
|
|
+ nwin <<= tp->rx_opt.snd_wscale;
|
|
+
|
|
+ if (tcp_may_update_window(meta_tp, data_ack, data_seq, nwin)) {
|
|
+ tcp_update_wl(meta_tp, data_seq);
|
|
+
|
|
+ /* Draft v09, Section 3.3.5:
|
|
+ * [...] It should only update its local receive window values
|
|
+ * when the largest sequence number allowed (i.e. DATA_ACK +
|
|
+ * receive window) increases. [...]
|
|
+ */
|
|
+ if (meta_tp->snd_wnd != nwin &&
|
|
+ !before(data_ack + nwin, tcp_wnd_end(meta_tp))) {
|
|
+ meta_tp->snd_wnd = nwin;
|
|
+
|
|
+ if (nwin > meta_tp->max_window)
|
|
+ meta_tp->max_window = nwin;
|
|
+ }
|
|
+ }
|
|
+ /*** Done, update the window ***/
|
|
+
|
|
+ /* We passed data and got it acked, remove any soft error
|
|
+ * log. Something worked...
|
|
+ */
|
|
+ sk->sk_err_soft = 0;
|
|
+ inet_csk(meta_sk)->icsk_probes_out = 0;
|
|
+ meta_tp->rcv_tstamp = tcp_jiffies32;
|
|
+ prior_packets = meta_tp->packets_out;
|
|
+ if (!prior_packets)
|
|
+ goto no_queue;
|
|
+
|
|
+ mptcp_snd_una_update(meta_tp, data_ack);
|
|
+
|
|
+ mptcp_clean_rtx_queue(meta_sk, prior_snd_una);
|
|
+
|
|
+ /* We are in loss-state, and something got acked, retransmit the whole
|
|
+ * queue now!
|
|
+ */
|
|
+ if (inet_csk(meta_sk)->icsk_ca_state == TCP_CA_Loss &&
|
|
+ after(data_ack, prior_snd_una)) {
|
|
+ mptcp_xmit_retransmit_queue(meta_sk);
|
|
+ inet_csk(meta_sk)->icsk_ca_state = TCP_CA_Open;
|
|
+ }
|
|
+
|
|
+ /* Simplified version of tcp_new_space, because the snd-buffer
|
|
+ * is handled by all the subflows.
|
|
+ */
|
|
+ if (sock_flag(meta_sk, SOCK_QUEUE_SHRUNK)) {
|
|
+ sock_reset_flag(meta_sk, SOCK_QUEUE_SHRUNK);
|
|
+ if (meta_sk->sk_socket &&
|
|
+ test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
|
|
+ meta_sk->sk_write_space(meta_sk);
|
|
+
|
|
+ if (meta_sk->sk_socket &&
|
|
+ !test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags)) {
|
|
+ tcp_chrono_stop(meta_sk, TCP_CHRONO_SNDBUF_LIMITED);
|
|
+ mptcp_stop_subflow_chronos(meta_sk,
|
|
+ TCP_CHRONO_SNDBUF_LIMITED);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (meta_sk->sk_state != TCP_ESTABLISHED) {
|
|
+ int ret = mptcp_rcv_state_process(meta_sk, sk, skb, data_seq, data_len);
|
|
+
|
|
+ if (ret < 0)
|
|
+ return true;
|
|
+ else if (ret > 0)
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+exit:
|
|
+ mptcp_push_pending_frames(meta_sk);
|
|
+
|
|
+ return false;
|
|
+
|
|
+no_queue:
|
|
+ if (tcp_send_head(meta_sk))
|
|
+ tcp_ack_probe(meta_sk);
|
|
+
|
|
+ mptcp_push_pending_frames(meta_sk);
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(mptcp_meta_sk(sk));
|
|
+
|
|
+ if (!tp->mpcb->infinite_mapping_snd)
|
|
+ return;
|
|
+
|
|
+ /* The difference between both write_seq's represents the offset between
|
|
+	 * data-sequence and subflow-sequence. As we are in infinite-mapping mode, this must
|
|
+ * match.
|
|
+ *
|
|
+ * Thus, from this difference we can infer the meta snd_una.
|
|
+ */
|
|
+ tp->mptcp->rx_opt.data_ack = meta_tp->snd_nxt - tp->snd_nxt +
|
|
+ tp->snd_una;
|
|
+
|
|
+ mptcp_process_data_ack(sk, skb);
|
|
+}
|
|
+
|
|
+/**** static functions used by mptcp_parse_options */
|
|
+
|
|
+static void mptcp_send_reset_rem_id(const struct mptcp_cb *mpcb, u8 rem_id)
|
|
+{
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct hlist_node *tmp;
|
|
+
|
|
+ mptcp_for_each_sub_safe(mpcb, mptcp, tmp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (tcp_sk(sk_it)->mptcp->rem_id == rem_id) {
|
|
+ mptcp_reinject_data(sk_it, 0);
|
|
+ mptcp_send_reset(sk_it);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
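+/* Check the length of an ADD_ADDR option against the MPTCP version and the
+ * address family; the "+ 2" variants correspond to options that additionally
+ * carry a port number.
+ */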
+static inline bool is_valid_addropt_opsize(u8 mptcp_ver,
|
|
+ struct mp_add_addr *mpadd,
|
|
+ int opsize)
|
|
+{
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ if (mptcp_ver < MPTCP_VERSION_1 && mpadd->u_bit.v0.ipver == 6) {
|
|
+ return opsize == MPTCP_SUB_LEN_ADD_ADDR6 ||
|
|
+ opsize == MPTCP_SUB_LEN_ADD_ADDR6 + 2;
|
|
+ }
|
|
+ if (mptcp_ver >= MPTCP_VERSION_1)
|
|
+ return opsize == MPTCP_SUB_LEN_ADD_ADDR6_VER1 ||
|
|
+ opsize == MPTCP_SUB_LEN_ADD_ADDR6_VER1 + 2 ||
|
|
+ opsize == MPTCP_SUB_LEN_ADD_ADDR4_VER1 ||
|
|
+ opsize == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2;
|
|
+#endif
|
|
+ if (mptcp_ver < MPTCP_VERSION_1 && mpadd->u_bit.v0.ipver == 4) {
|
|
+ return opsize == MPTCP_SUB_LEN_ADD_ADDR4 ||
|
|
+ opsize == MPTCP_SUB_LEN_ADD_ADDR4 + 2;
|
|
+ }
|
|
+ if (mptcp_ver >= MPTCP_VERSION_1) {
|
|
+ return opsize == MPTCP_SUB_LEN_ADD_ADDR4_VER1 ||
|
|
+ opsize == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2;
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+void mptcp_parse_options(const uint8_t *ptr, int opsize,
|
|
+ struct mptcp_options_received *mopt,
|
|
+ const struct sk_buff *skb,
|
|
+ struct tcp_sock *tp)
|
|
+{
|
|
+ const struct mptcp_option *mp_opt = (struct mptcp_option *)ptr;
|
|
+ const struct tcphdr *th = tcp_hdr(skb);
|
|
+
|
|
+ /* If the socket is mp-capable we would have a mopt. */
|
|
+ if (!mopt)
|
|
+ return;
|
|
+
|
|
+ switch (mp_opt->sub) {
|
|
+ case MPTCP_SUB_CAPABLE:
|
|
+ {
|
|
+ const struct mp_capable *mpcapable = (struct mp_capable *)ptr;
|
|
+
|
|
+ if (mpcapable->ver == MPTCP_VERSION_0 &&
|
|
+ ((th->syn && opsize != MPTCP_SUB_LEN_CAPABLE_SYN) ||
|
|
+ (!th->syn && th->ack && opsize != MPTCP_SUB_LEN_CAPABLE_ACK))) {
|
|
+ mptcp_debug("%s: mp_capable v0: bad option size %d\n",
|
|
+ __func__, opsize);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (mpcapable->ver == MPTCP_VERSION_1 &&
|
|
+ ((th->syn && !th->ack && opsize != MPTCPV1_SUB_LEN_CAPABLE_SYN) ||
|
|
+ (th->syn && th->ack && opsize != MPTCPV1_SUB_LEN_CAPABLE_SYNACK) ||
|
|
+ (!th->syn && th->ack && opsize != MPTCPV1_SUB_LEN_CAPABLE_ACK &&
|
|
+ opsize != MPTCPV1_SUB_LEN_CAPABLE_DATA &&
|
|
+ opsize != MPTCPV1_SUB_LEN_CAPABLE_DATA_CSUM))) {
|
|
+ mptcp_debug("%s: mp_capable v1: bad option size %d\n",
|
|
+ __func__, opsize);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ /* MPTCP-RFC 6824:
|
|
+ * "If receiving a message with the 'B' flag set to 1, and this
|
|
+ * is not understood, then this SYN MUST be silently ignored;
|
|
+ */
|
|
+ if (mpcapable->b) {
|
|
+ mopt->drop_me = 1;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ /* MPTCP-RFC 6824:
|
|
+ * "An implementation that only supports this method MUST set
|
|
+ * bit "H" to 1, and bits "C" through "G" to 0."
|
|
+ */
|
|
+ if (!mpcapable->h)
|
|
+ break;
|
|
+
|
|
+ mopt->saw_mpc = 1;
|
|
+ mopt->dss_csum = sysctl_mptcp_checksum || mpcapable->a;
|
|
+
|
|
+ if (mpcapable->ver == MPTCP_VERSION_0) {
|
|
+ if (opsize == MPTCP_SUB_LEN_CAPABLE_SYN)
|
|
+ mopt->mptcp_sender_key = mpcapable->sender_key;
|
|
+
|
|
+ if (opsize == MPTCP_SUB_LEN_CAPABLE_ACK) {
|
|
+ mopt->mptcp_sender_key = mpcapable->sender_key;
|
|
+ mopt->mptcp_receiver_key = mpcapable->receiver_key;
|
|
+ }
|
|
+ } else if (mpcapable->ver == MPTCP_VERSION_1) {
|
|
+ if (opsize == MPTCPV1_SUB_LEN_CAPABLE_SYNACK)
|
|
+ mopt->mptcp_sender_key = mpcapable->sender_key;
|
|
+
|
|
+ if (opsize == MPTCPV1_SUB_LEN_CAPABLE_ACK) {
|
|
+ mopt->mptcp_sender_key = mpcapable->sender_key;
|
|
+ mopt->mptcp_receiver_key = mpcapable->receiver_key;
|
|
+ }
|
|
+
|
|
+ if (opsize == MPTCPV1_SUB_LEN_CAPABLE_DATA ||
|
|
+ opsize == MPTCPV1_SUB_LEN_CAPABLE_DATA_CSUM) {
|
|
+ mopt->mptcp_sender_key = mpcapable->sender_key;
|
|
+ mopt->mptcp_receiver_key = mpcapable->receiver_key;
|
|
+
|
|
+ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_MPC_DATA;
|
|
+
|
|
+ ptr += sizeof(struct mp_capable);
|
|
+ TCP_SKB_CB(skb)->dss_off = (ptr - skb_transport_header(skb));
|
|
+
|
|
+ /* Is a check-sum present? */
|
|
+ if (opsize == MPTCPV1_SUB_LEN_CAPABLE_DATA_CSUM)
|
|
+ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_DSS_CSUM;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ mopt->mptcp_ver = mpcapable->ver;
|
|
+ break;
|
|
+ }
|
|
+ case MPTCP_SUB_JOIN:
|
|
+ {
|
|
+ const struct mp_join *mpjoin = (struct mp_join *)ptr;
|
|
+
|
|
+ if (opsize != MPTCP_SUB_LEN_JOIN_SYN &&
|
|
+ opsize != MPTCP_SUB_LEN_JOIN_SYNACK &&
|
|
+ opsize != MPTCP_SUB_LEN_JOIN_ACK) {
|
|
+ mptcp_debug("%s: mp_join: bad option size %d\n",
|
|
+ __func__, opsize);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ /* saw_mpc must be set, because in tcp_check_req we assume that
|
|
+		 * it is set to support falling back to regular TCP if a retransmitted
|
|
+ * SYN has no MP_CAPABLE or MP_JOIN
|
|
+ */
|
|
+ switch (opsize) {
|
|
+ case MPTCP_SUB_LEN_JOIN_SYN:
|
|
+ mopt->is_mp_join = 1;
|
|
+ mopt->saw_mpc = 1;
|
|
+ mopt->low_prio = mpjoin->b;
|
|
+ mopt->rem_id = mpjoin->addr_id;
|
|
+ mopt->mptcp_rem_token = mpjoin->u.syn.token;
|
|
+ mopt->mptcp_recv_nonce = mpjoin->u.syn.nonce;
|
|
+ break;
|
|
+ case MPTCP_SUB_LEN_JOIN_SYNACK:
|
|
+ mopt->saw_mpc = 1;
|
|
+ mopt->low_prio = mpjoin->b;
|
|
+ mopt->rem_id = mpjoin->addr_id;
|
|
+ mopt->mptcp_recv_tmac = mpjoin->u.synack.mac;
|
|
+ mopt->mptcp_recv_nonce = mpjoin->u.synack.nonce;
|
|
+ break;
|
|
+ case MPTCP_SUB_LEN_JOIN_ACK:
|
|
+ mopt->saw_mpc = 1;
|
|
+ mopt->join_ack = 1;
|
|
+ memcpy(mopt->mptcp_recv_mac, mpjoin->u.ack.mac, 20);
|
|
+ break;
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ case MPTCP_SUB_DSS:
|
|
+ {
|
|
+ const struct mp_dss *mdss = (struct mp_dss *)ptr;
|
|
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
|
|
+
|
|
+ /* We check opsize for the csum and non-csum case. We do this,
|
|
+ * because the draft says that the csum SHOULD be ignored if
|
|
+ * it has not been negotiated in the MP_CAPABLE but still is
|
|
+ * present in the data.
|
|
+ *
|
|
+ * It will get ignored later in mptcp_queue_skb.
|
|
+ */
|
|
+ if (opsize != mptcp_sub_len_dss(mdss, 0) &&
|
|
+ opsize != mptcp_sub_len_dss(mdss, 1)) {
|
|
+ mptcp_debug("%s: mp_dss: bad option size %d\n",
|
|
+ __func__, opsize);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ ptr += 4;
|
|
+
|
|
+ if (mdss->A) {
|
|
+ tcb->mptcp_flags |= MPTCPHDR_ACK;
|
|
+
|
|
+ if (mdss->a) {
|
|
+ mopt->data_ack = (u32) get_unaligned_be64(ptr);
|
|
+ ptr += MPTCP_SUB_LEN_ACK_64;
|
|
+ } else {
|
|
+ mopt->data_ack = get_unaligned_be32(ptr);
|
|
+ ptr += MPTCP_SUB_LEN_ACK;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ tcb->dss_off = (ptr - skb_transport_header(skb));
|
|
+
|
|
+ if (mdss->M) {
|
|
+ if (mdss->m) {
|
|
+ u64 data_seq64 = get_unaligned_be64(ptr);
|
|
+
|
|
+ tcb->mptcp_flags |= MPTCPHDR_SEQ64_SET;
|
|
+ mopt->data_seq = (u32) data_seq64;
|
|
+
|
|
+ ptr += 12; /* 64-bit dseq + subseq */
|
|
+ } else {
|
|
+ mopt->data_seq = get_unaligned_be32(ptr);
|
|
+ ptr += 8; /* 32-bit dseq + subseq */
|
|
+ }
|
|
+ mopt->data_len = get_unaligned_be16(ptr);
|
|
+
|
|
+ tcb->mptcp_flags |= MPTCPHDR_SEQ;
|
|
+
|
|
+ /* Is a check-sum present? */
|
|
+ if (opsize == mptcp_sub_len_dss(mdss, 1))
|
|
+ tcb->mptcp_flags |= MPTCPHDR_DSS_CSUM;
|
|
+
|
|
+ /* DATA_FIN only possible with DSS-mapping */
|
|
+ if (mdss->F)
|
|
+ tcb->mptcp_flags |= MPTCPHDR_FIN;
|
|
+ }
|
|
+
|
|
+ break;
|
|
+ }
|
|
+ case MPTCP_SUB_ADD_ADDR:
|
|
+ {
|
|
+ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
|
|
+
|
|
+ /* If tcp_sock is not available, MPTCP version can't be
|
|
+ * retrieved and ADD_ADDR opsize validation is not possible.
|
|
+ */
|
|
+ if (!tp || !tp->mpcb)
|
|
+ break;
|
|
+
|
|
+ if (!is_valid_addropt_opsize(tp->mpcb->mptcp_ver,
|
|
+ mpadd, opsize)) {
|
|
+ mptcp_debug("%s: mp_add_addr: bad option size %d\n",
|
|
+ __func__, opsize);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ /* We have to manually parse the options if we got two of them. */
|
|
+ if (mopt->saw_add_addr) {
|
|
+ mopt->more_add_addr = 1;
|
|
+ break;
|
|
+ }
|
|
+ mopt->saw_add_addr = 1;
|
|
+ mopt->add_addr_ptr = ptr;
|
|
+ break;
|
|
+ }
|
|
+ case MPTCP_SUB_REMOVE_ADDR:
|
|
+ if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) {
|
|
+ mptcp_debug("%s: mp_remove_addr: bad option size %d\n",
|
|
+ __func__, opsize);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (mopt->saw_rem_addr) {
|
|
+ mopt->more_rem_addr = 1;
|
|
+ break;
|
|
+ }
|
|
+ mopt->saw_rem_addr = 1;
|
|
+ mopt->rem_addr_ptr = ptr;
|
|
+ break;
|
|
+ case MPTCP_SUB_PRIO:
|
|
+ {
|
|
+ const struct mp_prio *mpprio = (struct mp_prio *)ptr;
|
|
+
|
|
+ if (opsize != MPTCP_SUB_LEN_PRIO &&
|
|
+ opsize != MPTCP_SUB_LEN_PRIO_ADDR) {
|
|
+ mptcp_debug("%s: mp_prio: bad option size %d\n",
|
|
+ __func__, opsize);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ mopt->saw_low_prio = 1;
|
|
+ mopt->low_prio = mpprio->b;
|
|
+
|
|
+ if (opsize == MPTCP_SUB_LEN_PRIO_ADDR) {
|
|
+ mopt->saw_low_prio = 2;
|
|
+ mopt->prio_addr_id = mpprio->addr_id;
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ case MPTCP_SUB_FAIL:
|
|
+ if (opsize != MPTCP_SUB_LEN_FAIL) {
|
|
+ mptcp_debug("%s: mp_fail: bad option size %d\n",
|
|
+ __func__, opsize);
|
|
+ break;
|
|
+ }
|
|
+ mopt->mp_fail = 1;
|
|
+ break;
|
|
+ case MPTCP_SUB_FCLOSE:
|
|
+ if (opsize != MPTCP_SUB_LEN_FCLOSE) {
|
|
+ mptcp_debug("%s: mp_fclose: bad option size %d\n",
|
|
+ __func__, opsize);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ mopt->mp_fclose = 1;
|
|
+ mopt->mptcp_sender_key = ((struct mp_fclose *)ptr)->key;
|
|
+
|
|
+ break;
|
|
+ default:
|
|
+		mptcp_debug("%s: Received unknown subtype: %d\n",
|
|
+ __func__, mp_opt->sub);
|
|
+ break;
|
|
+ }
|
|
+}
|
|
+
|
|
+/** Parse only MPTCP options */
|
|
+void tcp_parse_mptcp_options(const struct sk_buff *skb,
|
|
+ struct mptcp_options_received *mopt)
|
|
+{
|
|
+ const struct tcphdr *th = tcp_hdr(skb);
|
|
+ int length = (th->doff * 4) - sizeof(struct tcphdr);
|
|
+ const unsigned char *ptr = (const unsigned char *)(th + 1);
|
|
+
|
|
+ while (length > 0) {
|
|
+ int opcode = *ptr++;
|
|
+ int opsize;
|
|
+
|
|
+ switch (opcode) {
|
|
+ case TCPOPT_EOL:
|
|
+ return;
|
|
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
|
|
+ length--;
|
|
+ continue;
|
|
+ default:
|
|
+ opsize = *ptr++;
|
|
+ if (opsize < 2) /* "silly options" */
|
|
+ return;
|
|
+ if (opsize > length)
|
|
+ return; /* don't parse partial options */
|
|
+ if (opcode == TCPOPT_MPTCP)
|
|
+ mptcp_parse_options(ptr - 2, opsize, mopt, skb, NULL);
|
|
+ }
|
|
+ ptr += opsize - 2;
|
|
+ length -= opsize;
|
|
+ }
|
|
+}
|
|
+
|
|
+bool mptcp_check_rtt(const struct tcp_sock *tp, int time)
|
|
+{
|
|
+ struct mptcp_cb *mpcb = tp->mpcb;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ u32 rtt_max = 0;
|
|
+
|
|
+ /* In MPTCP, we take the max delay across all flows,
|
|
+ * in order to take into account meta-reordering buffers.
|
|
+ */
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ struct sock *sk = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (!mptcp_sk_can_recv(sk))
|
|
+ continue;
|
|
+
|
|
+ if (rtt_max < tcp_sk(sk)->rcv_rtt_est.rtt_us)
|
|
+ rtt_max = tcp_sk(sk)->rcv_rtt_est.rtt_us;
|
|
+ }
|
|
+ if (time < (rtt_max >> 3) || !rtt_max)
|
|
+ return true;
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
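+/* Process an ADD_ADDR option: for MPTCP version 1, first verify the option's
+ * HMAC against the connection keys, then hand the announced address (and
+ * optional port) over to the path-manager.
+ */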
+static void mptcp_handle_add_addr(const unsigned char *ptr, struct sock *sk)
|
|
+{
|
|
+ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
|
|
+ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
|
|
+ union inet_addr addr;
|
|
+ sa_family_t family;
|
|
+ __be16 port = 0;
|
|
+ bool is_v4;
|
|
+
|
|
+ if (mpcb->mptcp_ver < MPTCP_VERSION_1) {
|
|
+ is_v4 = mpadd->u_bit.v0.ipver == 4;
|
|
+ } else {
|
|
+ is_v4 = mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1 ||
|
|
+ mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2;
|
|
+
|
|
+ /* TODO: support ADD_ADDRv1 retransmissions */
|
|
+ if (mpadd->u_bit.v1.echo)
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (is_v4) {
|
|
+ u8 hash_mac_check[SHA256_DIGEST_SIZE];
|
|
+ __be16 hmacport = 0;
|
|
+ char *recv_hmac;
|
|
+
|
|
+ if (mpcb->mptcp_ver < MPTCP_VERSION_1)
|
|
+ goto skip_hmac_v4;
|
|
+
|
|
+ recv_hmac = (char *)mpadd->u.v4.mac;
|
|
+ if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1) {
|
|
+ recv_hmac -= sizeof(mpadd->u.v4.port);
|
|
+ } else if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2) {
|
|
+ hmacport = mpadd->u.v4.port;
|
|
+ }
|
|
+ mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_rem_key,
|
|
+ (u8 *)&mpcb->mptcp_loc_key, hash_mac_check, 3,
|
|
+ 1, (u8 *)&mpadd->addr_id,
|
|
+ 4, (u8 *)&mpadd->u.v4.addr.s_addr,
|
|
+ 2, (u8 *)&hmacport);
|
|
+ if (memcmp(&hash_mac_check[SHA256_DIGEST_SIZE - sizeof(u64)], recv_hmac, 8) != 0)
|
|
+ /* ADD_ADDR2 discarded */
|
|
+ return;
|
|
+skip_hmac_v4:
|
|
+ if ((mpcb->mptcp_ver == MPTCP_VERSION_0 &&
|
|
+ mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
|
|
+ (mpcb->mptcp_ver == MPTCP_VERSION_1 &&
|
|
+ mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2))
|
|
+ port = mpadd->u.v4.port;
|
|
+ family = AF_INET;
|
|
+ addr.in = mpadd->u.v4.addr;
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ } else {
|
|
+ u8 hash_mac_check[SHA256_DIGEST_SIZE];
|
|
+ __be16 hmacport = 0;
|
|
+ char *recv_hmac;
|
|
+
|
|
+ if (mpcb->mptcp_ver < MPTCP_VERSION_1)
|
|
+ goto skip_hmac_v6;
|
|
+
|
|
+ recv_hmac = (char *)mpadd->u.v6.mac;
|
|
+ if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6_VER1) {
|
|
+ recv_hmac -= sizeof(mpadd->u.v6.port);
|
|
+ } else if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6_VER1 + 2) {
|
|
+ hmacport = mpadd->u.v6.port;
|
|
+ }
|
|
+ mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_rem_key,
|
|
+ (u8 *)&mpcb->mptcp_loc_key, hash_mac_check, 3,
|
|
+ 1, (u8 *)&mpadd->addr_id,
|
|
+ 16, (u8 *)&mpadd->u.v6.addr.s6_addr,
|
|
+ 2, (u8 *)&hmacport);
|
|
+ if (memcmp(&hash_mac_check[SHA256_DIGEST_SIZE - sizeof(u64)], recv_hmac, 8) != 0)
|
|
+ /* ADD_ADDR2 discarded */
|
|
+ return;
|
|
+skip_hmac_v6:
|
|
+ if ((mpcb->mptcp_ver == MPTCP_VERSION_0 &&
|
|
+ mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6 + 2) ||
|
|
+ (mpcb->mptcp_ver == MPTCP_VERSION_1 &&
|
|
+ mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6_VER1 + 2))
|
|
+ port = mpadd->u.v6.port;
|
|
+ family = AF_INET6;
|
|
+ addr.in6 = mpadd->u.v6.addr;
|
|
+#endif /* CONFIG_IPV6 */
|
|
+ }
|
|
+
|
|
+ if (mpcb->pm_ops->add_raddr)
|
|
+ mpcb->pm_ops->add_raddr(mpcb, &addr, family, port, mpadd->addr_id);
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRRX);
|
|
+}
|
|
+
|
|
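+/* Process a REMOVE_ADDR option: for every address-id it lists, notify the
+ * path-manager and reset the subflows that were established towards that id.
+ */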
+static void mptcp_handle_rem_addr(const unsigned char *ptr, struct sock *sk)
|
|
+{
|
|
+ struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
|
|
+ int i;
|
|
+ u8 rem_id;
|
|
+ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
|
|
+
|
|
+ for (i = 0; i <= mprem->len - MPTCP_SUB_LEN_REMOVE_ADDR; i++) {
|
|
+ rem_id = (&mprem->addrs_id)[i];
|
|
+
|
|
+ if (mpcb->pm_ops->rem_raddr)
|
|
+ mpcb->pm_ops->rem_raddr(mpcb, rem_id);
|
|
+ mptcp_send_reset_rem_id(mpcb, rem_id);
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_REMADDRSUB);
|
|
+ }
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_REMADDRRX);
|
|
+}
|
|
+
|
|
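+/* Slow path taken when a segment carries more than one ADD_ADDR or
+ * REMOVE_ADDR option: re-walk the TCP options and handle each of them.
+ */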
+static void mptcp_parse_addropt(const struct sk_buff *skb, struct sock *sk)
|
|
+{
|
|
+ struct tcphdr *th = tcp_hdr(skb);
|
|
+ unsigned char *ptr;
|
|
+ int length = (th->doff * 4) - sizeof(struct tcphdr);
|
|
+
|
|
+ /* Jump through the options to check whether ADD_ADDR is there */
|
|
+ ptr = (unsigned char *)(th + 1);
|
|
+ while (length > 0) {
|
|
+ int opcode = *ptr++;
|
|
+ int opsize;
|
|
+
|
|
+ switch (opcode) {
|
|
+ case TCPOPT_EOL:
|
|
+ return;
|
|
+ case TCPOPT_NOP:
|
|
+ length--;
|
|
+ continue;
|
|
+ default:
|
|
+ opsize = *ptr++;
|
|
+ if (opsize < 2)
|
|
+ return;
|
|
+ if (opsize > length)
|
|
+ return; /* don't parse partial options */
|
|
+ if (opcode == TCPOPT_MPTCP &&
|
|
+ ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_ADD_ADDR) {
|
|
+ u8 mptcp_ver = tcp_sk(sk)->mpcb->mptcp_ver;
|
|
+ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
|
|
+
|
|
+ if (!is_valid_addropt_opsize(mptcp_ver, mpadd,
|
|
+ opsize))
|
|
+ goto cont;
|
|
+
|
|
+ mptcp_handle_add_addr(ptr, sk);
|
|
+ }
|
|
+ if (opcode == TCPOPT_MPTCP &&
|
|
+ ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_REMOVE_ADDR) {
|
|
+ if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0)
|
|
+ goto cont;
|
|
+
|
|
+ mptcp_handle_rem_addr(ptr, sk);
|
|
+ }
|
|
+cont:
|
|
+ ptr += opsize - 2;
|
|
+ length -= opsize;
|
|
+ }
|
|
+ }
|
|
+ return;
|
|
+}
|
|
+
|
|
+static bool mptcp_mp_fastclose_rcvd(struct sock *sk)
|
|
+{
|
|
+ struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp;
|
|
+ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
|
|
+
|
|
+ if (likely(!mptcp->rx_opt.mp_fclose))
|
|
+ return false;
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FASTCLOSERX);
|
|
+ mptcp->rx_opt.mp_fclose = 0;
|
|
+ if (mptcp->rx_opt.mptcp_sender_key != mpcb->mptcp_loc_key)
|
|
+ return false;
|
|
+
|
|
+ mptcp_sub_force_close_all(mpcb, NULL);
|
|
+
|
|
+ tcp_reset(mptcp_meta_sk(sk));
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static void mptcp_mp_fail_rcvd(struct sock *sk, const struct tcphdr *th)
|
|
+{
|
|
+ struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp;
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILRX);
|
|
+ mptcp->rx_opt.mp_fail = 0;
|
|
+
|
|
+ if (!th->rst && !mpcb->infinite_mapping_snd) {
|
|
+ mpcb->send_infinite_mapping = 1;
|
|
+
|
|
+ mptcp_restart_sending(meta_sk);
|
|
+
|
|
+ mptcp_fallback_close(mpcb, sk);
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline void mptcp_path_array_check(struct sock *meta_sk)
|
|
+{
|
|
+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+
|
|
+ if (unlikely(mpcb->list_rcvd)) {
|
|
+ mpcb->list_rcvd = 0;
|
|
+ if (mpcb->pm_ops->new_remote_address)
|
|
+ mpcb->pm_ops->new_remote_address(meta_sk);
|
|
+ }
|
|
+}
|
|
+
|
|
+bool mptcp_handle_options(struct sock *sk, const struct tcphdr *th,
|
|
+ const struct sk_buff *skb)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
|
|
+ struct mptcp_cb *mpcb = tp->mpcb;
|
|
+
|
|
+ if (tp->mpcb->infinite_mapping_rcv || tp->mpcb->infinite_mapping_snd)
|
|
+ return false;
|
|
+
|
|
+ if (mptcp_mp_fastclose_rcvd(sk))
|
|
+ return true;
|
|
+
|
|
+ if (sk->sk_state == TCP_RST_WAIT && !th->rst)
|
|
+ return true;
|
|
+
|
|
+ if (mopt->saw_mpc && !tp->mpcb->rem_key_set)
|
|
+ mptcp_initialize_recv_vars(mptcp_meta_tp(tp), tp->mpcb,
|
|
+ mopt->mptcp_sender_key);
|
|
+
|
|
+ if (unlikely(mopt->mp_fail))
|
|
+ mptcp_mp_fail_rcvd(sk, th);
|
|
+
|
|
+ /* RFC 6824, Section 3.3:
|
|
+ * If a checksum is not present when its use has been negotiated, the
|
|
+ * receiver MUST close the subflow with a RST as it is considered broken.
|
|
+ */
|
|
+ if ((mptcp_is_data_seq(skb) || mptcp_is_data_mpcapable(skb)) &&
|
|
+ tp->mpcb->dss_csum &&
|
|
+ !(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_DSS_CSUM)) {
|
|
+ mptcp_send_reset(sk);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ /* We have to acknowledge retransmissions of the third
|
|
+ * ack.
|
|
+ */
|
|
+ if (mopt->join_ack) {
|
|
+ tcp_send_delayed_ack(sk);
|
|
+ mopt->join_ack = 0;
|
|
+ }
|
|
+
|
|
+ if (mopt->saw_add_addr || mopt->saw_rem_addr) {
|
|
+ if (mopt->more_add_addr || mopt->more_rem_addr) {
|
|
+ mptcp_parse_addropt(skb, sk);
|
|
+ } else {
|
|
+ if (mopt->saw_add_addr)
|
|
+ mptcp_handle_add_addr(mopt->add_addr_ptr, sk);
|
|
+ if (mopt->saw_rem_addr)
|
|
+ mptcp_handle_rem_addr(mopt->rem_addr_ptr, sk);
|
|
+ }
|
|
+
|
|
+ mopt->more_add_addr = 0;
|
|
+ mopt->saw_add_addr = 0;
|
|
+ mopt->more_rem_addr = 0;
|
|
+ mopt->saw_rem_addr = 0;
|
|
+ }
|
|
+ if (mopt->saw_low_prio) {
|
|
+ if (mopt->saw_low_prio == 1) {
|
|
+ tp->mptcp->rcv_low_prio = mopt->low_prio;
|
|
+ if (mpcb->pm_ops->prio_changed)
|
|
+ mpcb->pm_ops->prio_changed(sk, mopt->low_prio);
|
|
+ } else {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ mptcp_for_each_sub(tp->mpcb, mptcp) {
|
|
+ if (mptcp->rem_id == mopt->prio_addr_id) {
|
|
+ mptcp->rcv_low_prio = mopt->low_prio;
|
|
+ if (mpcb->pm_ops->prio_changed)
|
|
+ mpcb->pm_ops->prio_changed(sk,
|
|
+ mopt->low_prio);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ mopt->saw_low_prio = 0;
|
|
+ }
|
|
+
|
|
+ if (mptcp_process_data_ack(sk, skb))
|
|
+ return true;
|
|
+
|
|
+ mptcp_path_array_check(mptcp_meta_sk(sk));
|
|
+ /* Socket may have been mp_killed by a REMOVE_ADDR */
|
|
+ if (tp->mp_killed)
|
|
+ return true;
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static void _mptcp_rcv_synsent_fastopen(struct sock *meta_sk,
|
|
+ struct sk_buff *skb, bool rtx_queue)
|
|
+{
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ struct tcp_sock *master_tp = tcp_sk(meta_tp->mpcb->master_sk);
|
|
+ u32 new_mapping = meta_tp->write_seq - master_tp->snd_una;
|
|
+
|
|
+	/* If the server only partially acknowledges the data sent in
|
|
+ * the SYN, we need to trim the acknowledged part because
|
|
+ * we don't want to retransmit this already received data.
|
|
+ * When we reach this point, tcp_ack() has already cleaned up
|
|
+ * fully acked segments. However, tcp trims partially acked
|
|
+ * segments only when retransmitting. Since MPTCP comes into
|
|
+ * play only now, we will fake an initial transmit, and
|
|
+ * retransmit_skb() will not be called. The following fragment
|
|
+ * comes from __tcp_retransmit_skb().
|
|
+ */
|
|
+ if (before(TCP_SKB_CB(skb)->seq, master_tp->snd_una)) {
|
|
+ BUG_ON(before(TCP_SKB_CB(skb)->end_seq, master_tp->snd_una));
|
|
+ /* tcp_trim_head can only return ENOMEM if skb is
|
|
+ * cloned. It is not the case here (see
|
|
+ * tcp_send_syn_data).
|
|
+ */
|
|
+ BUG_ON(tcp_trim_head(meta_sk, skb, master_tp->snd_una -
|
|
+ TCP_SKB_CB(skb)->seq));
|
|
+ }
|
|
+
|
|
+ TCP_SKB_CB(skb)->seq += new_mapping;
|
|
+ TCP_SKB_CB(skb)->end_seq += new_mapping;
|
|
+ TCP_SKB_CB(skb)->sacked = 0;
|
|
+
|
|
+ list_del(&skb->tcp_tsorted_anchor);
|
|
+
|
|
+ if (rtx_queue)
|
|
+ tcp_rtx_queue_unlink(skb, meta_sk);
|
|
+
|
|
+ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
|
|
+
|
|
+ if (rtx_queue)
|
|
+ tcp_add_write_queue_tail(meta_sk, skb);
|
|
+}
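To make the remapping arithmetic above concrete, here is a minimal stand-alone sketch (not part of this patch; the seg struct and the seq_before()/remap_segment() helpers are made up): the already-acked prefix is trimmed away, then the remaining bytes are shifted by the offset between the meta-level write_seq and the subflow-level snd_una.

#include <stdint.h>
#include <stdio.h>

/* wrap-around safe "a is before b", like the kernel's before() */
static int seq_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

struct seg {
	uint32_t seq;
	uint32_t end_seq;
};

/* Trim the part the peer already acked, then move the segment from
 * subflow sequence space into data sequence space by "offset" bytes.
 */
static void remap_segment(struct seg *s, uint32_t snd_una, uint32_t offset)
{
	if (seq_before(s->seq, snd_una))
		s->seq = snd_una;
	s->seq += offset;
	s->end_seq += offset;
}

int main(void)
{
	/* server acked the first 100 of 300 TFO bytes; data seqs sit 1000 higher */
	struct seg s = { .seq = 0, .end_seq = 300 };

	remap_segment(&s, 100, 1000);
	printf("seq=%u end_seq=%u\n", s.seq, s.end_seq); /* prints seq=1100 end_seq=1300 */
	return 0;
}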
|
|
+
|
|
+/* In case of fastopen, some data can already be in the write queue.
|
|
+ * We need to update the sequence number of the segments as they
|
|
+ * were initially TCP sequence numbers.
|
|
+ */
|
|
+static void mptcp_rcv_synsent_fastopen(struct sock *meta_sk)
|
|
+{
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ struct tcp_sock *master_tp = tcp_sk(meta_tp->mpcb->master_sk);
|
|
+ struct sk_buff *skb_write_head, *skb_rtx_head, *tmp;
|
|
+
|
|
+ skb_write_head = tcp_write_queue_head(meta_sk);
|
|
+ skb_rtx_head = tcp_rtx_queue_head(meta_sk);
|
|
+
|
|
+ if (!(skb_write_head || skb_rtx_head))
|
|
+ return;
|
|
+
|
|
+ /* There should only be one skb in {write, rtx} queue: the data not
|
|
+ * acknowledged in the SYN+ACK. In this case, we need to map
|
|
+ * this data to data sequence numbers.
|
|
+ */
|
|
+
|
|
+ BUG_ON(skb_write_head && skb_rtx_head);
|
|
+
|
|
+ if (skb_write_head) {
|
|
+ skb_queue_walk_from_safe(&meta_sk->sk_write_queue,
|
|
+ skb_write_head, tmp) {
|
|
+ _mptcp_rcv_synsent_fastopen(meta_sk, skb_write_head,
|
|
+ false);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (skb_rtx_head) {
|
|
+ skb_rbtree_walk_from_safe(skb_rtx_head, tmp) {
|
|
+ _mptcp_rcv_synsent_fastopen(meta_sk, skb_rtx_head,
|
|
+ true);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* We can advance write_seq by the number of unacknowledged bytes
|
|
+ * that were mapped in the previous loop.
|
|
+ */
|
|
+ meta_tp->write_seq += master_tp->write_seq - master_tp->snd_una;
|
|
+
|
|
+ /* The packets from the master_sk will be entailed to it later
|
|
+ * Until that time, its write queue is empty, and
|
|
+ * write_seq must align with snd_una
|
|
+ */
|
|
+ master_tp->snd_nxt = master_tp->write_seq = master_tp->snd_una;
|
|
+ master_tp->packets_out = 0;
|
|
+ tcp_clear_retrans(meta_tp);
|
|
+ tcp_clear_retrans(master_tp);
|
|
+ tcp_set_ca_state(meta_tp->mpcb->master_sk, TCP_CA_Open);
|
|
+ tcp_set_ca_state(meta_sk, TCP_CA_Open);
|
|
+}
|
|
+
|
|
+/* The skptr is needed, because if we become MPTCP-capable, we have to switch
|
|
+ * from meta-socket to master-socket.
|
|
+ *
|
|
+ * @return: 1 - we want to reset this connection
|
|
+ * 2 - we want to discard the received syn/ack
|
|
+ * 0 - everything is fine - continue
|
|
+ */
|
|
+int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
|
|
+ const struct sk_buff *skb,
|
|
+ const struct mptcp_options_received *mopt)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+
|
|
+ if (mptcp(tp)) {
|
|
+ u8 hash_mac_check[SHA256_DIGEST_SIZE];
|
|
+ struct mptcp_cb *mpcb = tp->mpcb;
|
|
+
|
|
+ mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_rem_key,
|
|
+ (u8 *)&mpcb->mptcp_loc_key, hash_mac_check, 2,
|
|
+ 4, (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
|
|
+ 4, (u8 *)&tp->mptcp->mptcp_loc_nonce);
|
|
+ if (memcmp(hash_mac_check,
|
|
+ (char *)&tp->mptcp->rx_opt.mptcp_recv_tmac, 8)) {
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKMAC);
|
|
+ mptcp_sub_force_close(sk);
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
+ /* Set this flag in order to postpone data sending
|
|
+ * until the 4th ack arrives.
|
|
+ */
|
|
+ tp->mptcp->pre_established = 1;
|
|
+ tp->mptcp->rcv_low_prio = tp->mptcp->rx_opt.low_prio;
|
|
+
|
|
+ mptcp_hmac(mpcb->mptcp_ver, (u8 *)&mpcb->mptcp_loc_key,
|
|
+ (u8 *)&mpcb->mptcp_rem_key,
|
|
+ tp->mptcp->sender_mac, 2,
|
|
+ 4, (u8 *)&tp->mptcp->mptcp_loc_nonce,
|
|
+ 4, (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce);
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
|
|
+ } else if (mopt->saw_mpc) {
|
|
+ struct sock *meta_sk = sk;
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK);
|
|
+ if (mopt->mptcp_ver > tcp_sk(sk)->mptcp_ver)
|
|
+ /* TODO Consider adding new MPTCP_INC_STATS entry */
|
|
+ goto fallback;
|
|
+ if (tcp_sk(sk)->mptcp_ver == MPTCP_VERSION_1 &&
|
|
+ mopt->mptcp_ver < MPTCP_VERSION_1)
|
|
+ /* TODO Consider adding new MPTCP_INC_STATS entry */
|
|
+ /* TODO - record this in the cache - use v0 next time */
|
|
+ goto fallback;
|
|
+
|
|
+ if (mptcp_create_master_sk(sk, mopt->mptcp_sender_key, 1,
|
|
+ mopt->mptcp_ver,
|
|
+ ntohs(tcp_hdr(skb)->window)))
|
|
+ return 2;
|
|
+
|
|
+ sk = tcp_sk(sk)->mpcb->master_sk;
|
|
+ *skptr = sk;
|
|
+ tp = tcp_sk(sk);
|
|
+
|
|
+ /* If fastopen was used data might be in the send queue. We
|
|
+ * need to update their sequence number to MPTCP-level seqno.
|
|
+ * Note that it can happen in rare cases that fastopen_req is
|
|
+ * NULL and syn_data is 0 but fastopen indeed occurred and
|
|
+ * data has been queued in the write queue (but not sent).
|
|
+ * Example of such rare cases: connect is non-blocking and
|
|
+ * TFO is configured to work without cookies.
|
|
+ */
|
|
+ mptcp_rcv_synsent_fastopen(meta_sk);
|
|
+
|
|
+ /* -1, because the SYN consumed 1 byte. In case of TFO, we
|
|
+ * start the subflow-sequence number as if the data of the SYN
|
|
+ * is not part of any mapping.
|
|
+ */
|
|
+ tp->mptcp->snt_isn = tp->snd_una - 1;
|
|
+ tp->mpcb->dss_csum = mopt->dss_csum;
|
|
+ if (tp->mpcb->dss_csum)
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CSUMENABLED);
|
|
+
|
|
+ if (tp->mpcb->mptcp_ver >= MPTCP_VERSION_1)
|
|
+ tp->mpcb->send_mptcpv1_mpcapable = 1;
|
|
+
|
|
+ tp->mptcp->include_mpc = 1;
|
|
+
|
|
+ sk_set_socket(sk, meta_sk->sk_socket);
|
|
+ sk->sk_wq = meta_sk->sk_wq;
|
|
+
|
|
+ bh_unlock_sock(sk);
|
|
+ /* drop the extra hold taken in sk_clone_lock (refcount initialized to 2) */
|
|
+ sock_put(sk);
|
|
+ } else {
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
|
|
+fallback:
|
|
+ tp->request_mptcp = 0;
|
|
+
|
|
+ if (tp->inside_tk_table)
|
|
+ mptcp_hash_remove_bh(tp);
|
|
+ }
|
|
+
|
|
+ if (mptcp(tp))
|
|
+ tp->mptcp->rcv_isn = TCP_SKB_CB(skb)->seq;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* Similar to tcp_should_expand_sndbuf */
|
|
+bool mptcp_should_expand_sndbuf(const struct sock *sk)
|
|
+{
|
|
+ const struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ const struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ /* We circumvent this check in tcp_check_space, because we want to
|
|
+ * always call sk_write_space. So, we reproduce the check here.
|
|
+ */
|
|
+ if (!meta_sk->sk_socket ||
|
|
+ !test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
|
|
+ return false;
|
|
+
|
|
+ /* If the user specified a specific send buffer setting, do
|
|
+ * not modify it.
|
|
+ */
|
|
+ if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK)
|
|
+ return false;
|
|
+
|
|
+ /* If we are under global TCP memory pressure, do not expand. */
|
|
+ if (tcp_under_memory_pressure(meta_sk))
|
|
+ return false;
|
|
+
|
|
+ /* If we are under soft global TCP memory pressure, do not expand. */
|
|
+ if (sk_memory_allocated(meta_sk) >= sk_prot_mem_limits(meta_sk, 0))
|
|
+ return false;
|
|
+
|
|
+ /* For MPTCP we look for a subsocket that could send data.
|
|
+ * If we find one, we update the send-buffer.
|
|
+ */
|
|
+ mptcp_for_each_sub(meta_tp->mpcb, mptcp) {
|
|
+ const struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+ const struct tcp_sock *tp_it = tcp_sk(sk_it);
|
|
+
|
|
+ if (!mptcp_sk_can_send(sk_it))
|
|
+ continue;
|
|
+
|
|
+ if (tcp_packets_in_flight(tp_it) < tp_it->snd_cwnd)
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+void mptcp_tcp_set_rto(struct sock *sk)
|
|
+{
|
|
+ tcp_set_rto(sk);
|
|
+ mptcp_set_rto(sk);
|
|
+}
|
|
diff --git a/net/mptcp/mptcp_ipv4.c b/net/mptcp/mptcp_ipv4.c
|
|
new file mode 100644
|
|
index 000000000000..0370a7680d47
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mptcp_ipv4.c
|
|
@@ -0,0 +1,431 @@
|
|
+/*
|
|
+ * MPTCP implementation - IPv4-specific functions
|
|
+ *
|
|
+ * Initial Design & Implementation:
|
|
+ * Sébastien Barré <sebastien.barre@uclouvain.be>
|
|
+ *
|
|
+ * Current Maintainer:
|
|
+ * Christoph Paasch <christoph.paasch@uclouvain.be>
|
|
+ *
|
|
+ * Additional authors:
|
|
+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
|
|
+ * Gregory Detal <gregory.detal@uclouvain.be>
|
|
+ * Fabien Duchêne <fabien.duchene@uclouvain.be>
|
|
+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
|
|
+ * Lavkesh Lahngir <lavkesh51@gmail.com>
|
|
+ * Andreas Ripke <ripke@neclab.eu>
|
|
+ * Vlad Dogaru <vlad.dogaru@intel.com>
|
|
+ * Octavian Purdila <octavian.purdila@intel.com>
|
|
+ * John Ronan <jronan@tssg.org>
|
|
+ * Catalin Nicutar <catalin.nicutar@gmail.com>
|
|
+ * Brandon Heller <brandonh@stanford.edu>
|
|
+ *
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU General Public License
|
|
+ * as published by the Free Software Foundation; either version
|
|
+ * 2 of the License, or (at your option) any later version.
|
|
+ */
|
|
+
|
|
+#include <linux/export.h>
|
|
+#include <linux/ip.h>
|
|
+#include <linux/list.h>
|
|
+#include <linux/skbuff.h>
|
|
+#include <linux/spinlock.h>
|
|
+#include <linux/tcp.h>
|
|
+
|
|
+#include <net/inet_common.h>
|
|
+#include <net/inet_connection_sock.h>
|
|
+#include <net/mptcp.h>
|
|
+#include <net/mptcp_v4.h>
|
|
+#include <net/request_sock.h>
|
|
+#include <net/tcp.h>
|
|
+
|
|
+u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport)
|
|
+{
|
|
+ return siphash_4u32((__force u32)saddr, (__force u32)daddr,
|
|
+ (__force u32)sport << 16 | (__force u32)dport,
|
|
+ mptcp_seed++, &mptcp_secret);
|
|
+}
|
|
+
|
|
+u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
|
|
+ u32 seed)
|
|
+{
|
|
+ return siphash_2u64((__force u64)saddr << 32 | (__force u64)daddr,
|
|
+ (__force u64)seed << 32 | (__force u64)sport << 16 | (__force u64)dport,
|
|
+ &mptcp_secret);
|
|
+}
|
|
+
|
|
+
|
|
+static void mptcp_v4_reqsk_destructor(struct request_sock *req)
|
|
+{
|
|
+ mptcp_reqsk_destructor(req);
|
|
+
|
|
+ tcp_v4_reqsk_destructor(req);
|
|
+}
|
|
+
|
|
+static int mptcp_v4_init_req(struct request_sock *req, const struct sock *sk,
|
|
+ struct sk_buff *skb, bool want_cookie)
|
|
+{
|
|
+ tcp_request_sock_ipv4_ops.init_req(req, sk, skb, want_cookie);
|
|
+
|
|
+ mptcp_rsk(req)->hash_entry.pprev = NULL;
|
|
+ mptcp_rsk(req)->is_sub = 0;
|
|
+ inet_rsk(req)->mptcp_rqsk = 1;
|
|
+
|
|
+ /* In case of SYN-cookies, we wait for the isn to be generated - it is
|
|
+ * input to the key-generation.
|
|
+ */
|
|
+ if (!want_cookie)
|
|
+ mptcp_reqsk_init(req, sk, skb, false);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_SYN_COOKIES
|
|
+static u32 mptcp_v4_cookie_init_seq(struct request_sock *req, const struct sock *sk,
|
|
+ const struct sk_buff *skb, __u16 *mssp)
|
|
+{
|
|
+ __u32 isn = cookie_v4_init_sequence(req, sk, skb, mssp);
|
|
+
|
|
+ tcp_rsk(req)->snt_isn = isn;
|
|
+
|
|
+ mptcp_reqsk_init(req, sk, skb, true);
|
|
+
|
|
+ return isn;
|
|
+}
|
|
+#endif
|
|
+
|
|
+/* May be called without holding the meta-level lock */
|
|
+static int mptcp_v4_join_init_req(struct request_sock *req, const struct sock *meta_sk,
|
|
+ struct sk_buff *skb, bool want_cookie)
|
|
+{
|
|
+ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
|
|
+ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ union inet_addr addr;
|
|
+ int loc_id;
|
|
+ bool low_prio = false;
|
|
+
|
|
+ if (!mpcb->rem_key_set)
|
|
+ return -1;
|
|
+
|
|
+ /* We need to do this as early as possible, because if we fail later
|
|
+ * (e.g., get_local_id), then reqsk_free tries to remove the
|
|
+ * request-socket from the htb in mptcp_hash_request_remove as pprev
|
|
+ * may be different from NULL.
|
|
+ */
|
|
+ mtreq->hash_entry.pprev = NULL;
|
|
+
|
|
+ tcp_request_sock_ipv4_ops.init_req(req, meta_sk, skb, want_cookie);
|
|
+
|
|
+ mtreq->mptcp_loc_nonce = mptcp_v4_get_nonce(ip_hdr(skb)->saddr,
|
|
+ ip_hdr(skb)->daddr,
|
|
+ tcp_hdr(skb)->source,
|
|
+ tcp_hdr(skb)->dest);
|
|
+ addr.ip = inet_rsk(req)->ir_loc_addr;
|
|
+ loc_id = mpcb->pm_ops->get_local_id(meta_sk, AF_INET, &addr, &low_prio);
|
|
+ if (loc_id == -1)
|
|
+ return -1;
|
|
+ mtreq->loc_id = loc_id;
|
|
+ mtreq->low_prio = low_prio;
|
|
+
|
|
+ mptcp_join_reqsk_init(mpcb, req, skb);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* Similar to tcp_request_sock_ops */
|
|
+struct request_sock_ops mptcp_request_sock_ops __read_mostly = {
|
|
+ .family = PF_INET,
|
|
+ .obj_size = sizeof(struct mptcp_request_sock),
|
|
+ .rtx_syn_ack = tcp_rtx_synack,
|
|
+ .send_ack = tcp_v4_reqsk_send_ack,
|
|
+ .destructor = mptcp_v4_reqsk_destructor,
|
|
+ .send_reset = tcp_v4_send_reset,
|
|
+ .syn_ack_timeout = tcp_syn_ack_timeout,
|
|
+};
|
|
+
|
|
+/* Similar to: tcp_v4_conn_request
|
|
+ * May be called without holding the meta-level lock
|
|
+ */
|
|
+static int mptcp_v4_join_request(struct sock *meta_sk, struct sk_buff *skb)
|
|
+{
|
|
+ return tcp_conn_request(&mptcp_request_sock_ops,
|
|
+ &mptcp_join_request_sock_ipv4_ops,
|
|
+ meta_sk, skb);
|
|
+}
|
|
+
|
|
+/* Similar to: tcp_v4_do_rcv
|
|
+ * We only process join requests here. (either the SYN or the final ACK)
|
|
+ */
|
|
+int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
|
|
+{
|
|
+ const struct tcphdr *th = tcp_hdr(skb);
|
|
+ const struct iphdr *iph = ip_hdr(skb);
|
|
+ struct sock *child, *rsk = NULL, *sk;
|
|
+ int ret;
|
|
+
|
|
+ sk = inet_lookup_established(sock_net(meta_sk), &tcp_hashinfo,
|
|
+ iph->saddr, th->source, iph->daddr,
|
|
+ th->dest, inet_iif(skb));
|
|
+
|
|
+ if (!sk)
|
|
+ goto new_subflow;
|
|
+
|
|
+ if (is_meta_sk(sk)) {
|
|
+ WARN("%s Did not find a sub-sk - did found the meta!\n", __func__);
|
|
+ sock_put(sk);
|
|
+ goto discard;
|
|
+ }
|
|
+
|
|
+ if (sk->sk_state == TCP_TIME_WAIT) {
|
|
+ inet_twsk_put(inet_twsk(sk));
|
|
+ goto discard;
|
|
+ }
|
|
+
|
|
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
|
|
+ struct request_sock *req = inet_reqsk(sk);
|
|
+ bool req_stolen;
|
|
+
|
|
+ if (!mptcp_can_new_subflow(meta_sk))
|
|
+ goto reset_and_discard;
|
|
+
|
|
+ local_bh_disable();
|
|
+ child = tcp_check_req(meta_sk, skb, req, false, &req_stolen);
|
|
+ if (!child) {
|
|
+ reqsk_put(req);
|
|
+ local_bh_enable();
|
|
+ goto discard;
|
|
+ }
|
|
+
|
|
+ if (child != meta_sk) {
|
|
+ ret = mptcp_finish_handshake(child, skb);
|
|
+ if (ret) {
|
|
+ rsk = child;
|
|
+ local_bh_enable();
|
|
+ goto reset_and_discard;
|
|
+ }
|
|
+
|
|
+ bh_unlock_sock(meta_sk);
|
|
+ local_bh_enable();
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ /* tcp_check_req failed */
|
|
+ reqsk_put(req);
|
|
+
|
|
+ local_bh_enable();
|
|
+ goto discard;
|
|
+ }
|
|
+
|
|
+ ret = tcp_v4_do_rcv(sk, skb);
|
|
+ sock_put(sk);
|
|
+
|
|
+ return ret;
|
|
+
|
|
+new_subflow:
|
|
+ if (!mptcp_can_new_subflow(meta_sk))
|
|
+ goto reset_and_discard;
|
|
+
|
|
+ child = tcp_v4_cookie_check(meta_sk, skb);
|
|
+ if (!child)
|
|
+ goto discard;
|
|
+
|
|
+ if (child != meta_sk) {
|
|
+ ret = mptcp_finish_handshake(child, skb);
|
|
+ if (ret) {
|
|
+ rsk = child;
|
|
+ goto reset_and_discard;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (tcp_hdr(skb)->syn) {
|
|
+ local_bh_disable();
|
|
+ mptcp_v4_join_request(meta_sk, skb);
|
|
+ local_bh_enable();
|
|
+ }
|
|
+
|
|
+discard:
|
|
+ kfree_skb(skb);
|
|
+ return 0;
|
|
+
|
|
+reset_and_discard:
|
|
+ tcp_v4_send_reset(rsk, skb);
|
|
+ goto discard;
|
|
+}
|
|
+
|
|
+/* Create a new IPv4 subflow.
|
|
+ *
|
|
+ * We are in user context and the meta-sock lock is held.
|
|
+ */
|
|
+int __mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
|
|
+ __be16 sport, struct mptcp_rem4 *rem,
|
|
+ struct sock **subsk)
|
|
+{
|
|
+ struct tcp_sock *tp;
|
|
+ struct sock *sk;
|
|
+ struct sockaddr_in loc_in, rem_in;
|
|
+ struct socket_alloc sock_full;
|
|
+ struct socket *sock = (struct socket *)&sock_full;
|
|
+ int ret;
|
|
+
|
|
+ /** First, create and prepare the new socket */
|
|
+ memcpy(&sock_full, meta_sk->sk_socket, sizeof(sock_full));
|
|
+ sock->state = SS_UNCONNECTED;
|
|
+ sock->ops = NULL;
|
|
+
|
|
+ ret = inet_create(sock_net(meta_sk), sock, IPPROTO_TCP, 1);
|
|
+ if (unlikely(ret < 0)) {
|
|
+ net_err_ratelimited("%s inet_create failed ret: %d\n",
|
|
+ __func__, ret);
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ sk = sock->sk;
|
|
+ tp = tcp_sk(sk);
|
|
+
|
|
+ /* All subsockets need the MPTCP-lock-class */
|
|
+ lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, meta_slock_key_name);
|
|
+ lockdep_init_map(&(sk)->sk_lock.dep_map, meta_key_name, &meta_key, 0);
|
|
+
|
|
+ ret = mptcp_add_sock(meta_sk, sk, loc->loc4_id, rem->rem4_id, GFP_KERNEL);
|
|
+ if (ret) {
|
|
+ net_err_ratelimited("%s mptcp_add_sock failed ret: %d\n",
|
|
+ __func__, ret);
|
|
+ goto error;
|
|
+ }
|
|
+
|
|
+ tp->mptcp->slave_sk = 1;
|
|
+ tp->mptcp->low_prio = loc->low_prio;
|
|
+
|
|
+ /* Initializing the timer for an MPTCP subflow */
|
|
+ timer_setup(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, 0);
|
|
+
|
|
+ /** Then, connect the socket to the peer */
|
|
+ loc_in.sin_family = AF_INET;
|
|
+ rem_in.sin_family = AF_INET;
|
|
+ loc_in.sin_port = sport;
|
|
+ if (rem->port)
|
|
+ rem_in.sin_port = rem->port;
|
|
+ else
|
|
+ rem_in.sin_port = inet_sk(meta_sk)->inet_dport;
|
|
+ loc_in.sin_addr = loc->addr;
|
|
+ rem_in.sin_addr = rem->addr;
|
|
+
|
|
+ if (loc->if_idx)
|
|
+ sk->sk_bound_dev_if = loc->if_idx;
|
|
+
|
|
+ ret = kernel_bind(sock, (struct sockaddr *)&loc_in,
|
|
+ sizeof(struct sockaddr_in));
|
|
+ if (ret < 0) {
|
|
+ net_err_ratelimited("%s: token %#x bind() to %pI4 index %d failed, error %d\n",
|
|
+ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
|
|
+ &loc_in.sin_addr, loc->if_idx, ret);
|
|
+ goto error;
|
|
+ }
|
|
+
|
|
+ mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d ifidx: %d\n",
|
|
+ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
|
|
+ tp->mptcp->path_index, &loc_in.sin_addr,
|
|
+ ntohs(loc_in.sin_port), &rem_in.sin_addr,
|
|
+ ntohs(rem_in.sin_port), loc->if_idx);
|
|
+
|
|
+ if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4)
|
|
+ tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4(sk, rem->addr);
|
|
+
|
|
+ ret = kernel_connect(sock, (struct sockaddr *)&rem_in,
|
|
+ sizeof(struct sockaddr_in), O_NONBLOCK);
|
|
+ if (ret < 0 && ret != -EINPROGRESS) {
|
|
+ net_err_ratelimited("%s: MPTCP subsocket connect() failed, error %d\n",
|
|
+ __func__, ret);
|
|
+ goto error;
|
|
+ }
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINSYNTX);
|
|
+
|
|
+ sk_set_socket(sk, meta_sk->sk_socket);
|
|
+ sk->sk_wq = meta_sk->sk_wq;
|
|
+
|
|
+ if (subsk)
|
|
+ *subsk = sk;
|
|
+
|
|
+ return 0;
|
|
+
|
|
+error:
|
|
+ /* May happen if mptcp_add_sock fails first */
|
|
+ if (!mptcp(tp)) {
|
|
+ tcp_close(sk, 0);
|
|
+ } else {
|
|
+ local_bh_disable();
|
|
+ mptcp_sub_force_close(sk);
|
|
+ local_bh_enable();
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+EXPORT_SYMBOL(__mptcp_init4_subsockets);
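The subflow setup above boils down to: create a TCP socket, bind() it to the chosen local address (and optionally a device), then connect() in non-blocking mode and treat EINPROGRESS as the normal "handshake in flight" case. A rough userspace analogue, purely illustrative (the connect_from() helper name and the example addresses are invented):

#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <stdint.h>
#include <sys/socket.h>
#include <unistd.h>

/* Open a TCP socket bound to local_ip and start a non-blocking connect. */
static int connect_from(const char *local_ip, const char *remote_ip,
			uint16_t remote_port)
{
	struct sockaddr_in loc = { .sin_family = AF_INET };
	struct sockaddr_in rem = { .sin_family = AF_INET,
				   .sin_port = htons(remote_port) };
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	inet_pton(AF_INET, local_ip, &loc.sin_addr);
	inet_pton(AF_INET, remote_ip, &rem.sin_addr);

	/* bind() picks the source address for this subflow-like socket */
	if (bind(fd, (struct sockaddr *)&loc, sizeof(loc)) < 0)
		goto err;

	/* non-blocking connect: EINPROGRESS means the handshake is in flight */
	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);
	if (connect(fd, (struct sockaddr *)&rem, sizeof(rem)) < 0 &&
	    errno != EINPROGRESS)
		goto err;

	return fd;
err:
	close(fd);
	return -1;
}

int main(void)
{
	int fd = connect_from("192.0.2.1", "198.51.100.1", 80); /* example addresses */

	if (fd >= 0)
		close(fd);
	return fd >= 0 ? 0 : 1;
}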
|
|
+
|
|
+const struct inet_connection_sock_af_ops mptcp_v4_specific = {
|
|
+ .queue_xmit = ip_queue_xmit,
|
|
+ .send_check = tcp_v4_send_check,
|
|
+ .rebuild_header = inet_sk_rebuild_header,
|
|
+ .sk_rx_dst_set = inet_sk_rx_dst_set,
|
|
+ .conn_request = mptcp_conn_request,
|
|
+ .syn_recv_sock = tcp_v4_syn_recv_sock,
|
|
+ .net_header_len = sizeof(struct iphdr),
|
|
+ .setsockopt = ip_setsockopt,
|
|
+ .getsockopt = ip_getsockopt,
|
|
+ .addr2sockaddr = inet_csk_addr2sockaddr,
|
|
+ .sockaddr_len = sizeof(struct sockaddr_in),
|
|
+#ifdef CONFIG_COMPAT
|
|
+ .compat_setsockopt = compat_ip_setsockopt,
|
|
+ .compat_getsockopt = compat_ip_getsockopt,
|
|
+#endif
|
|
+ .mtu_reduced = tcp_v4_mtu_reduced,
|
|
+};
|
|
+
|
|
+struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops;
|
|
+struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops;
|
|
+
|
|
+/* General initialization of IPv4 for MPTCP */
|
|
+int mptcp_pm_v4_init(void)
|
|
+{
|
|
+ int ret = 0;
|
|
+ struct request_sock_ops *ops = &mptcp_request_sock_ops;
|
|
+
|
|
+ mptcp_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
|
|
+ mptcp_request_sock_ipv4_ops.init_req = mptcp_v4_init_req;
|
|
+#ifdef CONFIG_SYN_COOKIES
|
|
+ mptcp_request_sock_ipv4_ops.cookie_init_seq = mptcp_v4_cookie_init_seq;
|
|
+#endif
|
|
+ mptcp_join_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
|
|
+ mptcp_join_request_sock_ipv4_ops.init_req = mptcp_v4_join_init_req;
|
|
+
|
|
+ ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP");
|
|
+ if (ops->slab_name == NULL) {
|
|
+ ret = -ENOMEM;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
|
|
+ SLAB_TYPESAFE_BY_RCU|SLAB_HWCACHE_ALIGN,
|
|
+ NULL);
|
|
+
|
|
+ if (ops->slab == NULL) {
|
|
+ ret = -ENOMEM;
|
|
+ goto err_reqsk_create;
|
|
+ }
|
|
+
|
|
+out:
|
|
+ return ret;
|
|
+
|
|
+err_reqsk_create:
|
|
+ kfree(ops->slab_name);
|
|
+ ops->slab_name = NULL;
|
|
+ goto out;
|
|
+}
|
|
+
|
|
+void mptcp_pm_v4_undo(void)
|
|
+{
|
|
+ kmem_cache_destroy(mptcp_request_sock_ops.slab);
|
|
+ kfree(mptcp_request_sock_ops.slab_name);
|
|
+}
|
|
diff --git a/net/mptcp/mptcp_ipv6.c b/net/mptcp/mptcp_ipv6.c
|
|
new file mode 100644
|
|
index 000000000000..8af32df4fd5f
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mptcp_ipv6.c
|
|
@@ -0,0 +1,479 @@
|
|
+/*
|
|
+ * MPTCP implementation - IPv6-specific functions
|
|
+ *
|
|
+ * Initial Design & Implementation:
|
|
+ * Sébastien Barré <sebastien.barre@uclouvain.be>
|
|
+ *
|
|
+ * Current Maintainer:
|
|
+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
|
|
+ *
|
|
+ * Additional authors:
|
|
+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
|
|
+ * Gregory Detal <gregory.detal@uclouvain.be>
|
|
+ * Fabien Duchêne <fabien.duchene@uclouvain.be>
|
|
+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
|
|
+ * Lavkesh Lahngir <lavkesh51@gmail.com>
|
|
+ * Andreas Ripke <ripke@neclab.eu>
|
|
+ * Vlad Dogaru <vlad.dogaru@intel.com>
|
|
+ * Octavian Purdila <octavian.purdila@intel.com>
|
|
+ * John Ronan <jronan@tssg.org>
|
|
+ * Catalin Nicutar <catalin.nicutar@gmail.com>
|
|
+ * Brandon Heller <brandonh@stanford.edu>
|
|
+ *
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU General Public License
|
|
+ * as published by the Free Software Foundation; either version
|
|
+ * 2 of the License, or (at your option) any later version.
|
|
+ */
|
|
+
|
|
+#include <linux/export.h>
|
|
+#include <linux/in6.h>
|
|
+#include <linux/kernel.h>
|
|
+
|
|
+#include <net/addrconf.h>
|
|
+#include <net/flow.h>
|
|
+#include <net/inet6_connection_sock.h>
|
|
+#include <net/inet6_hashtables.h>
|
|
+#include <net/inet_common.h>
|
|
+#include <net/ipv6.h>
|
|
+#include <net/ip6_checksum.h>
|
|
+#include <net/ip6_route.h>
|
|
+#include <net/mptcp.h>
|
|
+#include <net/mptcp_v6.h>
|
|
+#include <net/tcp.h>
|
|
+#include <net/transp_v6.h>
|
|
+
|
|
+__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
|
|
+ __be16 sport, __be16 dport)
|
|
+{
|
|
+ const struct {
|
|
+ struct in6_addr saddr;
|
|
+ struct in6_addr daddr;
|
|
+ u32 seed;
|
|
+ __be16 sport;
|
|
+ __be16 dport;
|
|
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
|
|
+ .saddr = *(struct in6_addr *)saddr,
|
|
+ .daddr = *(struct in6_addr *)daddr,
|
|
+ .seed = mptcp_seed++,
|
|
+ .sport = sport,
|
|
+ .dport = dport
|
|
+ };
|
|
+
|
|
+ return siphash(&combined, offsetofend(typeof(combined), dport),
|
|
+ &mptcp_secret);
|
|
+}
|
|
+
|
|
+u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
|
|
+ __be16 sport, __be16 dport, u32 seed)
|
|
+{
|
|
+ const struct {
|
|
+ struct in6_addr saddr;
|
|
+ struct in6_addr daddr;
|
|
+ u32 seed;
|
|
+ __be16 sport;
|
|
+ __be16 dport;
|
|
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
|
|
+ .saddr = *(struct in6_addr *)saddr,
|
|
+ .daddr = *(struct in6_addr *)daddr,
|
|
+ .seed = seed,
|
|
+ .sport = sport,
|
|
+ .dport = dport
|
|
+ };
|
|
+
|
|
+ return siphash(&combined, offsetofend(typeof(combined), dport),
|
|
+ &mptcp_secret);
|
|
+}
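The two helpers above hash a packed on-stack struct and bound the hashed region with offsetofend() rather than sizeof(), so that trailing padding after dport never feeds into the hash input. A small stand-alone sketch of that idiom (illustrative only; the struct layout here merely approximates the one used above):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#ifndef offsetofend
#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))
#endif

struct combined {
	uint8_t  saddr[16];
	uint8_t  daddr[16];
	uint32_t seed;
	uint16_t sport;
	uint16_t dport;
} __attribute__((aligned(8)));

int main(void)
{
	/* Hash exactly up to the end of dport; sizeof() could include padding. */
	printf("hashed length: %zu, sizeof: %zu\n",
	       offsetofend(struct combined, dport), sizeof(struct combined));
	return 0;
}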
|
|
+
|
|
+static void mptcp_v6_reqsk_destructor(struct request_sock *req)
|
|
+{
|
|
+ mptcp_reqsk_destructor(req);
|
|
+
|
|
+ tcp_v6_reqsk_destructor(req);
|
|
+}
|
|
+
|
|
+static int mptcp_v6_init_req(struct request_sock *req, const struct sock *sk,
|
|
+ struct sk_buff *skb, bool want_cookie)
|
|
+{
|
|
+ tcp_request_sock_ipv6_ops.init_req(req, sk, skb, want_cookie);
|
|
+
|
|
+ mptcp_rsk(req)->hash_entry.pprev = NULL;
|
|
+ mptcp_rsk(req)->is_sub = 0;
|
|
+ inet_rsk(req)->mptcp_rqsk = 1;
|
|
+
|
|
+ /* In case of SYN-cookies, we wait for the isn to be generated - it is
|
|
+ * input to the key-generation.
|
|
+ */
|
|
+ if (!want_cookie)
|
|
+ mptcp_reqsk_init(req, sk, skb, false);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_SYN_COOKIES
|
|
+static u32 mptcp_v6_cookie_init_seq(struct request_sock *req, const struct sock *sk,
|
|
+ const struct sk_buff *skb, __u16 *mssp)
|
|
+{
|
|
+ __u32 isn = cookie_v6_init_sequence(req, sk, skb, mssp);
|
|
+
|
|
+ tcp_rsk(req)->snt_isn = isn;
|
|
+
|
|
+ mptcp_reqsk_init(req, sk, skb, true);
|
|
+
|
|
+ return isn;
|
|
+}
|
|
+#endif
|
|
+
|
|
+/* May be called without holding the meta-level lock */
|
|
+static int mptcp_v6_join_init_req(struct request_sock *req, const struct sock *meta_sk,
|
|
+ struct sk_buff *skb, bool want_cookie)
|
|
+{
|
|
+ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
|
|
+ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ union inet_addr addr;
|
|
+ int loc_id;
|
|
+ bool low_prio = false;
|
|
+
|
|
+ if (!mpcb->rem_key_set)
|
|
+ return -1;
|
|
+
|
|
+ /* We need to do this as early as possible, because if we fail later
|
|
+ * (e.g., get_local_id), then reqsk_free tries to remove the
|
|
+ * request-socket from the htb in mptcp_hash_request_remove as pprev
|
|
+ * may be different from NULL.
|
|
+ */
|
|
+ mtreq->hash_entry.pprev = NULL;
|
|
+
|
|
+ tcp_request_sock_ipv6_ops.init_req(req, meta_sk, skb, want_cookie);
|
|
+
|
|
+ mtreq->mptcp_loc_nonce = mptcp_v6_get_nonce(ipv6_hdr(skb)->saddr.s6_addr32,
|
|
+ ipv6_hdr(skb)->daddr.s6_addr32,
|
|
+ tcp_hdr(skb)->source,
|
|
+ tcp_hdr(skb)->dest);
|
|
+ addr.in6 = inet_rsk(req)->ir_v6_loc_addr;
|
|
+ loc_id = mpcb->pm_ops->get_local_id(meta_sk, AF_INET6, &addr, &low_prio);
|
|
+ if (loc_id == -1)
|
|
+ return -1;
|
|
+ mtreq->loc_id = loc_id;
|
|
+ mtreq->low_prio = low_prio;
|
|
+
|
|
+ mptcp_join_reqsk_init(mpcb, req, skb);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* Similar to tcp6_request_sock_ops */
|
|
+struct request_sock_ops mptcp6_request_sock_ops __read_mostly = {
|
|
+ .family = AF_INET6,
|
|
+ .obj_size = sizeof(struct mptcp_request_sock),
|
|
+ .rtx_syn_ack = tcp_rtx_synack,
|
|
+ .send_ack = tcp_v6_reqsk_send_ack,
|
|
+ .destructor = mptcp_v6_reqsk_destructor,
|
|
+ .send_reset = tcp_v6_send_reset,
|
|
+ .syn_ack_timeout = tcp_syn_ack_timeout,
|
|
+};
|
|
+
|
|
+/* Similar to: tcp_v6_conn_request
|
|
+ * May be called without holding the meta-level lock
|
|
+ */
|
|
+static int mptcp_v6_join_request(struct sock *meta_sk, struct sk_buff *skb)
|
|
+{
|
|
+ return tcp_conn_request(&mptcp6_request_sock_ops,
|
|
+ &mptcp_join_request_sock_ipv6_ops,
|
|
+ meta_sk, skb);
|
|
+}
|
|
+
|
|
+int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
|
|
+{
|
|
+ const struct tcphdr *th = tcp_hdr(skb);
|
|
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
|
|
+ struct sock *child, *rsk = NULL, *sk;
|
|
+ int ret;
|
|
+
|
|
+ sk = __inet6_lookup_established(sock_net(meta_sk),
|
|
+ &tcp_hashinfo,
|
|
+ &ip6h->saddr, th->source,
|
|
+ &ip6h->daddr, ntohs(th->dest),
|
|
+ tcp_v6_iif(skb), tcp_v6_sdif(skb));
|
|
+
|
|
+ if (!sk)
|
|
+ goto new_subflow;
|
|
+
|
|
+ if (is_meta_sk(sk)) {
|
|
+ WARN("%s Did not find a sub-sk - did found the meta!\n", __func__);
|
|
+ sock_put(sk);
|
|
+ goto discard;
|
|
+ }
|
|
+
|
|
+ if (sk->sk_state == TCP_TIME_WAIT) {
|
|
+ inet_twsk_put(inet_twsk(sk));
|
|
+ goto discard;
|
|
+ }
|
|
+
|
|
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
|
|
+ struct request_sock *req = inet_reqsk(sk);
|
|
+ bool req_stolen;
|
|
+
|
|
+ if (!mptcp_can_new_subflow(meta_sk))
|
|
+ goto reset_and_discard;
|
|
+
|
|
+ local_bh_disable();
|
|
+ child = tcp_check_req(meta_sk, skb, req, false, &req_stolen);
|
|
+ if (!child) {
|
|
+ reqsk_put(req);
|
|
+ local_bh_enable();
|
|
+ goto discard;
|
|
+ }
|
|
+
|
|
+ if (child != meta_sk) {
|
|
+ ret = mptcp_finish_handshake(child, skb);
|
|
+ if (ret) {
|
|
+ rsk = child;
|
|
+ local_bh_enable();
|
|
+ goto reset_and_discard;
|
|
+ }
|
|
+
|
|
+ bh_unlock_sock(meta_sk);
|
|
+ local_bh_enable();
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ /* tcp_check_req failed */
|
|
+ reqsk_put(req);
|
|
+
|
|
+ local_bh_enable();
|
|
+ goto discard;
|
|
+ }
|
|
+
|
|
+ ret = tcp_v6_do_rcv(sk, skb);
|
|
+ sock_put(sk);
|
|
+
|
|
+ return ret;
|
|
+
|
|
+new_subflow:
|
|
+ if (!mptcp_can_new_subflow(meta_sk))
|
|
+ goto reset_and_discard;
|
|
+
|
|
+ child = tcp_v6_cookie_check(meta_sk, skb);
|
|
+ if (!child)
|
|
+ goto discard;
|
|
+
|
|
+ if (child != meta_sk) {
|
|
+ ret = mptcp_finish_handshake(child, skb);
|
|
+ if (ret) {
|
|
+ rsk = child;
|
|
+ goto reset_and_discard;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (tcp_hdr(skb)->syn) {
|
|
+ local_bh_disable();
|
|
+ mptcp_v6_join_request(meta_sk, skb);
|
|
+ local_bh_enable();
|
|
+ }
|
|
+
|
|
+discard:
|
|
+ kfree_skb(skb);
|
|
+ return 0;
|
|
+
|
|
+reset_and_discard:
|
|
+ tcp_v6_send_reset(rsk, skb);
|
|
+ goto discard;
|
|
+}
|
|
+
|
|
+/* Create a new IPv6 subflow.
|
|
+ *
|
|
+ * We are in user context and the meta-sock lock is held.
|
|
+ */
|
|
+int __mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
|
|
+ __be16 sport, struct mptcp_rem6 *rem,
|
|
+ struct sock **subsk)
|
|
+{
|
|
+ struct tcp_sock *tp;
|
|
+ struct sock *sk;
|
|
+ struct sockaddr_in6 loc_in, rem_in;
|
|
+ struct socket_alloc sock_full;
|
|
+ struct socket *sock = (struct socket *)&sock_full;
|
|
+ int ret;
|
|
+
|
|
+ /** First, create and prepare the new socket */
|
|
+ memcpy(&sock_full, meta_sk->sk_socket, sizeof(sock_full));
|
|
+ sock->state = SS_UNCONNECTED;
|
|
+ sock->ops = NULL;
|
|
+
|
|
+ ret = inet6_create(sock_net(meta_sk), sock, IPPROTO_TCP, 1);
|
|
+ if (unlikely(ret < 0)) {
|
|
+ net_err_ratelimited("%s inet6_create failed ret: %d\n",
|
|
+ __func__, ret);
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ sk = sock->sk;
|
|
+ tp = tcp_sk(sk);
|
|
+
|
|
+ /* All subsockets need the MPTCP-lock-class */
|
|
+ lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, meta_slock_key_name);
|
|
+ lockdep_init_map(&(sk)->sk_lock.dep_map, meta_key_name, &meta_key, 0);
|
|
+
|
|
+ ret = mptcp_add_sock(meta_sk, sk, loc->loc6_id, rem->rem6_id, GFP_KERNEL);
|
|
+ if (ret) {
|
|
+ net_err_ratelimited("%s mptcp_add_sock failed ret: %d\n",
|
|
+ __func__, ret);
|
|
+ goto error;
|
|
+ }
|
|
+
|
|
+ tp->mptcp->slave_sk = 1;
|
|
+ tp->mptcp->low_prio = loc->low_prio;
|
|
+
|
|
+ /* Initializing the timer for an MPTCP subflow */
|
|
+ timer_setup(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, 0);
|
|
+
|
|
+ /** Then, connect the socket to the peer */
|
|
+ loc_in.sin6_family = AF_INET6;
|
|
+ rem_in.sin6_family = AF_INET6;
|
|
+ loc_in.sin6_port = sport;
|
|
+ if (rem->port)
|
|
+ rem_in.sin6_port = rem->port;
|
|
+ else
|
|
+ rem_in.sin6_port = inet_sk(meta_sk)->inet_dport;
|
|
+ loc_in.sin6_addr = loc->addr;
|
|
+ rem_in.sin6_addr = rem->addr;
|
|
+
|
|
+ if (loc->if_idx)
|
|
+ sk->sk_bound_dev_if = loc->if_idx;
|
|
+
|
|
+ ret = kernel_bind(sock, (struct sockaddr *)&loc_in,
|
|
+ sizeof(struct sockaddr_in6));
|
|
+ if (ret < 0) {
|
|
+ net_err_ratelimited("%s: token %#x bind() to %pI6 index %d failed, error %d\n",
|
|
+ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
|
|
+ &loc_in.sin6_addr, loc->if_idx, ret);
|
|
+ goto error;
|
|
+ }
|
|
+
|
|
+ mptcp_debug("%s: token %#x pi %d src_addr:%pI6:%d dst_addr:%pI6:%d ifidx: %u\n",
|
|
+ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
|
|
+ tp->mptcp->path_index, &loc_in.sin6_addr,
|
|
+ ntohs(loc_in.sin6_port), &rem_in.sin6_addr,
|
|
+ ntohs(rem_in.sin6_port), loc->if_idx);
|
|
+
|
|
+ if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6)
|
|
+ tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6(sk, rem->addr);
|
|
+
|
|
+ ret = kernel_connect(sock, (struct sockaddr *)&rem_in,
|
|
+ sizeof(struct sockaddr_in6), O_NONBLOCK);
|
|
+ if (ret < 0 && ret != -EINPROGRESS) {
|
|
+ net_err_ratelimited("%s: MPTCP subsocket connect() failed, error %d\n",
|
|
+ __func__, ret);
|
|
+ goto error;
|
|
+ }
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINSYNTX);
|
|
+
|
|
+ sk_set_socket(sk, meta_sk->sk_socket);
|
|
+ sk->sk_wq = meta_sk->sk_wq;
|
|
+
|
|
+ if (subsk)
|
|
+ *subsk = sk;
|
|
+
|
|
+ return 0;
|
|
+
|
|
+error:
|
|
+ /* May happen if mptcp_add_sock fails first */
|
|
+ if (!mptcp(tp)) {
|
|
+ tcp_close(sk, 0);
|
|
+ } else {
|
|
+ local_bh_disable();
|
|
+ mptcp_sub_force_close(sk);
|
|
+ local_bh_enable();
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+EXPORT_SYMBOL(__mptcp_init6_subsockets);
|
|
+
|
|
+const struct inet_connection_sock_af_ops mptcp_v6_specific = {
|
|
+ .queue_xmit = inet6_csk_xmit,
|
|
+ .send_check = tcp_v6_send_check,
|
|
+ .rebuild_header = inet6_sk_rebuild_header,
|
|
+ .sk_rx_dst_set = inet6_sk_rx_dst_set,
|
|
+ .conn_request = mptcp_conn_request,
|
|
+ .syn_recv_sock = tcp_v6_syn_recv_sock,
|
|
+ .net_header_len = sizeof(struct ipv6hdr),
|
|
+ .net_frag_header_len = sizeof(struct frag_hdr),
|
|
+ .setsockopt = ipv6_setsockopt,
|
|
+ .getsockopt = ipv6_getsockopt,
|
|
+ .addr2sockaddr = inet6_csk_addr2sockaddr,
|
|
+ .sockaddr_len = sizeof(struct sockaddr_in6),
|
|
+#ifdef CONFIG_COMPAT
|
|
+ .compat_setsockopt = compat_ipv6_setsockopt,
|
|
+ .compat_getsockopt = compat_ipv6_getsockopt,
|
|
+#endif
|
|
+ .mtu_reduced = tcp_v6_mtu_reduced,
|
|
+};
|
|
+
|
|
+const struct inet_connection_sock_af_ops mptcp_v6_mapped = {
|
|
+ .queue_xmit = ip_queue_xmit,
|
|
+ .send_check = tcp_v4_send_check,
|
|
+ .rebuild_header = inet_sk_rebuild_header,
|
|
+ .sk_rx_dst_set = inet_sk_rx_dst_set,
|
|
+ .conn_request = mptcp_conn_request,
|
|
+ .syn_recv_sock = tcp_v6_syn_recv_sock,
|
|
+ .net_header_len = sizeof(struct iphdr),
|
|
+ .setsockopt = ipv6_setsockopt,
|
|
+ .getsockopt = ipv6_getsockopt,
|
|
+ .addr2sockaddr = inet6_csk_addr2sockaddr,
|
|
+ .sockaddr_len = sizeof(struct sockaddr_in6),
|
|
+#ifdef CONFIG_COMPAT
|
|
+ .compat_setsockopt = compat_ipv6_setsockopt,
|
|
+ .compat_getsockopt = compat_ipv6_getsockopt,
|
|
+#endif
|
|
+ .mtu_reduced = tcp_v4_mtu_reduced,
|
|
+};
|
|
+
|
|
+struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops;
|
|
+struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops;
|
|
+
|
|
+int mptcp_pm_v6_init(void)
|
|
+{
|
|
+ int ret = 0;
|
|
+ struct request_sock_ops *ops = &mptcp6_request_sock_ops;
|
|
+
|
|
+ mptcp_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
|
|
+ mptcp_request_sock_ipv6_ops.init_req = mptcp_v6_init_req;
|
|
+#ifdef CONFIG_SYN_COOKIES
|
|
+ mptcp_request_sock_ipv6_ops.cookie_init_seq = mptcp_v6_cookie_init_seq;
|
|
+#endif
|
|
+
|
|
+ mptcp_join_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
|
|
+ mptcp_join_request_sock_ipv6_ops.init_req = mptcp_v6_join_init_req;
|
|
+
|
|
+ ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP6");
|
|
+ if (ops->slab_name == NULL) {
|
|
+ ret = -ENOMEM;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
|
|
+ SLAB_TYPESAFE_BY_RCU|SLAB_HWCACHE_ALIGN,
|
|
+ NULL);
|
|
+
|
|
+ if (ops->slab == NULL) {
|
|
+ ret = -ENOMEM;
|
|
+ goto err_reqsk_create;
|
|
+ }
|
|
+
|
|
+out:
|
|
+ return ret;
|
|
+
|
|
+err_reqsk_create:
|
|
+ kfree(ops->slab_name);
|
|
+ ops->slab_name = NULL;
|
|
+ goto out;
|
|
+}
|
|
+
|
|
+void mptcp_pm_v6_undo(void)
|
|
+{
|
|
+ kmem_cache_destroy(mptcp6_request_sock_ops.slab);
|
|
+ kfree(mptcp6_request_sock_ops.slab_name);
|
|
+}
|
|
diff --git a/net/mptcp/mptcp_ndiffports.c b/net/mptcp/mptcp_ndiffports.c
|
|
new file mode 100644
|
|
index 000000000000..cf019990447c
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mptcp_ndiffports.c
|
|
@@ -0,0 +1,174 @@
|
|
+#include <linux/module.h>
|
|
+
|
|
+#include <net/mptcp.h>
|
|
+#include <net/mptcp_v4.h>
|
|
+
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+#include <net/mptcp_v6.h>
|
|
+#endif
|
|
+
|
|
+struct ndiffports_priv {
|
|
+ /* Worker struct for subflow establishment */
|
|
+ struct work_struct subflow_work;
|
|
+
|
|
+ struct mptcp_cb *mpcb;
|
|
+};
|
|
+
|
|
+static int num_subflows __read_mostly = 2;
|
|
+module_param(num_subflows, int, 0644);
|
|
+MODULE_PARM_DESC(num_subflows, "choose the number of subflows per MPTCP connection");
|
|
+
|
|
+/**
|
|
+ * Create all new subflows, by doing calls to mptcp_initX_subsockets
|
|
+ *
|
|
+ * This function uses a goto next_subflow, to allow releasing the lock between
|
|
+ * new subflows and giving other processes a chance to do some work on the
|
|
+ * socket and potentially finishing the communication.
|
|
+ **/
|
|
+static void create_subflow_worker(struct work_struct *work)
|
|
+{
|
|
+ const struct ndiffports_priv *pm_priv = container_of(work,
|
|
+ struct ndiffports_priv,
|
|
+ subflow_work);
|
|
+ struct mptcp_cb *mpcb = pm_priv->mpcb;
|
|
+ struct sock *meta_sk = mpcb->meta_sk;
|
|
+ int iter = 0;
|
|
+
|
|
+next_subflow:
|
|
+ if (iter) {
|
|
+ release_sock(meta_sk);
|
|
+ mutex_unlock(&mpcb->mpcb_mutex);
|
|
+
|
|
+ cond_resched();
|
|
+ }
|
|
+ mutex_lock(&mpcb->mpcb_mutex);
|
|
+ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
|
|
+
|
|
+ if (!mptcp(tcp_sk(meta_sk)))
|
|
+ goto exit;
|
|
+
|
|
+ iter++;
|
|
+
|
|
+ if (sock_flag(meta_sk, SOCK_DEAD))
|
|
+ goto exit;
|
|
+
|
|
+ if (mpcb->master_sk &&
|
|
+ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
|
|
+ goto exit;
|
|
+
|
|
+ if (num_subflows > iter && num_subflows > mptcp_subflow_count(mpcb)) {
|
|
+ if (meta_sk->sk_family == AF_INET ||
|
|
+ mptcp_v6_is_v4_mapped(meta_sk)) {
|
|
+ struct mptcp_loc4 loc;
|
|
+ struct mptcp_rem4 rem;
|
|
+
|
|
+ loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
|
|
+ loc.loc4_id = 0;
|
|
+ loc.low_prio = 0;
|
|
+ if (mpcb->master_sk)
|
|
+ loc.if_idx = mpcb->master_sk->sk_bound_dev_if;
|
|
+ else
|
|
+ loc.if_idx = 0;
|
|
+
|
|
+ rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr;
|
|
+ rem.port = inet_sk(meta_sk)->inet_dport;
|
|
+ rem.rem4_id = 0; /* Default 0 */
|
|
+
|
|
+ mptcp_init4_subsockets(meta_sk, &loc, &rem);
|
|
+ } else {
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ struct mptcp_loc6 loc;
|
|
+ struct mptcp_rem6 rem;
|
|
+
|
|
+ loc.addr = inet6_sk(meta_sk)->saddr;
|
|
+ loc.loc6_id = 0;
|
|
+ loc.low_prio = 0;
|
|
+ if (mpcb->master_sk)
|
|
+ loc.if_idx = mpcb->master_sk->sk_bound_dev_if;
|
|
+ else
|
|
+ loc.if_idx = 0;
|
|
+
|
|
+ rem.addr = meta_sk->sk_v6_daddr;
|
|
+ rem.port = inet_sk(meta_sk)->inet_dport;
|
|
+ rem.rem6_id = 0; /* Default 0 */
|
|
+
|
|
+ mptcp_init6_subsockets(meta_sk, &loc, &rem);
|
|
+#endif
|
|
+ }
|
|
+ goto next_subflow;
|
|
+ }
|
|
+
|
|
+exit:
|
|
+ release_sock(meta_sk);
|
|
+ mutex_unlock(&mpcb->mpcb_mutex);
|
|
+ mptcp_mpcb_put(mpcb);
|
|
+ sock_put(meta_sk);
|
|
+}
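The worker above deliberately drops the meta-socket lock and the mpcb mutex between subflow creations so that other contexts can make progress on the connection. A much-simplified pthread sketch of that pattern (names and structure are hypothetical and do not correspond to the kernel primitives used above):

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>

struct worker_ctx {
	pthread_mutex_t lock;
	int subflows;
	int wanted;
};

/* Stand-in for one call to mptcp_init4_subsockets()/mptcp_init6_subsockets() */
static bool add_one_subflow(struct worker_ctx *ctx)
{
	ctx->subflows++;
	return ctx->subflows < ctx->wanted;
}

static void subflow_worker(struct worker_ctx *ctx)
{
	bool more = true;

	while (more) {
		pthread_mutex_lock(&ctx->lock);
		more = add_one_subflow(ctx);
		pthread_mutex_unlock(&ctx->lock);

		/* lock released: give other work on the connection a chance to run */
		sched_yield();
	}
}

int main(void)
{
	struct worker_ctx ctx = { .subflows = 0, .wanted = 2 };

	pthread_mutex_init(&ctx.lock, NULL);
	subflow_worker(&ctx);
	pthread_mutex_destroy(&ctx.lock);
	return 0;
}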
|
|
+
|
|
+static void ndiffports_new_session(const struct sock *meta_sk)
|
|
+{
|
|
+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ struct ndiffports_priv *fmp = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
|
|
+
|
|
+ /* Initialize workqueue-struct */
|
|
+ INIT_WORK(&fmp->subflow_work, create_subflow_worker);
|
|
+ fmp->mpcb = mpcb;
|
|
+}
|
|
+
|
|
+static void ndiffports_create_subflows(struct sock *meta_sk)
|
|
+{
|
|
+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ struct ndiffports_priv *pm_priv = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
|
|
+
|
|
+ if (mptcp_in_infinite_mapping_weak(mpcb) ||
|
|
+ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
|
|
+ return;
|
|
+
|
|
+ if (!work_pending(&pm_priv->subflow_work)) {
|
|
+ sock_hold(meta_sk);
|
|
+ refcount_inc(&mpcb->mpcb_refcnt);
|
|
+ queue_work(mptcp_wq, &pm_priv->subflow_work);
|
|
+ }
|
|
+}
|
|
+
|
|
+static int ndiffports_get_local_id(const struct sock *meta_sk,
|
|
+ sa_family_t family, union inet_addr *addr,
|
|
+ bool *low_prio)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static struct mptcp_pm_ops ndiffports __read_mostly = {
|
|
+ .new_session = ndiffports_new_session,
|
|
+ .fully_established = ndiffports_create_subflows,
|
|
+ .get_local_id = ndiffports_get_local_id,
|
|
+ .name = "ndiffports",
|
|
+ .owner = THIS_MODULE,
|
|
+};
|
|
+
|
|
+/* General initialization of MPTCP_PM */
|
|
+static int __init ndiffports_register(void)
|
|
+{
|
|
+ BUILD_BUG_ON(sizeof(struct ndiffports_priv) > MPTCP_PM_SIZE);
|
|
+
|
|
+ if (mptcp_register_path_manager(&ndiffports))
|
|
+ goto exit;
|
|
+
|
|
+ return 0;
|
|
+
|
|
+exit:
|
|
+ return -1;
|
|
+}
|
|
+
|
|
+static void ndiffports_unregister(void)
|
|
+{
|
|
+ mptcp_unregister_path_manager(&ndiffports);
|
|
+}
|
|
+
|
|
+module_init(ndiffports_register);
|
|
+module_exit(ndiffports_unregister);
|
|
+
|
|
+MODULE_AUTHOR("Christoph Paasch");
|
|
+MODULE_LICENSE("GPL");
|
|
+MODULE_DESCRIPTION("NDIFF-PORTS MPTCP");
|
|
+MODULE_VERSION("0.88");
|
|
diff --git a/net/mptcp/mptcp_netlink.c b/net/mptcp/mptcp_netlink.c
|
|
new file mode 100644
|
|
index 000000000000..dd696841ea85
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mptcp_netlink.c
|
|
@@ -0,0 +1,1272 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+/* MPTCP implementation - Netlink Path Manager
|
|
+ *
|
|
+ * Analysis, Design and Implementation:
|
|
+ * - Gregory Detal <gregory.detal@tessares.net>
|
|
+ * - Sébastien Barré <sebastien.barre@tessares.net>
|
|
+ * - Matthieu Baerts <matthieu.baerts@tessares.net>
|
|
+ * - Pau Espin Pedrol <pau.espin@tessares.net>
|
|
+ * - Detlev Casanova <detlev.casanova@tessares.net>
|
|
+ * - David Verbeiren <david.verbeiren@tessares.net>
|
|
+ * - Frank Vanbever <frank.vanbever@tessares.net>
|
|
+ * - Antoine Maes <antoine.maes@tessares.net>
|
|
+ * - Tim Froidcoeur <tim.froidcoeur@tessares.net>
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU General Public License
|
|
+ * as published by the Free Software Foundation; either version
|
|
+ * 2 of the License, or (at your option) any later version.
|
|
+ */
|
|
+
|
|
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
+#include <linux/module.h>
|
|
+#include <linux/mptcp.h>
|
|
+#include <net/genetlink.h>
|
|
+#include <net/mptcp.h>
|
|
+#include <net/mptcp_v4.h>
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+#include <net/mptcp_v6.h>
|
|
+#endif
|
|
+
|
|
+#define MPTCP_MAX_ADDR 8
|
|
+
|
|
+struct mptcp_nl_priv {
|
|
+ /* Unfortunately we need to store this to generate MP_JOINs in case
|
|
+ * the peer generates a subflow (see get_local_id).
|
|
+ */
|
|
+ u8 loc4_bits;
|
|
+ u8 announced4;
|
|
+ struct mptcp_loc4 locaddr4[MPTCP_MAX_ADDR];
|
|
+
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ u8 loc6_bits;
|
|
+ u8 announced6;
|
|
+ struct mptcp_loc6 locaddr6[MPTCP_MAX_ADDR];
|
|
+#endif
|
|
+
|
|
+ u16 remove_addrs;
|
|
+
|
|
+ bool is_closed;
|
|
+};
|
|
+
|
|
+static struct genl_family mptcp_genl_family;
|
|
+
|
|
+#define MPTCP_GENL_EV_GRP_OFFSET 0
|
|
+#define MPTCP_GENL_CMD_GRP_OFFSET 1
|
|
+
|
|
+static const struct genl_multicast_group mptcp_mcgrps[] = {
|
|
+ [MPTCP_GENL_EV_GRP_OFFSET] = { .name = MPTCP_GENL_EV_GRP_NAME, },
|
|
+ [MPTCP_GENL_CMD_GRP_OFFSET] = { .name = MPTCP_GENL_CMD_GRP_NAME, },
|
|
+};
|
|
+
|
|
+static const struct nla_policy mptcp_nl_genl_policy[MPTCP_ATTR_MAX + 1] = {
|
|
+ [MPTCP_ATTR_TOKEN] = { .type = NLA_U32, },
|
|
+ [MPTCP_ATTR_FAMILY] = { .type = NLA_U16, },
|
|
+ [MPTCP_ATTR_LOC_ID] = { .type = NLA_U8, },
|
|
+ [MPTCP_ATTR_REM_ID] = { .type = NLA_U8, },
|
|
+ [MPTCP_ATTR_SADDR4] = { .type = NLA_U32, },
|
|
+ [MPTCP_ATTR_SADDR6] = { .type = NLA_BINARY,
|
|
+ .len = sizeof(struct in6_addr), },
|
|
+ [MPTCP_ATTR_DADDR4] = { .type = NLA_U32, },
|
|
+ [MPTCP_ATTR_DADDR6] = { .type = NLA_BINARY,
|
|
+ .len = sizeof(struct in6_addr), },
|
|
+ [MPTCP_ATTR_SPORT] = { .type = NLA_U16, },
|
|
+ [MPTCP_ATTR_DPORT] = { .type = NLA_U16, },
|
|
+ [MPTCP_ATTR_BACKUP] = { .type = NLA_U8, },
|
|
+ [MPTCP_ATTR_FLAGS] = { .type = NLA_U16, },
|
|
+ [MPTCP_ATTR_TIMEOUT] = { .type = NLA_U32, },
|
|
+ [MPTCP_ATTR_IF_IDX] = { .type = NLA_S32, },
|
|
+};
|
|
+
|
|
+/* Defines the userspace PM filter on events. Events whose bits are set are ignored. */
|
|
+static u16 mptcp_nl_event_filter;
|
|
+
|
|
+static inline struct mptcp_nl_priv *
|
|
+mptcp_nl_priv(const struct sock *meta_sk)
|
|
+{
|
|
+ return (struct mptcp_nl_priv *)&tcp_sk(meta_sk)->mpcb->mptcp_pm[0];
|
|
+}
|
|
+
|
|
+static inline bool
|
|
+mptcp_nl_must_notify(u16 event, const struct sock *meta_sk)
|
|
+{
|
|
+ struct mptcp_nl_priv *priv = mptcp_nl_priv(meta_sk);
|
|
+
|
|
+ /* close_session() can be called before other events because it is
|
|
+ * also called when doing a fallback to TCP. We don't want to send
|
|
+ * events to the user-space after having sent the CLOSED event.
|
|
+ */
|
|
+ if (priv->is_closed)
|
|
+ return false;
|
|
+
|
|
+ if (event == MPTCPF_EVENT_CLOSED)
|
|
+ priv->is_closed = true;
|
|
+
|
|
+ if (mptcp_nl_event_filter & event)
|
|
+ return false;
|
|
+
|
|
+ if (!genl_has_listeners(&mptcp_genl_family, sock_net(meta_sk), 0))
|
|
+ return false;
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/* Find the first free index in the bitfield starting from 0 */
|
|
+static int
|
|
+mptcp_nl_find_free_index(u8 bitfield)
|
|
+{
|
|
+ int i;
|
|
+
|
|
+ /* There are no free bits left anyway... */
|
|
+ if (bitfield == 0xff)
|
|
+ return -1;
|
|
+
|
|
+ i = ffs(~bitfield) - 1;
|
|
+ if (i < 0)
|
|
+ return -1;
|
|
+
|
|
+ return i;
|
|
+}
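mptcp_nl_find_free_index() relies on ffs() over the complemented bitfield: the lowest zero bit of the original mask is the lowest set bit of its complement. A tiny stand-alone illustration of the same trick (not part of the patch):

#include <stdio.h>
#include <strings.h>	/* ffs() */

/* Return the lowest clear bit of an 8-bit slot mask, or -1 if all are used. */
static int find_free_index(unsigned char bitfield)
{
	if (bitfield == 0xff)
		return -1;
	return ffs(~bitfield & 0xff) - 1;
}

int main(void)
{
	unsigned char used = 0x0b;	/* slots 0, 1 and 3 taken */

	printf("first free slot: %d\n", find_free_index(used));	/* prints 2 */
	return 0;
}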
|
|
+
|
|
+static inline int
|
|
+mptcp_nl_put_subsk(struct sk_buff *msg, struct sock *sk)
|
|
+{
|
|
+ struct inet_sock *isk = inet_sk(sk);
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+ u8 backup;
|
|
+ u8 sk_err;
|
|
+
|
|
+ if (nla_put_u16(msg, MPTCP_ATTR_FAMILY, sk->sk_family))
|
|
+ goto nla_put_failure;
|
|
+
|
|
+ if (nla_put_u8(msg, MPTCP_ATTR_LOC_ID, tcp_sk(sk)->mptcp->loc_id))
|
|
+ goto nla_put_failure;
|
|
+
|
|
+ if (nla_put_u8(msg, MPTCP_ATTR_REM_ID, tcp_sk(sk)->mptcp->rem_id))
|
|
+ goto nla_put_failure;
|
|
+
|
|
+ switch (sk->sk_family) {
|
|
+ case AF_INET:
|
|
+ if (nla_put_u32(msg, MPTCP_ATTR_SADDR4, isk->inet_saddr))
|
|
+ goto nla_put_failure;
|
|
+
|
|
+ if (nla_put_u32(msg, MPTCP_ATTR_DADDR4, isk->inet_daddr))
|
|
+ goto nla_put_failure;
|
|
+ break;
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ case AF_INET6: {
|
|
+ struct ipv6_pinfo *np = inet6_sk(sk);
|
|
+
|
|
+ if (nla_put(msg, MPTCP_ATTR_SADDR6, sizeof(np->saddr),
|
|
+ &np->saddr))
|
|
+ goto nla_put_failure;
|
|
+
|
|
+ if (nla_put(msg, MPTCP_ATTR_DADDR6, sizeof(sk->sk_v6_daddr),
|
|
+ &sk->sk_v6_daddr))
|
|
+ goto nla_put_failure;
|
|
+ break;
|
|
+ }
|
|
+#endif
|
|
+ default:
|
|
+ goto nla_put_failure;
|
|
+ }
|
|
+
|
|
+ if (nla_put_u16(msg, MPTCP_ATTR_SPORT, ntohs(isk->inet_sport)))
|
|
+ goto nla_put_failure;
|
|
+
|
|
+ if (nla_put_u16(msg, MPTCP_ATTR_DPORT, ntohs(isk->inet_dport)))
|
|
+ goto nla_put_failure;
|
|
+
|
|
+ backup = !!(tcp_sk(sk)->mptcp->rcv_low_prio ||
|
|
+ tcp_sk(sk)->mptcp->low_prio);
|
|
+
|
|
+ if (nla_put_u8(msg, MPTCP_ATTR_BACKUP, backup))
|
|
+ goto nla_put_failure;
|
|
+
|
|
+ if (nla_put_s32(msg, MPTCP_ATTR_IF_IDX, sk->sk_bound_dev_if))
|
|
+ goto nla_put_failure;
|
|
+
|
|
+ sk_err = sk->sk_err ? : tcp_sk(sk)->mptcp->sk_err;
|
|
+ if (unlikely(sk_err != 0) && meta_sk->sk_state == TCP_ESTABLISHED &&
|
|
+ nla_put_u8(msg, MPTCP_ATTR_ERROR, sk_err))
|
|
+ goto nla_put_failure;
|
|
+
|
|
+ return 0;
|
|
+
|
|
+nla_put_failure:
|
|
+ return -1;
|
|
+}
|
|
+
|
|
+static inline struct sk_buff *
|
|
+mptcp_nl_mcast_prepare(struct mptcp_cb *mpcb, struct sock *sk, int cmd,
|
|
+ void **hdr)
|
|
+{
|
|
+ struct sk_buff *msg;
|
|
+
|
|
+ /* possible optimisation: use the needed size */
|
|
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
|
|
+ if (!msg)
|
|
+ return NULL;
|
|
+
|
|
+ *hdr = genlmsg_put(msg, 0, 0, &mptcp_genl_family, 0, cmd);
|
|
+ if (!*hdr)
|
|
+ goto free_msg;
|
|
+
|
|
+ if (nla_put_u32(msg, MPTCP_ATTR_TOKEN, mpcb->mptcp_loc_token))
|
|
+ goto nla_put_failure;
|
|
+
|
|
+ if (sk && mptcp_nl_put_subsk(msg, sk))
|
|
+ goto nla_put_failure;
|
|
+
|
|
+ return msg;
|
|
+
|
|
+nla_put_failure:
|
|
+ genlmsg_cancel(msg, *hdr);
|
|
+free_msg:
|
|
+ nlmsg_free(msg);
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static inline int
|
|
+mptcp_nl_mcast_send(struct mptcp_cb *mpcb, struct sk_buff *msg, void *hdr)
|
|
+{
|
|
+ int ret;
|
|
+ struct sock *meta_sk = mpcb->meta_sk;
|
|
+
|
|
+ genlmsg_end(msg, hdr);
|
|
+
|
|
+ ret = genlmsg_multicast_netns(&mptcp_genl_family, sock_net(meta_sk),
|
|
+ msg, 0, MPTCP_GENL_EV_GRP_OFFSET,
|
|
+ GFP_ATOMIC);
|
|
+ if (ret && ret != -ESRCH)
|
|
+ pr_err("%s: genlmsg_multicast failed with %d\n", __func__, ret);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static inline void
|
|
+mptcp_nl_mcast(struct mptcp_cb *mpcb, struct sock *sk, int cmd)
|
|
+{
|
|
+ void *hdr;
|
|
+ struct sk_buff *msg;
|
|
+
|
|
+ msg = mptcp_nl_mcast_prepare(mpcb, sk, cmd, &hdr);
|
|
+ if (msg)
|
|
+ mptcp_nl_mcast_send(mpcb, msg, hdr);
|
|
+ else
|
|
+ pr_warn("%s: unable to prepare multicast message\n", __func__);
|
|
+}
|
|
+
|
|
+static inline void
|
|
+mptcp_nl_mcast_fail(struct sk_buff *msg, void *hdr)
|
|
+{
|
|
+ genlmsg_cancel(msg, hdr);
|
|
+ nlmsg_free(msg);
|
|
+}
|
|
+
|
|
+static void
|
|
+mptcp_nl_new(const struct sock *meta_sk, bool established)
|
|
+{
|
|
+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+
|
|
+ mptcp_nl_mcast(mpcb, mpcb->master_sk,
|
|
+ established ? MPTCP_EVENT_ESTABLISHED
|
|
+ : MPTCP_EVENT_CREATED);
|
|
+}
|
|
+
|
|
+static void
|
|
+mptcp_nl_pm_new_session(const struct sock *meta_sk)
|
|
+{
|
|
+ if (!mptcp_nl_must_notify(MPTCPF_EVENT_CREATED, meta_sk))
|
|
+ return;
|
|
+
|
|
+ mptcp_nl_new(meta_sk, false);
|
|
+}
|
|
+
|
|
+static inline int
|
|
+mptcp_nl_loc_id_to_index_lookup(struct sock *meta_sk, sa_family_t family,
|
|
+ u8 addr_id)
|
|
+{
|
|
+ struct mptcp_nl_priv *priv = mptcp_nl_priv(meta_sk);
|
|
+ int i;
|
|
+
|
|
+ switch (family) {
|
|
+ case AF_INET:
|
|
+ mptcp_for_each_bit_set(priv->loc4_bits, i) {
|
|
+ if (priv->locaddr4[i].loc4_id == addr_id)
|
|
+ return i;
|
|
+ }
|
|
+ break;
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ case AF_INET6:
|
|
+ mptcp_for_each_bit_set(priv->loc6_bits, i) {
|
|
+ if (priv->locaddr6[i].loc6_id == addr_id)
|
|
+ return i;
|
|
+ }
|
|
+ break;
|
|
+#endif
|
|
+ }
|
|
+ return -1;
|
|
+}
|
|
+
|
|
+static inline void
|
|
+mptcp_nl_sk_setup_locaddr(struct sock *meta_sk, struct sock *sk)
|
|
+{
|
|
+ struct mptcp_nl_priv *priv = mptcp_nl_priv(meta_sk);
|
|
+ bool backup = !!(tcp_sk(sk)->mptcp->rcv_low_prio ||
|
|
+ tcp_sk(sk)->mptcp->low_prio);
|
|
+ sa_family_t family = mptcp_v6_is_v4_mapped(sk) ? AF_INET
|
|
+ : sk->sk_family;
|
|
+ u8 addr_id = tcp_sk(sk)->mptcp->loc_id;
|
|
+ int idx = mptcp_nl_loc_id_to_index_lookup(meta_sk, family,
|
|
+ addr_id);
|
|
+
|
|
+ /* Same as in mptcp_fullmesh.c: exception for transparent sockets */
|
|
+ int if_idx = inet_sk(sk)->transparent ? inet_sk(sk)->rx_dst_ifindex :
|
|
+ sk->sk_bound_dev_if;
|
|
+
|
|
+ switch (family) {
|
|
+ case AF_INET: {
|
|
+ struct inet_sock *isk = inet_sk(sk);
|
|
+
|
|
+ if (idx == -1)
|
|
+ idx = mptcp_nl_find_free_index(priv->loc4_bits);
|
|
+ if (idx == -1) {
|
|
+ pr_warn("No free index for sk loc_id v4\n");
|
|
+ return;
|
|
+ }
|
|
+ priv->locaddr4[idx].addr.s_addr = isk->inet_saddr;
|
|
+ priv->locaddr4[idx].loc4_id = addr_id;
|
|
+ priv->locaddr4[idx].low_prio = backup;
|
|
+ priv->locaddr4[idx].if_idx = if_idx;
|
|
+ priv->loc4_bits |= 1 << idx;
|
|
+ priv->announced4 |= 1 << idx;
|
|
+ break;
|
|
+ }
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ case AF_INET6: {
|
|
+ struct ipv6_pinfo *np = inet6_sk(sk);
|
|
+
|
|
+ if (idx == -1)
|
|
+ idx = mptcp_nl_find_free_index(priv->loc6_bits);
|
|
+ if (idx == -1) {
|
|
+ pr_warn("No free index for sk loc_id v6\n");
|
|
+ return;
|
|
+ }
|
|
+ priv->locaddr6[idx].addr = np->saddr;
|
|
+ priv->locaddr6[idx].loc6_id = addr_id;
|
|
+ priv->locaddr6[idx].low_prio = backup;
|
|
+ priv->locaddr6[idx].if_idx = if_idx;
|
|
+ priv->loc6_bits |= 1 << idx;
|
|
+ priv->announced6 |= 1 << idx;
|
|
+ break;
|
|
+ }
|
|
+#endif
|
|
+ }
|
|
+}
|
|
+
|
|
+static void
|
|
+mptcp_nl_pm_fully_established(struct sock *meta_sk)
|
|
+{
|
|
+ mptcp_nl_sk_setup_locaddr(meta_sk, tcp_sk(meta_sk)->mpcb->master_sk);
|
|
+
|
|
+ if (!mptcp_nl_must_notify(MPTCPF_EVENT_ESTABLISHED, meta_sk))
|
|
+ return;
|
|
+
|
|
+ mptcp_nl_new(meta_sk, true);
|
|
+}
|
|
+
|
|
+static void
|
|
+mptcp_nl_pm_close_session(struct sock *meta_sk)
|
|
+{
|
|
+ if (!mptcp_nl_must_notify(MPTCPF_EVENT_CLOSED, meta_sk))
|
|
+ return;
|
|
+
|
|
+ mptcp_nl_mcast(tcp_sk(meta_sk)->mpcb, NULL, MPTCP_EVENT_CLOSED);
|
|
+}
|
|
+
|
|
+static void
|
|
+mptcp_nl_pm_established_subflow(struct sock *sk)
|
|
+{
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+
|
|
+ mptcp_nl_sk_setup_locaddr(meta_sk, sk);
|
|
+
|
|
+ if (!mptcp_nl_must_notify(MPTCPF_EVENT_SUB_ESTABLISHED, meta_sk))
|
|
+ return;
|
|
+
|
|
+ mptcp_nl_mcast(tcp_sk(meta_sk)->mpcb, sk, MPTCP_EVENT_SUB_ESTABLISHED);
|
|
+}
|
|
+
|
|
+static void
|
|
+mptcp_nl_pm_delete_subflow(struct sock *sk)
|
|
+{
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+
|
|
+ if (!mptcp_nl_must_notify(MPTCPF_EVENT_SUB_CLOSED, meta_sk))
|
|
+ return;
|
|
+
|
|
+ mptcp_nl_mcast(tcp_sk(meta_sk)->mpcb, sk, MPTCP_EVENT_SUB_CLOSED);
|
|
+}
|
|
+
|
|
+static void
|
|
+mptcp_nl_pm_add_raddr(struct mptcp_cb *mpcb, const union inet_addr *addr,
|
|
+ sa_family_t family, __be16 port, u8 id)
|
|
+{
|
|
+ struct sk_buff *msg;
|
|
+ void *hdr;
|
|
+
|
|
+ if (!mptcp_nl_must_notify(MPTCPF_EVENT_ANNOUNCED, mpcb->meta_sk))
|
|
+ return;
|
|
+
|
|
+ msg = mptcp_nl_mcast_prepare(mpcb, NULL, MPTCP_EVENT_ANNOUNCED, &hdr);
|
|
+ if (!msg)
|
|
+ return;
|
|
+
|
|
+ if (nla_put_u8(msg, MPTCP_ATTR_REM_ID, id))
|
|
+ goto nla_put_failure;
|
|
+
|
|
+ if (nla_put_u16(msg, MPTCP_ATTR_FAMILY, family))
|
|
+ goto nla_put_failure;
|
|
+
|
|
+ switch (family) {
|
|
+ case AF_INET:
|
|
+ if (nla_put_u32(msg, MPTCP_ATTR_DADDR4, addr->ip))
|
|
+ goto nla_put_failure;
|
|
+ break;
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ case AF_INET6:
|
|
+ if (nla_put(msg, MPTCP_ATTR_DADDR6, sizeof(addr->ip6),
|
|
+ &addr->ip6))
|
|
+ goto nla_put_failure;
|
|
+ break;
|
|
+#endif
|
|
+ default:
|
|
+ goto nla_put_failure;
|
|
+ }
|
|
+
|
|
+ if (nla_put_u16(msg, MPTCP_ATTR_DPORT, ntohs(port)))
|
|
+ goto nla_put_failure;
|
|
+
|
|
+ mptcp_nl_mcast_send(mpcb, msg, hdr);
|
|
+
|
|
+ return;
|
|
+
|
|
+nla_put_failure:
|
|
+ mptcp_nl_mcast_fail(msg, hdr);
|
|
+}
|
|
+
|
|
+static void
|
|
+mptcp_nl_pm_rem_raddr(struct mptcp_cb *mpcb, u8 id)
|
|
+{
|
|
+ struct sk_buff *msg;
|
|
+ void *hdr;
|
|
+
|
|
+ if (!mptcp_nl_must_notify(MPTCPF_EVENT_REMOVED, mpcb->meta_sk))
|
|
+ return;
|
|
+
|
|
+ msg = mptcp_nl_mcast_prepare(mpcb, NULL, MPTCP_EVENT_REMOVED, &hdr);
|
|
+
|
|
+ if (!msg)
|
|
+ return;
|
|
+
|
|
+ if (nla_put_u8(msg, MPTCP_ATTR_REM_ID, id))
|
|
+ goto nla_put_failure;
|
|
+
|
|
+ mptcp_nl_mcast_send(mpcb, msg, hdr);
|
|
+
|
|
+ return;
|
|
+
|
|
+nla_put_failure:
|
|
+ mptcp_nl_mcast_fail(msg, hdr);
|
|
+}
|
|
+
|
|
+static int
|
|
+mptcp_nl_pm_get_local_id(const struct sock *meta_sk, sa_family_t family,
|
|
+ union inet_addr *addr, bool *low_prio)
|
|
+{
|
|
+ struct mptcp_nl_priv *priv = mptcp_nl_priv(meta_sk);
|
|
+ int i, id = 0;
|
|
+
|
|
+ switch (family) {
|
|
+ case AF_INET:
|
|
+ mptcp_for_each_bit_set(priv->loc4_bits, i) {
|
|
+ if (addr->in.s_addr == priv->locaddr4[i].addr.s_addr) {
|
|
+ id = priv->locaddr4[i].loc4_id;
|
|
+ *low_prio = priv->locaddr4[i].low_prio;
|
|
+ goto out;
|
|
+ }
|
|
+ }
|
|
+ break;
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ case AF_INET6:
|
|
+ mptcp_for_each_bit_set(priv->loc6_bits, i) {
|
|
+ if (ipv6_addr_equal(&addr->in6,
|
|
+ &priv->locaddr6[i].addr)) {
|
|
+ id = priv->locaddr6[i].loc6_id;
|
|
+ *low_prio = priv->locaddr6[i].low_prio;
|
|
+ goto out;
|
|
+ }
|
|
+ }
|
|
+ break;
|
|
+#endif
|
|
+ }
|
|
+ return -1;
|
|
+
|
|
+out:
|
|
+ return id;
|
|
+}
|
|
+
|
|
+static void
|
|
+mptcp_nl_pm_addr_signal(struct sock *sk, unsigned *size,
|
|
+ struct tcp_out_options *opts, struct sk_buff *skb)
|
|
+{
|
|
+ struct mptcp_nl_priv *priv = mptcp_nl_priv(sk);
|
|
+ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
|
|
+ u8 unannounced;
|
|
+ int remove_addr_len;
|
|
+
|
|
+ unannounced = (~priv->announced4) & priv->loc4_bits;
|
|
+ if (unannounced &&
|
|
+ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) {
|
|
+ int i = mptcp_nl_find_free_index(~unannounced);
|
|
+
|
|
+ opts->options |= OPTION_MPTCP;
|
|
+ opts->mptcp_options |= OPTION_ADD_ADDR;
|
|
+ opts->add_addr4.addr_id = priv->locaddr4[i].loc4_id;
|
|
+ opts->add_addr4.addr = priv->locaddr4[i].addr;
|
|
+ opts->add_addr_v4 = 1;
|
|
+
|
|
+ if (skb)
|
|
+ priv->announced4 |= (1 << i);
|
|
+ *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN;
|
|
+ }
|
|
+
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ unannounced = (~priv->announced6) & priv->loc6_bits;
|
|
+ if (unannounced &&
|
|
+ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) {
|
|
+ int i = mptcp_nl_find_free_index(~unannounced);
|
|
+
|
|
+ opts->options |= OPTION_MPTCP;
|
|
+ opts->mptcp_options |= OPTION_ADD_ADDR;
|
|
+ opts->add_addr6.addr_id = priv->locaddr6[i].loc6_id;
|
|
+ opts->add_addr6.addr = priv->locaddr6[i].addr;
|
|
+ opts->add_addr_v6 = 1;
|
|
+
|
|
+ if (skb)
|
|
+ priv->announced6 |= (1 << i);
|
|
+ *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN;
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ if (likely(!priv->remove_addrs))
|
|
+ goto exit;
|
|
+
|
|
+ remove_addr_len = mptcp_sub_len_remove_addr_align(priv->remove_addrs);
|
|
+ if (MAX_TCP_OPTION_SPACE - *size < remove_addr_len)
|
|
+ goto exit;
|
|
+
|
|
+ opts->options |= OPTION_MPTCP;
|
|
+ opts->mptcp_options |= OPTION_REMOVE_ADDR;
|
|
+ opts->remove_addrs = priv->remove_addrs;
|
|
+
|
|
+ if (skb)
|
|
+ priv->remove_addrs = 0;
|
|
+ *size += remove_addr_len;
|
|
+
|
|
+exit:
|
|
+ mpcb->addr_signal = !!((~priv->announced4) & priv->loc4_bits ||
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ (~priv->announced6) & priv->loc6_bits ||
|
|
+#endif
|
|
+ priv->remove_addrs);
|
|
+}
|
|
+
|
|
+static void
|
|
+mptcp_nl_pm_prio_changed(struct sock *sk, int low_prio)
|
|
+{
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+
|
|
+ if (!mptcp_nl_must_notify(MPTCPF_EVENT_SUB_PRIORITY, meta_sk))
|
|
+ return;
|
|
+
|
|
+ mptcp_nl_mcast(tcp_sk(meta_sk)->mpcb, sk, MPTCP_EVENT_SUB_PRIORITY);
|
|
+}
|
|
+
|
|
+static int
|
|
+mptcp_nl_genl_announce(struct sk_buff *skb, struct genl_info *info)
|
|
+{
|
|
+ struct sock *meta_sk, *subsk;
|
|
+ struct mptcp_cb *mpcb;
|
|
+ struct mptcp_nl_priv *priv;
|
|
+ u32 token;
|
|
+ u8 addr_id, backup = 0;
|
|
+ u16 family;
|
|
+ int i, ret = 0;
|
|
+ union inet_addr saddr;
|
|
+ int if_idx = 0;
|
|
+ bool useless; /* unused out parameter "low_prio" */
|
|
+
|
|
+ if (!info->attrs[MPTCP_ATTR_TOKEN] || !info->attrs[MPTCP_ATTR_FAMILY] ||
|
|
+ !info->attrs[MPTCP_ATTR_LOC_ID])
|
|
+ return -EINVAL;
|
|
+
|
|
+ token = nla_get_u32(info->attrs[MPTCP_ATTR_TOKEN]);
|
|
+ meta_sk = mptcp_hash_find(genl_info_net(info), token);
|
|
+ if (!meta_sk)
|
|
+ return -EINVAL;
|
|
+
|
|
+ mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ priv = mptcp_nl_priv(meta_sk);
|
|
+ family = nla_get_u16(info->attrs[MPTCP_ATTR_FAMILY]);
|
|
+ addr_id = nla_get_u8(info->attrs[MPTCP_ATTR_LOC_ID]);
|
|
+
|
|
+ if (info->attrs[MPTCP_ATTR_BACKUP])
|
|
+ backup = nla_get_u8(info->attrs[MPTCP_ATTR_BACKUP]);
|
|
+
|
|
+ if (info->attrs[MPTCP_ATTR_IF_IDX])
|
|
+ if_idx = nla_get_s32(info->attrs[MPTCP_ATTR_IF_IDX]);
|
|
+
|
|
+ mutex_lock(&mpcb->mpcb_mutex);
|
|
+ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
|
|
+
|
|
+ switch (family) {
|
|
+ case AF_INET:
|
|
+ if (!info->attrs[MPTCP_ATTR_SADDR4]) {
|
|
+ ret = -EINVAL;
|
|
+ goto exit;
|
|
+ }
|
|
+
|
|
+ saddr.in.s_addr = nla_get_u32(info->attrs[MPTCP_ATTR_SADDR4]);
|
|
+ i = mptcp_nl_pm_get_local_id(meta_sk, family,
|
|
+ &saddr, &useless);
|
|
+ if (i < 0) {
|
|
+ i = mptcp_nl_find_free_index(priv->loc4_bits);
|
|
+ if (i < 0) {
|
|
+ ret = -ENOBUFS;
|
|
+ goto exit;
|
|
+ }
|
|
+ } else if (i != addr_id) {
|
|
+ ret = -EINVAL;
|
|
+ goto exit;
|
|
+ }
|
|
+
|
|
+ priv->locaddr4[i].addr.s_addr = saddr.in.s_addr;
|
|
+ priv->locaddr4[i].loc4_id = addr_id;
|
|
+ priv->locaddr4[i].low_prio = !!backup;
|
|
+ priv->locaddr4[i].if_idx = if_idx;
|
|
+ priv->loc4_bits |= 1 << i;
|
|
+ priv->announced4 &= ~(1 << i);
|
|
+ break;
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ case AF_INET6:
|
|
+ if (!info->attrs[MPTCP_ATTR_SADDR6]) {
|
|
+ ret = -EINVAL;
|
|
+ goto exit;
|
|
+ }
|
|
+
|
|
+ saddr.in6 = *(struct in6_addr *)
|
|
+ nla_data(info->attrs[MPTCP_ATTR_SADDR6]);
|
|
+ i = mptcp_nl_pm_get_local_id(meta_sk, family, &saddr, &useless);
|
|
+ if (i < 0) {
|
|
+ i = mptcp_nl_find_free_index(priv->loc6_bits);
|
|
+ if (i < 0) {
|
|
+ ret = -ENOBUFS;
|
|
+ goto exit;
|
|
+ }
|
|
+ } else if (i != addr_id) {
|
|
+ ret = -EINVAL;
|
|
+ goto exit;
|
|
+ }
|
|
+
|
|
+ priv->locaddr6[i].addr = saddr.in6;
|
|
+ priv->locaddr6[i].loc6_id = addr_id;
|
|
+ priv->locaddr6[i].low_prio = !!backup;
|
|
+ priv->locaddr6[i].if_idx = if_idx;
|
|
+ priv->loc6_bits |= 1 << i;
|
|
+ priv->announced6 &= ~(1 << i);
|
|
+ break;
|
|
+#endif
|
|
+ default:
|
|
+ ret = -EINVAL;
|
|
+ goto exit;
|
|
+ }
|
|
+
|
|
+ mpcb->addr_signal = 1;
|
|
+
|
|
+ rcu_read_lock_bh();
|
|
+ subsk = mptcp_select_ack_sock(meta_sk);
|
|
+ if (subsk)
|
|
+ tcp_send_ack(subsk);
|
|
+ rcu_read_unlock_bh();
|
|
+
|
|
+exit:
|
|
+ release_sock(meta_sk);
|
|
+ mutex_unlock(&mpcb->mpcb_mutex);
|
|
+ sock_put(meta_sk);
|
|
+ return ret;
|
|
+}
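Illustrative sketch (not part of the kernel patch itself): the ANNOUNCE handler above is driven from userspace over generic netlink. The fragment below shows one way a path-manager daemon might issue MPTCP_CMD_ANNOUNCE using libnl-3. It assumes the MPTCP_GENL_NAME family name, MPTCP_CMD_ANNOUNCE, MPTCP_GENL_VER and the MPTCP_ATTR_* attributes are exported by the uapi header added elsewhere in this patch, and that the caller already learned the connection token from an earlier event; netlink ACK/error handling is omitted for brevity.

	/* Illustrative sketch only.
	 * Build roughly with: gcc announce.c $(pkg-config --cflags --libs libnl-genl-3.0)
	 * The MPTCP_* definitions come from the uapi header added by this patch.
	 */
	#include <stdint.h>
	#include <arpa/inet.h>
	#include <netlink/netlink.h>
	#include <netlink/genl/genl.h>
	#include <netlink/genl/ctrl.h>

	static int announce_v4_addr(uint32_t token, uint8_t loc_id, const char *ip)
	{
		struct nl_sock *sk = nl_socket_alloc();
		struct nl_msg *msg;
		int family, err;

		if (!sk || genl_connect(sk) < 0)
			return -1;

		family = genl_ctrl_resolve(sk, MPTCP_GENL_NAME);
		if (family < 0)
			return family;

		msg = nlmsg_alloc();
		genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
			    MPTCP_CMD_ANNOUNCE, MPTCP_GENL_VER);

		nla_put_u32(msg, MPTCP_ATTR_TOKEN, token);	/* which MPTCP connection */
		nla_put_u16(msg, MPTCP_ATTR_FAMILY, AF_INET);
		nla_put_u8(msg, MPTCP_ATTR_LOC_ID, loc_id);	/* checked against known local ids */
		nla_put_u32(msg, MPTCP_ATTR_SADDR4, inet_addr(ip));

		err = nl_send_auto(sk, msg);			/* kernel answers with a netlink ACK/error */
		nlmsg_free(msg);
		nl_socket_free(sk);
		return err < 0 ? err : 0;
	}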
|
|
+
|
|
+static int
|
|
+mptcp_nl_genl_remove(struct sk_buff *skb, struct genl_info *info)
|
|
+{
|
|
+ struct sock *meta_sk, *subsk;
|
|
+ struct mptcp_cb *mpcb;
|
|
+ struct mptcp_nl_priv *priv;
|
|
+ u32 token;
|
|
+ u8 addr_id;
|
|
+ int i;
|
|
+ int retcode;
|
|
+ bool found = false;
|
|
+
|
|
+ if (!info->attrs[MPTCP_ATTR_TOKEN] || !info->attrs[MPTCP_ATTR_LOC_ID])
|
|
+ return -EINVAL;
|
|
+
|
|
+ token = nla_get_u32(info->attrs[MPTCP_ATTR_TOKEN]);
|
|
+ meta_sk = mptcp_hash_find(genl_info_net(info), token);
|
|
+ if (!meta_sk)
|
|
+ return -EINVAL;
|
|
+
|
|
+ mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ priv = mptcp_nl_priv(meta_sk);
|
|
+ addr_id = nla_get_u8(info->attrs[MPTCP_ATTR_LOC_ID]);
|
|
+
|
|
+ mutex_lock(&mpcb->mpcb_mutex);
|
|
+ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
|
|
+
|
|
+ mptcp_for_each_bit_set(priv->loc4_bits, i) {
|
|
+ if (priv->locaddr4[i].loc4_id == addr_id) {
|
|
+ priv->loc4_bits &= ~(1 << i);
|
|
+ found = true;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ if (!found) {
|
|
+ mptcp_for_each_bit_set(priv->loc6_bits, i) {
|
|
+ if (priv->locaddr6[i].loc6_id == addr_id) {
|
|
+ priv->loc6_bits &= ~(1 << i);
|
|
+ found = true;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ if (found) {
|
|
+ priv->remove_addrs |= 1 << addr_id;
|
|
+ mpcb->addr_signal = 1;
|
|
+
|
|
+ rcu_read_lock_bh();
|
|
+ subsk = mptcp_select_ack_sock(meta_sk);
|
|
+ if (subsk)
|
|
+ tcp_send_ack(subsk);
|
|
+ rcu_read_unlock_bh();
|
|
+ retcode = 0;
|
|
+ } else {
|
|
+ retcode = -EINVAL;
|
|
+ }
|
|
+
|
|
+ release_sock(meta_sk);
|
|
+ mutex_unlock(&mpcb->mpcb_mutex);
|
|
+ sock_put(meta_sk);
|
|
+ return retcode;
|
|
+}
|
|
+
|
|
+static int
|
|
+mptcp_nl_genl_create(struct sk_buff *skb, struct genl_info *info)
|
|
+{
|
|
+ struct sock *meta_sk, *subsk = NULL;
|
|
+ struct mptcp_cb *mpcb;
|
|
+ struct mptcp_nl_priv *priv;
|
|
+ u32 token;
|
|
+ u16 family, sport;
|
|
+ u8 loc_id, rem_id, backup = 0;
|
|
+ int i, ret = 0;
|
|
+ int if_idx;
|
|
+
|
|
+ if (!info->attrs[MPTCP_ATTR_TOKEN] || !info->attrs[MPTCP_ATTR_FAMILY] ||
|
|
+ !info->attrs[MPTCP_ATTR_LOC_ID] || !info->attrs[MPTCP_ATTR_REM_ID])
|
|
+ return -EINVAL;
|
|
+
|
|
+ token = nla_get_u32(info->attrs[MPTCP_ATTR_TOKEN]);
|
|
+ meta_sk = mptcp_hash_find(genl_info_net(info), token);
|
|
+ if (!meta_sk)
|
|
+		/* We use a more specific value than EINVAL so that userspace
+		 * can handle this case easily: it typically means userspace
+		 * asked for a subflow on a connection that the kernel already
+		 * destroyed, but the corresponding CLOSED event had not yet
+		 * been delivered over Netlink. This easily happens for
+		 * short-lived connections.
+		 */
+		return -EBADR;
|
|
+
|
|
+ mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+
|
|
+ mutex_lock(&mpcb->mpcb_mutex);
|
|
+ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
|
|
+
|
|
+ if (sock_flag(meta_sk, SOCK_DEAD)) {
|
|
+ /* Same as for the EBADR case. In this case, though, we know for
|
|
+ * sure the conn owner of the subflow existed at some point (no
|
|
+ * invalid token possibility)
|
|
+ */
|
|
+ ret = -EOWNERDEAD;
|
|
+ goto unlock;
|
|
+ }
|
|
+
|
|
+ if (!mptcp_can_new_subflow(meta_sk)) {
|
|
+ /* Same as for the EBADR and EOWNERDEAD case but here, the MPTCP
|
|
+ * session has just been stopped, it is no longer possible to
|
|
+ * create new subflows.
|
|
+ */
|
|
+ ret = -ENOTCONN;
|
|
+ goto unlock;
|
|
+ }
|
|
+
|
|
+ if (mpcb->master_sk &&
|
|
+ !tcp_sk(mpcb->master_sk)->mptcp->fully_established) {
|
|
+		/* The first condition is not only there for safety purposes; it
+		 * can also be triggered in the same scenario as EBADR and
+		 * EOWNERDEAD.
+		 */
|
|
+ ret = -EAGAIN;
|
|
+ goto unlock;
|
|
+ }
|
|
+
|
|
+ priv = mptcp_nl_priv(meta_sk);
|
|
+
|
|
+ family = nla_get_u16(info->attrs[MPTCP_ATTR_FAMILY]);
|
|
+ loc_id = nla_get_u8(info->attrs[MPTCP_ATTR_LOC_ID]);
|
|
+ rem_id = nla_get_u8(info->attrs[MPTCP_ATTR_REM_ID]);
|
|
+
|
|
+ sport = info->attrs[MPTCP_ATTR_SPORT]
|
|
+ ? htons(nla_get_u16(info->attrs[MPTCP_ATTR_SPORT])) : 0;
|
|
+ backup = info->attrs[MPTCP_ATTR_BACKUP]
|
|
+ ? nla_get_u8(info->attrs[MPTCP_ATTR_BACKUP]) : 0;
|
|
+ if_idx = info->attrs[MPTCP_ATTR_IF_IDX]
|
|
+ ? nla_get_s32(info->attrs[MPTCP_ATTR_IF_IDX]) : 0;
|
|
+
|
|
+ switch (family) {
|
|
+ case AF_INET: {
|
|
+ struct mptcp_rem4 rem = {
|
|
+ .rem4_id = rem_id,
|
|
+ };
|
|
+ struct mptcp_loc4 loc = {
|
|
+ .loc4_id = loc_id,
|
|
+ };
|
|
+
|
|
+ if (!info->attrs[MPTCP_ATTR_DADDR4] ||
|
|
+ !info->attrs[MPTCP_ATTR_DPORT]) {
|
|
+ goto create_failed;
|
|
+ } else {
|
|
+ rem.addr.s_addr =
|
|
+ nla_get_u32(info->attrs[MPTCP_ATTR_DADDR4]);
|
|
+ rem.port =
|
|
+ ntohs(nla_get_u16(info->attrs[MPTCP_ATTR_DPORT]));
|
|
+ }
|
|
+
|
|
+ if (!info->attrs[MPTCP_ATTR_SADDR4]) {
|
|
+ bool found = false;
|
|
+
|
|
+ mptcp_for_each_bit_set(priv->loc4_bits, i) {
|
|
+ if (priv->locaddr4[i].loc4_id == loc_id) {
|
|
+ loc.addr = priv->locaddr4[i].addr;
|
|
+ loc.low_prio =
|
|
+ priv->locaddr4[i].low_prio;
|
|
+ loc.if_idx =
|
|
+ priv->locaddr4[i].if_idx;
|
|
+ found = true;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (!found)
|
|
+ goto create_failed;
|
|
+ } else {
|
|
+ loc.addr.s_addr =
|
|
+ nla_get_u32(info->attrs[MPTCP_ATTR_SADDR4]);
|
|
+ loc.low_prio = backup;
|
|
+ loc.if_idx = if_idx;
|
|
+ }
|
|
+
|
|
+ ret = __mptcp_init4_subsockets(meta_sk, &loc, sport, &rem,
|
|
+ &subsk);
|
|
+ if (ret < 0)
|
|
+ goto unlock;
|
|
+ break;
|
|
+ }
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ case AF_INET6: {
|
|
+ struct mptcp_rem6 rem = {
|
|
+ .rem6_id = rem_id,
|
|
+ };
|
|
+ struct mptcp_loc6 loc = {
|
|
+ .loc6_id = loc_id,
|
|
+ };
|
|
+
|
|
+ if (!info->attrs[MPTCP_ATTR_DADDR6] ||
|
|
+ !info->attrs[MPTCP_ATTR_DPORT]) {
|
|
+ goto create_failed;
|
|
+ } else {
|
|
+ rem.addr = *(struct in6_addr *)
|
|
+ nla_data(info->attrs[MPTCP_ATTR_DADDR6]);
|
|
+ rem.port =
|
|
+ ntohs(nla_get_u16(info->attrs[MPTCP_ATTR_DPORT]));
|
|
+ }
|
|
+
|
|
+ if (!info->attrs[MPTCP_ATTR_SADDR6]) {
|
|
+ bool found = false;
|
|
+
|
|
+ mptcp_for_each_bit_set(priv->loc6_bits, i) {
|
|
+ if (priv->locaddr6[i].loc6_id == loc_id) {
|
|
+ loc.addr = priv->locaddr6[i].addr;
|
|
+ loc.low_prio =
|
|
+ priv->locaddr6[i].low_prio;
|
|
+ loc.if_idx =
|
|
+ priv->locaddr6[i].if_idx;
|
|
+
|
|
+ found = true;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (!found)
|
|
+ goto create_failed;
|
|
+ } else {
|
|
+ loc.addr = *(struct in6_addr *)
|
|
+ nla_data(info->attrs[MPTCP_ATTR_SADDR6]);
|
|
+ loc.low_prio = backup;
|
|
+ loc.if_idx = if_idx;
|
|
+ }
|
|
+
|
|
+ ret = __mptcp_init6_subsockets(meta_sk, &loc, sport, &rem,
|
|
+ &subsk);
|
|
+ if (ret < 0)
|
|
+ goto unlock;
|
|
+ break;
|
|
+ }
|
|
+#endif
|
|
+ default:
|
|
+ goto create_failed;
|
|
+ }
|
|
+
|
|
+unlock:
|
|
+ release_sock(meta_sk);
|
|
+ mutex_unlock(&mpcb->mpcb_mutex);
|
|
+ sock_put(meta_sk);
|
|
+ return ret;
|
|
+
|
|
+create_failed:
|
|
+ ret = -EINVAL;
|
|
+ goto unlock;
|
|
+}
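The distinct error codes returned above form a small userspace contract for MPTCP_CMD_SUB_CREATE. A hedged sketch of how a daemon might react to them (only the mapping follows from the comments above; the function itself is hypothetical):

	#include <errno.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Illustrative sketch only, not part of the patch. */
	static void react_to_create_result(int err, uint32_t token)
	{
		switch (err < 0 ? -err : err) {
		case 0:
			break;					/* subflow creation started */
		case EBADR:					/* token unknown: connection already destroyed */
		case EOWNERDEAD:				/* meta socket is dead */
		case ENOTCONN:					/* session stopped, no new subflows allowed */
			printf("drop state for token %u\n", (unsigned)token);
			break;
		case EAGAIN:					/* master subflow not yet fully established */
			printf("retry token %u later\n", (unsigned)token);
			break;
		default:					/* EINVAL and friends: malformed request */
			printf("give up on token %u (err %d)\n", (unsigned)token, err);
			break;
		}
	}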
|
|
+
|
|
+static struct sock *
|
|
+mptcp_nl_subsk_lookup(struct mptcp_cb *mpcb, struct nlattr **attrs)
|
|
+{
|
|
+ struct sock *sk;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct hlist_node *tmp;
|
|
+ u16 family;
|
|
+ __be16 sport, dport;
|
|
+
|
|
+ if (!attrs[MPTCP_ATTR_FAMILY] || !attrs[MPTCP_ATTR_SPORT] ||
|
|
+ !attrs[MPTCP_ATTR_DPORT])
|
|
+ goto exit;
|
|
+
|
|
+ family = nla_get_u16(attrs[MPTCP_ATTR_FAMILY]);
|
|
+ sport = htons(nla_get_u16(attrs[MPTCP_ATTR_SPORT]));
|
|
+ dport = htons(nla_get_u16(attrs[MPTCP_ATTR_DPORT]));
|
|
+
|
|
+ switch (family) {
|
|
+ case AF_INET: {
|
|
+ __be32 saddr, daddr;
|
|
+
|
|
+ if (!attrs[MPTCP_ATTR_SADDR4] || !attrs[MPTCP_ATTR_DADDR4])
|
|
+ break;
|
|
+
|
|
+ saddr = nla_get_u32(attrs[MPTCP_ATTR_SADDR4]);
|
|
+ daddr = nla_get_u32(attrs[MPTCP_ATTR_DADDR4]);
|
|
+
|
|
+ mptcp_for_each_sub_safe(mpcb, mptcp, tmp) {
|
|
+ struct sock *subsk = mptcp_to_sock(mptcp);
|
|
+ struct inet_sock *isk = inet_sk(subsk);
|
|
+
|
|
+ if (subsk->sk_family != AF_INET)
|
|
+ continue;
|
|
+
|
|
+ if (isk->inet_saddr == saddr &&
|
|
+ isk->inet_daddr == daddr &&
|
|
+ isk->inet_sport == sport &&
|
|
+ isk->inet_dport == dport) {
|
|
+ sk = subsk;
|
|
+ goto found;
|
|
+ }
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ case AF_INET6: {
|
|
+ struct in6_addr saddr, daddr;
|
|
+
|
|
+ if (!attrs[MPTCP_ATTR_SADDR6] || !attrs[MPTCP_ATTR_DADDR6])
|
|
+ break;
|
|
+
|
|
+ saddr = *(struct in6_addr *)nla_data(attrs[MPTCP_ATTR_SADDR6]);
|
|
+ daddr = *(struct in6_addr *)nla_data(attrs[MPTCP_ATTR_DADDR6]);
|
|
+
|
|
+ mptcp_for_each_sub_safe(mpcb, mptcp, tmp) {
|
|
+ struct sock *subsk = mptcp_to_sock(mptcp);
|
|
+ struct inet_sock *isk = inet_sk(subsk);
|
|
+ struct ipv6_pinfo *np;
|
|
+
|
|
+ if (subsk->sk_family != AF_INET6)
|
|
+ continue;
|
|
+
|
|
+ np = inet6_sk(subsk);
|
|
+ if (ipv6_addr_equal(&saddr, &np->saddr) &&
|
|
+ ipv6_addr_equal(&daddr, &subsk->sk_v6_daddr) &&
|
|
+ isk->inet_sport == sport &&
|
|
+ isk->inet_dport == dport) {
|
|
+ sk = subsk;
|
|
+ goto found;
|
|
+ }
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+#endif
|
|
+ }
|
|
+
|
|
+exit:
|
|
+ sk = NULL;
|
|
+found:
|
|
+ return sk;
|
|
+}
|
|
+
|
|
+static int
|
|
+mptcp_nl_genl_destroy(struct sk_buff *skb, struct genl_info *info)
|
|
+{
|
|
+ struct sock *meta_sk, *subsk;
|
|
+ struct mptcp_cb *mpcb;
|
|
+ int ret = 0;
|
|
+ u32 token;
|
|
+
|
|
+ if (!info->attrs[MPTCP_ATTR_TOKEN])
|
|
+ return -EINVAL;
|
|
+
|
|
+ token = nla_get_u32(info->attrs[MPTCP_ATTR_TOKEN]);
|
|
+
|
|
+ meta_sk = mptcp_hash_find(genl_info_net(info), token);
|
|
+ if (!meta_sk)
|
|
+ return -EINVAL;
|
|
+
|
|
+ mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+
|
|
+ mutex_lock(&mpcb->mpcb_mutex);
|
|
+ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
|
|
+
|
|
+ subsk = mptcp_nl_subsk_lookup(mpcb, info->attrs);
|
|
+ if (subsk) {
|
|
+ local_bh_disable();
|
|
+ mptcp_reinject_data(subsk, 0);
|
|
+ mptcp_send_reset(subsk);
|
|
+ local_bh_enable();
|
|
+ } else {
|
|
+ ret = -EINVAL;
|
|
+ }
|
|
+
|
|
+ release_sock(meta_sk);
|
|
+ mutex_unlock(&mpcb->mpcb_mutex);
|
|
+ sock_put(meta_sk);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int
|
|
+mptcp_nl_genl_conn_exists(struct sk_buff *skb, struct genl_info *info)
|
|
+{
|
|
+ struct sock *meta_sk;
|
|
+ u32 token;
|
|
+
|
|
+ if (!info->attrs[MPTCP_ATTR_TOKEN])
|
|
+ return -EINVAL;
|
|
+
|
|
+ token = nla_get_u32(info->attrs[MPTCP_ATTR_TOKEN]);
|
|
+
|
|
+ meta_sk = mptcp_hash_find(genl_info_net(info), token);
|
|
+ if (!meta_sk)
|
|
+ return -ENOTCONN;
|
|
+
|
|
+ sock_put(meta_sk);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int
|
|
+mptcp_nl_genl_priority(struct sk_buff *skb, struct genl_info *info)
|
|
+{
|
|
+ struct sock *meta_sk, *subsk;
|
|
+ struct mptcp_cb *mpcb;
|
|
+ int ret = 0;
|
|
+ u32 token;
|
|
+ u8 backup = 0;
|
|
+
|
|
+ if (!info->attrs[MPTCP_ATTR_TOKEN])
|
|
+ return -EINVAL;
|
|
+
|
|
+ token = nla_get_u32(info->attrs[MPTCP_ATTR_TOKEN]);
|
|
+ if (info->attrs[MPTCP_ATTR_BACKUP])
|
|
+ backup = nla_get_u8(info->attrs[MPTCP_ATTR_BACKUP]);
|
|
+
|
|
+ meta_sk = mptcp_hash_find(genl_info_net(info), token);
|
|
+ if (!meta_sk)
|
|
+ return -EINVAL;
|
|
+
|
|
+ mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+
|
|
+ mutex_lock(&mpcb->mpcb_mutex);
|
|
+ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
|
|
+
|
|
+ subsk = mptcp_nl_subsk_lookup(mpcb, info->attrs);
|
|
+ if (subsk) {
|
|
+ tcp_sk(subsk)->mptcp->send_mp_prio = 1;
|
|
+ tcp_sk(subsk)->mptcp->low_prio = !!backup;
|
|
+
|
|
+ local_bh_disable();
|
|
+ if (mptcp_sk_can_send_ack(subsk))
|
|
+ tcp_send_ack(subsk);
|
|
+ else
|
|
+ ret = -ENOTCONN;
|
|
+ local_bh_enable();
|
|
+ } else {
|
|
+ ret = -EINVAL;
|
|
+ }
|
|
+
|
|
+ release_sock(meta_sk);
|
|
+ mutex_unlock(&mpcb->mpcb_mutex);
|
|
+ sock_put(meta_sk);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int
|
|
+mptcp_nl_genl_set_filter(struct sk_buff *skb, struct genl_info *info)
|
|
+{
|
|
+ u16 flags;
|
|
+
|
|
+ if (!info->attrs[MPTCP_ATTR_FLAGS])
|
|
+ return -EINVAL;
|
|
+
|
|
+ flags = nla_get_u16(info->attrs[MPTCP_ATTR_FLAGS]);
|
|
+
|
|
+ /* Only want to receive events that correspond to these flags */
|
|
+ mptcp_nl_event_filter = ~flags;
|
|
+
|
|
+ return 0;
|
|
+}
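Note on the filter semantics above: the handler stores the complement of the requested flags, so only events whose MPTCPF_EVENT_* bit was set in MPTCP_ATTR_FLAGS keep being multicast. An illustrative fragment (not part of the patch), reusing sk/msg/family from the MPTCP_CMD_ANNOUNCE sketch shown earlier:

	/* Keep connection teardown and sub-flow lifecycle events, filter the rest. */
	uint16_t wanted = MPTCPF_EVENT_CLOSED |
			  MPTCPF_EVENT_SUB_ESTABLISHED |
			  MPTCPF_EVENT_SUB_CLOSED;

	msg = nlmsg_alloc();
	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
		    MPTCP_CMD_SET_FILTER, MPTCP_GENL_VER);
	nla_put_u16(msg, MPTCP_ATTR_FLAGS, wanted);
	nl_send_auto(sk, msg);
	nlmsg_free(msg);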
|
|
+
|
|
+static struct genl_ops mptcp_genl_ops[] = {
|
|
+ {
|
|
+ .cmd = MPTCP_CMD_ANNOUNCE,
|
|
+ .doit = mptcp_nl_genl_announce,
|
|
+ .flags = GENL_ADMIN_PERM,
|
|
+ },
|
|
+ {
|
|
+ .cmd = MPTCP_CMD_REMOVE,
|
|
+ .doit = mptcp_nl_genl_remove,
|
|
+ .flags = GENL_ADMIN_PERM,
|
|
+ },
|
|
+ {
|
|
+ .cmd = MPTCP_CMD_SUB_CREATE,
|
|
+ .doit = mptcp_nl_genl_create,
|
|
+ .flags = GENL_ADMIN_PERM,
|
|
+ },
|
|
+ {
|
|
+ .cmd = MPTCP_CMD_SUB_DESTROY,
|
|
+ .doit = mptcp_nl_genl_destroy,
|
|
+ .flags = GENL_ADMIN_PERM,
|
|
+ },
|
|
+ {
|
|
+ .cmd = MPTCP_CMD_SUB_PRIORITY,
|
|
+ .doit = mptcp_nl_genl_priority,
|
|
+ .flags = GENL_ADMIN_PERM,
|
|
+ },
|
|
+ {
|
|
+ .cmd = MPTCP_CMD_SET_FILTER,
|
|
+ .doit = mptcp_nl_genl_set_filter,
|
|
+ .flags = GENL_ADMIN_PERM,
|
|
+ },
|
|
+ {
|
|
+ .cmd = MPTCP_CMD_EXIST,
|
|
+ .doit = mptcp_nl_genl_conn_exists,
|
|
+ .flags = GENL_ADMIN_PERM,
|
|
+ },
|
|
+};
|
|
+
|
|
+static struct mptcp_pm_ops mptcp_nl_pm_ops = {
|
|
+ .new_session = mptcp_nl_pm_new_session,
|
|
+ .close_session = mptcp_nl_pm_close_session,
|
|
+ .fully_established = mptcp_nl_pm_fully_established,
|
|
+ .established_subflow = mptcp_nl_pm_established_subflow,
|
|
+ .delete_subflow = mptcp_nl_pm_delete_subflow,
|
|
+ .add_raddr = mptcp_nl_pm_add_raddr,
|
|
+ .rem_raddr = mptcp_nl_pm_rem_raddr,
|
|
+ .get_local_id = mptcp_nl_pm_get_local_id,
|
|
+ .addr_signal = mptcp_nl_pm_addr_signal,
|
|
+ .prio_changed = mptcp_nl_pm_prio_changed,
|
|
+ .name = "netlink",
|
|
+ .owner = THIS_MODULE,
|
|
+};
|
|
+
|
|
+static struct genl_family mptcp_genl_family = {
|
|
+ .hdrsize = 0,
|
|
+ .name = MPTCP_GENL_NAME,
|
|
+ .version = MPTCP_GENL_VER,
|
|
+ .maxattr = MPTCP_ATTR_MAX,
|
|
+ .policy = mptcp_nl_genl_policy,
|
|
+ .netnsok = true,
|
|
+ .module = THIS_MODULE,
|
|
+ .ops = mptcp_genl_ops,
|
|
+ .n_ops = ARRAY_SIZE(mptcp_genl_ops),
|
|
+ .mcgrps = mptcp_mcgrps,
|
|
+ .n_mcgrps = ARRAY_SIZE(mptcp_mcgrps),
|
|
+};
|
|
+
|
|
+static int __init
|
|
+mptcp_nl_init(void)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ BUILD_BUG_ON(sizeof(struct mptcp_nl_priv) > MPTCP_PM_SIZE);
|
|
+
|
|
+ ret = genl_register_family(&mptcp_genl_family);
|
|
+ if (ret)
|
|
+ goto out_genl;
|
|
+
|
|
+ ret = mptcp_register_path_manager(&mptcp_nl_pm_ops);
|
|
+ if (ret)
|
|
+ goto out_pm;
|
|
+
|
|
+ return 0;
|
|
+out_pm:
|
|
+ genl_unregister_family(&mptcp_genl_family);
|
|
+out_genl:
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static void __exit
|
|
+mptcp_nl_exit(void)
|
|
+{
|
|
+ mptcp_unregister_path_manager(&mptcp_nl_pm_ops);
|
|
+ genl_unregister_family(&mptcp_genl_family);
|
|
+}
|
|
+
|
|
+module_init(mptcp_nl_init);
+module_exit(mptcp_nl_exit);
+
+MODULE_AUTHOR("Gregory Detal <gregory.detal@tessares.net>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MPTCP netlink-based path manager");
+MODULE_ALIAS_GENL_FAMILY(MPTCP_GENL_NAME);
diff --git a/net/mptcp/mptcp_olia.c b/net/mptcp/mptcp_olia.c
|
|
new file mode 100644
|
|
index 000000000000..c44eb9208581
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mptcp_olia.c
|
|
@@ -0,0 +1,318 @@
|
|
+/*
|
|
+ * MPTCP implementation - OPPORTUNISTIC LINKED INCREASES CONGESTION CONTROL:
|
|
+ *
|
|
+ * Algorithm design:
|
|
+ * Ramin Khalili <ramin.khalili@epfl.ch>
|
|
+ * Nicolas Gast <nicolas.gast@epfl.ch>
|
|
+ * Jean-Yves Le Boudec <jean-yves.leboudec@epfl.ch>
|
|
+ *
|
|
+ * Implementation:
|
|
+ * Ramin Khalili <ramin.khalili@epfl.ch>
|
|
+ *
|
|
+ * Ported to the official MPTCP-kernel:
|
|
+ * Christoph Paasch <christoph.paasch@uclouvain.be>
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU General Public License
|
|
+ * as published by the Free Software Foundation; either version
|
|
+ * 2 of the License, or (at your option) any later version.
|
|
+ */
|
|
+
|
|
+
|
|
+#include <net/tcp.h>
|
|
+#include <net/mptcp.h>
|
|
+
|
|
+#include <linux/module.h>
|
|
+
|
|
+static int scale = 10;
|
|
+
|
|
+struct mptcp_olia {
|
|
+ u32 mptcp_loss1;
|
|
+ u32 mptcp_loss2;
|
|
+ u32 mptcp_loss3;
|
|
+ int epsilon_num;
|
|
+ u32 epsilon_den;
|
|
+ int mptcp_snd_cwnd_cnt;
|
|
+};
|
|
+
|
|
+static inline int mptcp_olia_sk_can_send(const struct sock *sk)
|
|
+{
|
|
+ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
|
|
+}
|
|
+
|
|
+static inline u64 mptcp_olia_scale(u64 val, int scale)
|
|
+{
|
|
+ return (u64) val << scale;
|
|
+}
|
|
+
|
|
+/* Account for the artificial inflation of cwnd (see RFC 5681)
+ * during the fast-retransmit phase.
+ */
|
|
+static u32 mptcp_get_crt_cwnd(struct sock *sk)
|
|
+{
|
|
+ const struct inet_connection_sock *icsk = inet_csk(sk);
|
|
+
|
|
+ if (icsk->icsk_ca_state == TCP_CA_Recovery)
|
|
+ return tcp_sk(sk)->snd_ssthresh;
|
|
+ else
|
|
+ return tcp_sk(sk)->snd_cwnd;
|
|
+}
|
|
+
|
|
+/* return the denominator of the first term of the increase term */
|
|
+static u64 mptcp_get_rate(const struct mptcp_cb *mpcb , u32 path_rtt)
|
|
+{
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ u64 rate = 1; /* We have to avoid a zero-rate because it is used as a divisor */
|
|
+
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ struct sock *sk = mptcp_to_sock(mptcp);
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ u64 scaled_num;
|
|
+ u32 tmp_cwnd;
|
|
+
|
|
+ if (!mptcp_olia_sk_can_send(sk))
|
|
+ continue;
|
|
+
|
|
+ tmp_cwnd = mptcp_get_crt_cwnd(sk);
|
|
+ scaled_num = mptcp_olia_scale(tmp_cwnd, scale) * path_rtt;
|
|
+ rate += div_u64(scaled_num , tp->srtt_us);
|
|
+ }
|
|
+ rate *= rate;
|
|
+ return rate;
|
|
+}
|
|
+
|
|
+/* find the maximum cwnd, used to find set M */
|
|
+static u32 mptcp_get_max_cwnd(const struct mptcp_cb *mpcb)
|
|
+{
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ u32 best_cwnd = 0;
|
|
+
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ struct sock *sk = mptcp_to_sock(mptcp);
|
|
+ u32 tmp_cwnd;
|
|
+
|
|
+ if (!mptcp_olia_sk_can_send(sk))
|
|
+ continue;
|
|
+
|
|
+ tmp_cwnd = mptcp_get_crt_cwnd(sk);
|
|
+ if (tmp_cwnd > best_cwnd)
|
|
+ best_cwnd = tmp_cwnd;
|
|
+ }
|
|
+ return best_cwnd;
|
|
+}
|
|
+
|
|
+static void mptcp_get_epsilon(const struct mptcp_cb *mpcb)
|
|
+{
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct mptcp_olia *ca;
|
|
+ struct tcp_sock *tp;
|
|
+ struct sock *sk;
|
|
+ u64 tmp_int, tmp_rtt, best_int = 0, best_rtt = 1;
|
|
+ u32 max_cwnd, tmp_cwnd, established_cnt = 0;
|
|
+ u8 M = 0, B_not_M = 0;
|
|
+
|
|
+ /* TODO - integrate this in the following loop - we just want to iterate once */
|
|
+
|
|
+ max_cwnd = mptcp_get_max_cwnd(mpcb);
|
|
+
|
|
+ /* find the best path */
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ sk = mptcp_to_sock(mptcp);
|
|
+ tp = tcp_sk(sk);
|
|
+ ca = inet_csk_ca(sk);
|
|
+
|
|
+ if (!mptcp_olia_sk_can_send(sk))
|
|
+ continue;
|
|
+
|
|
+ established_cnt++;
|
|
+
|
|
+ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
|
|
+ /* TODO - check here and rename variables */
|
|
+ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
|
|
+ ca->mptcp_loss2 - ca->mptcp_loss1);
|
|
+
|
|
+ if ((u64)tmp_int * best_rtt >= (u64)best_int * tmp_rtt) {
|
|
+ best_rtt = tmp_rtt;
|
|
+ best_int = tmp_int;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* TODO - integrate this here in mptcp_get_max_cwnd and in the previous loop */
|
|
+ /* find the size of M and B_not_M */
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ sk = mptcp_to_sock(mptcp);
|
|
+ tp = tcp_sk(sk);
|
|
+ ca = inet_csk_ca(sk);
|
|
+
|
|
+ if (!mptcp_olia_sk_can_send(sk))
|
|
+ continue;
|
|
+
|
|
+ tmp_cwnd = mptcp_get_crt_cwnd(sk);
|
|
+ if (tmp_cwnd == max_cwnd) {
|
|
+ M++;
|
|
+ } else {
|
|
+ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
|
|
+ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
|
|
+ ca->mptcp_loss2 - ca->mptcp_loss1);
|
|
+
|
|
+ if ((u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt)
|
|
+ B_not_M++;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* check if the path is in M or B_not_M and set the value of epsilon accordingly */
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ sk = mptcp_to_sock(mptcp);
|
|
+ tp = tcp_sk(sk);
|
|
+ ca = inet_csk_ca(sk);
|
|
+
|
|
+ if (!mptcp_olia_sk_can_send(sk))
|
|
+ continue;
|
|
+
|
|
+ if (B_not_M == 0) {
|
|
+ ca->epsilon_num = 0;
|
|
+ ca->epsilon_den = 1;
|
|
+ } else {
|
|
+ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
|
|
+ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
|
|
+ ca->mptcp_loss2 - ca->mptcp_loss1);
|
|
+ tmp_cwnd = mptcp_get_crt_cwnd(sk);
|
|
+
|
|
+ if (tmp_cwnd < max_cwnd &&
|
|
+ (u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt) {
|
|
+ ca->epsilon_num = 1;
|
|
+ ca->epsilon_den = established_cnt * B_not_M;
|
|
+ } else if (tmp_cwnd == max_cwnd) {
|
|
+ ca->epsilon_num = -1;
|
|
+ ca->epsilon_den = established_cnt * M;
|
|
+ } else {
|
|
+ ca->epsilon_num = 0;
|
|
+ ca->epsilon_den = 1;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/* setting the initial values */
|
|
+static void mptcp_olia_init(struct sock *sk)
|
|
+{
|
|
+ const struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct mptcp_olia *ca = inet_csk_ca(sk);
|
|
+
|
|
+ if (mptcp(tp)) {
|
|
+ ca->mptcp_loss1 = tp->snd_una;
|
|
+ ca->mptcp_loss2 = tp->snd_una;
|
|
+ ca->mptcp_loss3 = tp->snd_una;
|
|
+ ca->mptcp_snd_cwnd_cnt = 0;
|
|
+ ca->epsilon_num = 0;
|
|
+ ca->epsilon_den = 1;
|
|
+ }
|
|
+}
|
|
+
|
|
+/* updating inter-loss distance and ssthresh */
|
|
+static void mptcp_olia_set_state(struct sock *sk, u8 new_state)
|
|
+{
|
|
+ if (!mptcp(tcp_sk(sk)))
|
|
+ return;
|
|
+
|
|
+ if (new_state == TCP_CA_Loss ||
|
|
+ new_state == TCP_CA_Recovery || new_state == TCP_CA_CWR) {
|
|
+ struct mptcp_olia *ca = inet_csk_ca(sk);
|
|
+
|
|
+ if (ca->mptcp_loss3 != ca->mptcp_loss2 &&
|
|
+ !inet_csk(sk)->icsk_retransmits) {
|
|
+ ca->mptcp_loss1 = ca->mptcp_loss2;
|
|
+ ca->mptcp_loss2 = ca->mptcp_loss3;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/* main algorithm */
|
|
+static void mptcp_olia_cong_avoid(struct sock *sk, u32 ack, u32 acked)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct mptcp_olia *ca = inet_csk_ca(sk);
|
|
+ const struct mptcp_cb *mpcb = tp->mpcb;
|
|
+
|
|
+ u64 inc_num, inc_den, rate, cwnd_scaled;
|
|
+
|
|
+ if (!mptcp(tp)) {
|
|
+ tcp_reno_cong_avoid(sk, ack, acked);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ ca->mptcp_loss3 = tp->snd_una;
|
|
+
|
|
+ if (!tcp_is_cwnd_limited(sk))
|
|
+ return;
|
|
+
|
|
+ /* slow start if it is in the safe area */
|
|
+ if (tcp_in_slow_start(tp)) {
|
|
+ tcp_slow_start(tp, acked);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ mptcp_get_epsilon(mpcb);
|
|
+ rate = mptcp_get_rate(mpcb, tp->srtt_us);
|
|
+ cwnd_scaled = mptcp_olia_scale(tp->snd_cwnd, scale);
|
|
+ inc_den = ca->epsilon_den * tp->snd_cwnd * rate ? : 1;
|
|
+
|
|
+ /* calculate the increasing term, scaling is used to reduce the rounding effect */
|
|
+ if (ca->epsilon_num == -1) {
|
|
+ if (ca->epsilon_den * cwnd_scaled * cwnd_scaled < rate) {
|
|
+ inc_num = rate - ca->epsilon_den *
|
|
+ cwnd_scaled * cwnd_scaled;
|
|
+ ca->mptcp_snd_cwnd_cnt -= div64_u64(
|
|
+ mptcp_olia_scale(inc_num , scale) , inc_den);
|
|
+ } else {
|
|
+ inc_num = ca->epsilon_den *
|
|
+ cwnd_scaled * cwnd_scaled - rate;
|
|
+ ca->mptcp_snd_cwnd_cnt += div64_u64(
|
|
+ mptcp_olia_scale(inc_num , scale) , inc_den);
|
|
+ }
|
|
+ } else {
|
|
+ inc_num = ca->epsilon_num * rate +
|
|
+ ca->epsilon_den * cwnd_scaled * cwnd_scaled;
|
|
+ ca->mptcp_snd_cwnd_cnt += div64_u64(
|
|
+ mptcp_olia_scale(inc_num , scale) , inc_den);
|
|
+ }
|
|
+
|
|
+
|
|
+ if (ca->mptcp_snd_cwnd_cnt >= (1 << scale) - 1) {
|
|
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
|
|
+ tp->snd_cwnd++;
|
|
+ ca->mptcp_snd_cwnd_cnt = 0;
|
|
+ } else if (ca->mptcp_snd_cwnd_cnt <= 0 - (1 << scale) + 1) {
|
|
+ tp->snd_cwnd = max((int) 1 , (int) tp->snd_cwnd - 1);
|
|
+ ca->mptcp_snd_cwnd_cnt = 0;
|
|
+ }
|
|
+}
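For readers cross-checking the fixed-point arithmetic above against the OLIA algorithm: before the scaling by 2^scale, the per-ACK congestion-avoidance increase that mptcp_olia_cong_avoid() accumulates can be read as the following (a summary of what this code computes, not an authoritative restatement of the paper):

	\Delta w_r = \frac{w_r/\mathrm{rtt}_r^2}{\bigl(\sum_p w_p/\mathrm{rtt}_p\bigr)^2} + \frac{\alpha_r}{w_r},
	\qquad
	\alpha_r = \begin{cases}
	\;\;\,1/(n\,\lvert B\setminus M\rvert) & r \in B\setminus M \\
	-1/(n\,\lvert M\rvert)                 & r \in M,\ B\setminus M \neq \varnothing \\
	\;\;\,0                                & \text{otherwise}
	\end{cases}

where n is established_cnt, M is the set of subflows whose window equals max_cwnd, and B collects the subflows with the largest smoothed inter-loss gap relative to rtt^2. The epsilon_num/epsilon_den pair stores alpha_r as a fraction, and mptcp_get_rate() returns the (scaled) denominator of the first term.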
|
|
+
|
|
+static struct tcp_congestion_ops mptcp_olia = {
|
|
+ .init = mptcp_olia_init,
|
|
+ .ssthresh = tcp_reno_ssthresh,
|
|
+ .cong_avoid = mptcp_olia_cong_avoid,
|
|
+ .undo_cwnd = tcp_reno_undo_cwnd,
|
|
+ .set_state = mptcp_olia_set_state,
|
|
+ .owner = THIS_MODULE,
|
|
+ .name = "olia",
|
|
+};
|
|
+
|
|
+static int __init mptcp_olia_register(void)
|
|
+{
|
|
+ BUILD_BUG_ON(sizeof(struct mptcp_olia) > ICSK_CA_PRIV_SIZE);
|
|
+ return tcp_register_congestion_control(&mptcp_olia);
|
|
+}
|
|
+
|
|
+static void __exit mptcp_olia_unregister(void)
|
|
+{
|
|
+ tcp_unregister_congestion_control(&mptcp_olia);
|
|
+}
|
|
+
|
|
+module_init(mptcp_olia_register);
+module_exit(mptcp_olia_unregister);
+
+MODULE_AUTHOR("Ramin Khalili, Nicolas Gast, Jean-Yves Le Boudec");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL");
+MODULE_VERSION("0.1");
diff --git a/net/mptcp/mptcp_output.c b/net/mptcp/mptcp_output.c
|
|
new file mode 100644
|
|
index 000000000000..39eae2199802
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mptcp_output.c
|
|
@@ -0,0 +1,2009 @@
|
|
+/*
|
|
+ * MPTCP implementation - Sending side
|
|
+ *
|
|
+ * Initial Design & Implementation:
|
|
+ * Sébastien Barré <sebastien.barre@uclouvain.be>
|
|
+ *
|
|
+ * Current Maintainer & Author:
|
|
+ * Christoph Paasch <christoph.paasch@uclouvain.be>
|
|
+ *
|
|
+ * Additional authors:
|
|
+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
|
|
+ * Gregory Detal <gregory.detal@uclouvain.be>
|
|
+ * Fabien Duchêne <fabien.duchene@uclouvain.be>
|
|
+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
|
|
+ * Lavkesh Lahngir <lavkesh51@gmail.com>
|
|
+ * Andreas Ripke <ripke@neclab.eu>
|
|
+ * Vlad Dogaru <vlad.dogaru@intel.com>
|
|
+ * Octavian Purdila <octavian.purdila@intel.com>
|
|
+ * John Ronan <jronan@tssg.org>
|
|
+ * Catalin Nicutar <catalin.nicutar@gmail.com>
|
|
+ * Brandon Heller <brandonh@stanford.edu>
|
|
+ *
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU General Public License
|
|
+ * as published by the Free Software Foundation; either version
|
|
+ * 2 of the License, or (at your option) any later version.
|
|
+ */
|
|
+
|
|
+#include <linux/kconfig.h>
|
|
+#include <linux/skbuff.h>
|
|
+#include <linux/tcp.h>
|
|
+
|
|
+#include <net/mptcp.h>
|
|
+#include <net/mptcp_v4.h>
|
|
+#include <net/mptcp_v6.h>
|
|
+#include <net/sock.h>
|
|
+
|
|
+static const int mptcp_dss_len = MPTCP_SUB_LEN_DSS_ALIGN +
|
|
+ MPTCP_SUB_LEN_ACK_ALIGN +
|
|
+ MPTCP_SUB_LEN_SEQ_ALIGN;
|
|
+
|
|
+static inline int mptcp_sub_len_remove_addr(u16 bitfield)
|
|
+{
|
|
+ unsigned int c;
|
|
+ for (c = 0; bitfield; c++)
|
|
+ bitfield &= bitfield - 1;
|
|
+ return MPTCP_SUB_LEN_REMOVE_ADDR + c - 1;
|
|
+}
|
|
+
|
|
+int mptcp_sub_len_remove_addr_align(u16 bitfield)
|
|
+{
|
|
+ return ALIGN(mptcp_sub_len_remove_addr(bitfield), 4);
|
|
+}
|
|
+EXPORT_SYMBOL(mptcp_sub_len_remove_addr_align);
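A quick worked example of the two helpers above (informational note, not part of the patch): the loop counts the set bits of the REMOVE_ADDR id bitfield with the classic bitfield &= bitfield - 1 trick, so for instance:

	bitfield = 0x1a (address ids 1, 3 and 4)  ->  c = 3
	raw length      = MPTCP_SUB_LEN_REMOVE_ADDR + 3 - 1
	length on wire  = ALIGN(raw length, 4)

Every id past the first adds one byte, and the aligned variant is what mptcp_nl_pm_addr_signal() charges against the remaining TCP option space.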
|
|
+
|
|
+/* get the data-seq and end-data-seq and store them again in the
|
|
+ * tcp_skb_cb
|
|
+ */
|
|
+static bool mptcp_reconstruct_mapping(struct sk_buff *skb)
|
|
+{
|
|
+ const struct mp_dss *mpdss = (struct mp_dss *)TCP_SKB_CB(skb)->dss;
|
|
+ __be32 *p32;
|
|
+ __be16 *p16;
|
|
+
|
|
+ if (!mptcp_is_data_seq(skb))
|
|
+ return false;
|
|
+
|
|
+ if (!mpdss->M)
|
|
+ return false;
|
|
+
|
|
+ /* Move the pointer to the data-seq */
|
|
+ p32 = (__be32 *)mpdss;
|
|
+ p32++;
|
|
+ if (mpdss->A) {
|
|
+ p32++;
|
|
+ if (mpdss->a)
|
|
+ p32++;
|
|
+ }
|
|
+
|
|
+ TCP_SKB_CB(skb)->seq = ntohl(*p32);
|
|
+
|
|
+ /* Get the data_len to calculate the end_data_seq */
|
|
+ p32++;
|
|
+ p32++;
|
|
+ p16 = (__be16 *)p32;
|
|
+ TCP_SKB_CB(skb)->end_seq = ntohs(*p16) + TCP_SKB_CB(skb)->seq;
|
|
+
|
|
+ return true;
|
|
+}
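For orientation (informational note, not part of the patch): mptcp_reconstruct_mapping() walks the DSS option as laid out in RFC 6824. With a 4-byte data ACK present (flag A set, a clear), the offsets from the start of the option are:

	bytes  0..3   DSS header (kind, length, subtype, flags)
	bytes  4..7   data ACK                  (skipped: first p32++)
	bytes  8..11  data sequence number      -> TCP_SKB_CB(skb)->seq
	bytes 12..15  subflow sequence number   (skipped)
	bytes 16..17  data-level length         -> end_seq = seq + data_len

With an 8-byte data ACK (flags A and a both set) everything after the data ACK shifts down by another 4 bytes, which is what the additional p32++ under the a flag accounts for.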
|
|
+
|
|
+static bool mptcp_is_reinjected(const struct sk_buff *skb)
|
|
+{
|
|
+ return TCP_SKB_CB(skb)->mptcp_flags & MPTCP_REINJECT;
|
|
+}
|
|
+
|
|
+static void mptcp_find_and_set_pathmask(struct sock *meta_sk, struct sk_buff *skb)
|
|
+{
|
|
+ struct rb_node **p = &meta_sk->tcp_rtx_queue.rb_node;
|
|
+ struct rb_node *parent;
|
|
+ struct sk_buff *skb_it;
|
|
+
|
|
+ while (*p) {
|
|
+ parent = *p;
|
|
+ skb_it = rb_to_skb(parent);
|
|
+ if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb_it)->seq)) {
|
|
+ p = &parent->rb_left;
|
|
+ continue;
|
|
+ }
|
|
+ if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb_it)->seq)) {
|
|
+ p = &parent->rb_right;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ TCP_SKB_CB(skb)->path_mask = TCP_SKB_CB(skb_it)->path_mask;
|
|
+ break;
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Reinject data from one TCP subflow to the meta_sk. If sk == NULL, we are
|
|
+ * coming from the meta-retransmit-timer
|
|
+ */
|
|
+static void __mptcp_reinject_data(struct sk_buff *orig_skb, struct sock *meta_sk,
|
|
+ struct sock *sk, int clone_it,
|
|
+ enum tcp_queue tcp_queue)
|
|
+{
|
|
+ struct sk_buff *skb, *skb1;
|
|
+ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ struct mptcp_cb *mpcb = meta_tp->mpcb;
|
|
+ u32 seq, end_seq;
|
|
+
|
|
+ if (clone_it) {
|
|
+ /* pskb_copy is necessary here, because the TCP/IP-headers
|
|
+ * will be changed when it's going to be reinjected on another
|
|
+ * subflow.
|
|
+ */
|
|
+ tcp_skb_tsorted_save(orig_skb) {
|
|
+ skb = pskb_copy_for_clone(orig_skb, GFP_ATOMIC);
|
|
+ } tcp_skb_tsorted_restore(orig_skb);
|
|
+ } else {
|
|
+ if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) {
|
|
+ __skb_unlink(orig_skb, &sk->sk_write_queue);
|
|
+ } else {
|
|
+ list_del(&orig_skb->tcp_tsorted_anchor);
|
|
+ tcp_rtx_queue_unlink(orig_skb, sk);
|
|
+ INIT_LIST_HEAD(&orig_skb->tcp_tsorted_anchor);
|
|
+ }
|
|
+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
|
|
+ sk->sk_wmem_queued -= orig_skb->truesize;
|
|
+ sk_mem_uncharge(sk, orig_skb->truesize);
|
|
+ skb = orig_skb;
|
|
+ }
|
|
+ if (unlikely(!skb))
|
|
+ return;
|
|
+
|
|
+ /* Make sure that this list is clean */
|
|
+ tcp_skb_tsorted_anchor_cleanup(skb);
|
|
+
|
|
+ if (sk && !mptcp_reconstruct_mapping(skb)) {
|
|
+ __kfree_skb(skb);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ skb->sk = meta_sk;
|
|
+
|
|
+ /* Reset subflow-specific TCP control-data */
|
|
+ TCP_SKB_CB(skb)->sacked = 0;
|
|
+ TCP_SKB_CB(skb)->tcp_flags &= (TCPHDR_ACK | TCPHDR_PSH);
|
|
+
|
|
+	/* If it has already reached the destination, we don't have to reinject it */
|
|
+ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
|
|
+ __kfree_skb(skb);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ /* Only reinject segments that are fully covered by the mapping */
|
|
+ if (skb->len + (mptcp_is_data_fin(skb) ? 1 : 0) !=
|
|
+ TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
|
|
+ struct rb_node *parent, **p = &meta_sk->tcp_rtx_queue.rb_node;
|
|
+ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
|
|
+ u32 seq = TCP_SKB_CB(skb)->seq;
|
|
+
|
|
+ __kfree_skb(skb);
|
|
+
|
|
+ /* Ok, now we have to look for the full mapping in the meta
|
|
+ * send-queue :S
|
|
+ */
|
|
+
|
|
+ /* First, find the first skb that covers us */
|
|
+ while (*p) {
|
|
+ parent = *p;
|
|
+ skb = rb_to_skb(parent);
|
|
+
|
|
+ /* Not yet at the mapping? */
|
|
+ if (!after(end_seq, TCP_SKB_CB(skb)->seq)) {
|
|
+ p = &parent->rb_left;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (!before(seq, TCP_SKB_CB(skb)->end_seq)) {
|
|
+ p = &parent->rb_right;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (*p) {
|
|
+ /* We found it, now let's reinject everything */
|
|
+ skb = rb_to_skb(*p);
|
|
+
|
|
+ skb_rbtree_walk_from(skb) {
|
|
+ if (after(TCP_SKB_CB(skb)->end_seq, end_seq))
|
|
+ return;
|
|
+ __mptcp_reinject_data(skb, meta_sk, NULL, 1,
|
|
+ TCP_FRAG_IN_RTX_QUEUE);
|
|
+ }
|
|
+ }
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ /* Segment goes back to the MPTCP-layer. So, we need to zero the
|
|
+ * path_mask/dss.
|
|
+ */
|
|
+ memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len);
|
|
+
|
|
+ /* We need to find out the path-mask from the meta-write-queue
|
|
+ * to properly select a subflow.
|
|
+ */
|
|
+ mptcp_find_and_set_pathmask(meta_sk, skb);
|
|
+
|
|
+ /* If it's empty, just add */
|
|
+ if (skb_queue_empty(&mpcb->reinject_queue)) {
|
|
+ skb_queue_head(&mpcb->reinject_queue, skb);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+	/* Find the place to insert the skb - or simply drop it if its data
+	 * is already covered by other skbs in the reinject-queue.
+	 *
+	 * This is inspired by code from tcp_data_queue.
+	 */
|
|
+
|
|
+ skb1 = skb_peek_tail(&mpcb->reinject_queue);
|
|
+ seq = TCP_SKB_CB(skb)->seq;
|
|
+ while (1) {
|
|
+ if (!after(TCP_SKB_CB(skb1)->seq, seq))
|
|
+ break;
|
|
+ if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) {
|
|
+ skb1 = NULL;
|
|
+ break;
|
|
+ }
|
|
+ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
|
|
+ }
|
|
+
|
|
+	/* Does the skb overlap with the previous one? */
|
|
+ end_seq = TCP_SKB_CB(skb)->end_seq;
|
|
+ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
|
|
+ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
|
|
+ /* All the bits are present. Don't reinject */
|
|
+ __kfree_skb(skb);
|
|
+ return;
|
|
+ }
|
|
+ if (seq == TCP_SKB_CB(skb1)->seq) {
|
|
+ if (skb_queue_is_first(&mpcb->reinject_queue, skb1))
|
|
+ skb1 = NULL;
|
|
+ else
|
|
+ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
|
|
+ }
|
|
+ }
|
|
+ if (!skb1)
|
|
+ __skb_queue_head(&mpcb->reinject_queue, skb);
|
|
+ else
|
|
+ __skb_queue_after(&mpcb->reinject_queue, skb1, skb);
|
|
+
|
|
+ /* And clean segments covered by new one as whole. */
|
|
+ while (!skb_queue_is_last(&mpcb->reinject_queue, skb)) {
|
|
+ skb1 = skb_queue_next(&mpcb->reinject_queue, skb);
|
|
+
|
|
+ if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
|
|
+ break;
|
|
+
|
|
+ if (before(end_seq, TCP_SKB_CB(skb1)->end_seq))
|
|
+ break;
|
|
+
|
|
+ __skb_unlink(skb1, &mpcb->reinject_queue);
|
|
+ __kfree_skb(skb1);
|
|
+ }
|
|
+ return;
|
|
+}
|
|
+
|
|
+/* Inserts data into the reinject queue */
|
|
+void mptcp_reinject_data(struct sock *sk, int clone_it)
|
|
+{
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+ struct sk_buff *skb_it, *tmp;
|
|
+ enum tcp_queue tcp_queue;
|
|
+
|
|
+ /* It has already been closed - there is really no point in reinjecting */
|
|
+ if (meta_sk->sk_state == TCP_CLOSE)
|
|
+ return;
|
|
+
|
|
+ skb_queue_walk_safe(&sk->sk_write_queue, skb_it, tmp) {
|
|
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it);
|
|
+		/* Subflow SYNs and FINs are not reinjected, nor are empty
+		 * subflow-FINs that carry a DATA_FIN. The latter are
+		 * reinjected below (without the subflow-FIN flag).
+		 */
|
|
+ if (tcb->tcp_flags & TCPHDR_SYN ||
|
|
+ (tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) ||
|
|
+ (tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len))
|
|
+ continue;
|
|
+
|
|
+ if (mptcp_is_reinjected(skb_it))
|
|
+ continue;
|
|
+
|
|
+ tcb->mptcp_flags |= MPTCP_REINJECT;
|
|
+ __mptcp_reinject_data(skb_it, meta_sk, sk, clone_it,
|
|
+ TCP_FRAG_IN_WRITE_QUEUE);
|
|
+ }
|
|
+
|
|
+ skb_it = tcp_rtx_queue_head(sk);
|
|
+ skb_rbtree_walk_from_safe(skb_it, tmp) {
|
|
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it);
|
|
+
|
|
+		/* Subflow SYNs and FINs are not reinjected, nor are empty
+		 * subflow-FINs that carry a DATA_FIN. The latter are
+		 * reinjected below (without the subflow-FIN flag).
+		 */
|
|
+ if (tcb->tcp_flags & TCPHDR_SYN ||
|
|
+ (tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) ||
|
|
+ (tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len))
|
|
+ continue;
|
|
+
|
|
+ if (mptcp_is_reinjected(skb_it))
|
|
+ continue;
|
|
+
|
|
+ tcb->mptcp_flags |= MPTCP_REINJECT;
|
|
+ __mptcp_reinject_data(skb_it, meta_sk, sk, clone_it,
|
|
+ TCP_FRAG_IN_RTX_QUEUE);
|
|
+ }
|
|
+
|
|
+ skb_it = tcp_write_queue_tail(meta_sk);
|
|
+ tcp_queue = TCP_FRAG_IN_WRITE_QUEUE;
|
|
+
|
|
+ if (!skb_it) {
|
|
+ skb_it = skb_rb_last(&meta_sk->tcp_rtx_queue);
|
|
+ tcp_queue = TCP_FRAG_IN_RTX_QUEUE;
|
|
+ }
|
|
+
|
|
+ /* If sk has sent the empty data-fin, we have to reinject it too. */
|
|
+ if (skb_it && mptcp_is_data_fin(skb_it) && skb_it->len == 0 &&
|
|
+ TCP_SKB_CB(skb_it)->path_mask & mptcp_pi_to_flag(tcp_sk(sk)->mptcp->path_index)) {
|
|
+ __mptcp_reinject_data(skb_it, meta_sk, NULL, 1, tcp_queue);
|
|
+ }
|
|
+
|
|
+ tcp_sk(sk)->pf = 1;
|
|
+
|
|
+ mptcp_push_pending_frames(meta_sk);
|
|
+}
|
|
+EXPORT_SYMBOL(mptcp_reinject_data);
|
|
+
|
|
+static void mptcp_combine_dfin(const struct sk_buff *skb,
|
|
+ const struct sock *meta_sk,
|
|
+ struct sock *subsk)
|
|
+{
|
|
+ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ const struct mptcp_cb *mpcb = meta_tp->mpcb;
|
|
+
|
|
+ /* In infinite mapping we always try to combine */
|
|
+ if (mpcb->infinite_mapping_snd)
|
|
+ goto combine;
|
|
+
|
|
+ /* Don't combine, if they didn't combine when closing - otherwise we end
|
|
+ * up in TIME_WAIT, even if our app is smart enough to avoid it.
|
|
+ */
|
|
+ if (!mptcp_sk_can_recv(meta_sk) && !mpcb->dfin_combined)
|
|
+ return;
|
|
+
|
|
+ /* Don't combine if there is still outstanding data that remains to be
|
|
+ * DATA_ACKed, because otherwise we may never be able to deliver this.
|
|
+ */
|
|
+ if (meta_tp->snd_una != TCP_SKB_CB(skb)->seq)
|
|
+ return;
|
|
+
|
|
+combine:
|
|
+ if (tcp_close_state(subsk)) {
|
|
+ subsk->sk_shutdown |= SEND_SHUTDOWN;
|
|
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
|
|
+ }
|
|
+}
|
|
+
|
|
+static int mptcp_write_dss_mapping(const struct tcp_sock *tp, const struct sk_buff *skb,
|
|
+ __be32 *ptr)
|
|
+{
|
|
+ const struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
|
|
+ __be32 *start = ptr;
|
|
+ __u16 data_len;
|
|
+
|
|
+ *ptr++ = htonl(tcb->seq); /* data_seq */
|
|
+
|
|
+ /* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */
|
|
+ if (mptcp_is_data_fin(skb) && skb->len == 0)
|
|
+ *ptr++ = 0; /* subseq */
|
|
+ else
|
|
+ *ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */
|
|
+
|
|
+ if (tcb->mptcp_flags & MPTCPHDR_INF)
|
|
+ data_len = 0;
|
|
+ else
|
|
+ data_len = tcb->end_seq - tcb->seq;
|
|
+
|
|
+ if (tp->mpcb->dss_csum && data_len) {
|
|
+ __sum16 *p16 = (__sum16 *)ptr;
|
|
+ __be32 hdseq = mptcp_get_highorder_sndbits(skb, tp->mpcb);
|
|
+ __wsum csum;
|
|
+
|
|
+ *ptr = htonl(((data_len) << 16) |
|
|
+ (TCPOPT_EOL << 8) |
|
|
+ (TCPOPT_EOL));
|
|
+ csum = csum_partial(ptr - 2, 12, skb->csum);
|
|
+ p16++;
|
|
+ *p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum));
|
|
+ } else {
|
|
+ *ptr++ = htonl(((data_len) << 16) |
|
|
+ (TCPOPT_NOP << 8) |
|
|
+ (TCPOPT_NOP));
|
|
+ }
|
|
+
|
|
+ return ptr - start;
|
|
+}
|
|
+
|
|
+static int mptcp_write_dss_data_ack(const struct tcp_sock *tp, const struct sk_buff *skb,
|
|
+ __be32 *ptr)
|
|
+{
|
|
+ struct mp_dss *mdss = (struct mp_dss *)ptr;
|
|
+ __be32 *start = ptr;
|
|
+
|
|
+ mdss->kind = TCPOPT_MPTCP;
|
|
+ mdss->sub = MPTCP_SUB_DSS;
|
|
+ mdss->rsv1 = 0;
|
|
+ mdss->rsv2 = 0;
|
|
+ mdss->F = mptcp_is_data_fin(skb) ? 1 : 0;
|
|
+ mdss->m = 0;
|
|
+ mdss->M = mptcp_is_data_seq(skb) ? 1 : 0;
|
|
+ mdss->a = 0;
|
|
+ mdss->A = 1;
|
|
+ mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum);
|
|
+ ptr++;
|
|
+
|
|
+ *ptr++ = htonl(mptcp_meta_tp(tp)->rcv_nxt);
|
|
+
|
|
+ return ptr - start;
|
|
+}
|
|
+
|
|
+/* RFC6824 states that once a particular subflow mapping has been sent
|
|
+ * out it must never be changed. However, packets may be split while
|
|
+ * they are in the retransmission queue (due to SACK or ACKs) and that
|
|
+ * arguably means that we would change the mapping (e.g. split it or
+ * send out a subset of the initial mapping).
|
|
+ *
|
|
+ * Furthermore, the skb checksum is not always preserved across splits
|
|
+ * (e.g. mptcp_fragment) which would mean that we need to recompute
|
|
+ * the DSS checksum in this case.
|
|
+ *
|
|
+ * To avoid this we save the initial DSS mapping which allows us to
|
|
+ * send the same DSS mapping even for fragmented retransmits.
|
|
+ */
|
|
+static void mptcp_save_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb)
|
|
+{
|
|
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
|
|
+ __be32 *ptr = (__be32 *)tcb->dss;
|
|
+
|
|
+ tcb->mptcp_flags |= MPTCPHDR_SEQ;
|
|
+
|
|
+ ptr += mptcp_write_dss_data_ack(tp, skb, ptr);
|
|
+ ptr += mptcp_write_dss_mapping(tp, skb, ptr);
|
|
+}
|
|
+
|
|
+/* Write the MP_CAPABLE with data-option */
|
|
+static int mptcp_write_mpcapable_data(const struct tcp_sock *tp,
|
|
+ struct sk_buff *skb,
|
|
+ __be32 *ptr)
|
|
+{
|
|
+ struct mp_capable *mpc = (struct mp_capable *)ptr;
|
|
+ u8 length;
|
|
+
|
|
+ if (tp->mpcb->dss_csum)
|
|
+ length = MPTCPV1_SUB_LEN_CAPABLE_DATA_CSUM;
|
|
+ else
|
|
+ length = MPTCPV1_SUB_LEN_CAPABLE_DATA;
|
|
+
|
|
+ mpc->kind = TCPOPT_MPTCP;
|
|
+ mpc->len = length;
|
|
+ mpc->sub = MPTCP_SUB_CAPABLE;
|
|
+ mpc->ver = MPTCP_VERSION_1;
|
|
+ mpc->a = tp->mpcb->dss_csum;
|
|
+ mpc->b = 0;
|
|
+ mpc->rsv = 0;
|
|
+ mpc->h = 1;
|
|
+
|
|
+ ptr++;
|
|
+ memcpy(ptr, TCP_SKB_CB(skb)->dss, mptcp_dss_len);
|
|
+
|
|
+ mpc->sender_key = tp->mpcb->mptcp_loc_key;
|
|
+ mpc->receiver_key = tp->mpcb->mptcp_rem_key;
|
|
+
|
|
+ /* dss is in a union with inet_skb_parm and
|
|
+ * the IP layer expects zeroed IPCB fields.
|
|
+ */
|
|
+ memset(TCP_SKB_CB(skb)->dss, 0, mptcp_dss_len);
|
|
+
|
|
+ return MPTCPV1_SUB_LEN_CAPABLE_DATA_ALIGN / sizeof(*ptr);
|
|
+}
|
|
+
|
|
+/* Write the saved DSS mapping to the header */
|
|
+static int mptcp_write_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb,
|
|
+ __be32 *ptr)
|
|
+{
|
|
+ int length;
|
|
+ __be32 *start = ptr;
|
|
+
|
|
+ if (tp->mpcb->rem_key_set) {
|
|
+ memcpy(ptr, TCP_SKB_CB(skb)->dss, mptcp_dss_len);
|
|
+
|
|
+ /* update the data_ack */
|
|
+ start[1] = htonl(mptcp_meta_tp(tp)->rcv_nxt);
|
|
+
|
|
+ length = mptcp_dss_len / sizeof(*ptr);
|
|
+ } else {
|
|
+ memcpy(ptr, TCP_SKB_CB(skb)->dss, MPTCP_SUB_LEN_DSS_ALIGN);
|
|
+
|
|
+ ptr++;
|
|
+ memcpy(ptr, TCP_SKB_CB(skb)->dss + 2, MPTCP_SUB_LEN_SEQ_ALIGN);
|
|
+
|
|
+ length = (MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_SEQ_ALIGN) / sizeof(*ptr);
|
|
+ }
|
|
+
|
|
+ /* dss is in a union with inet_skb_parm and
|
|
+ * the IP layer expects zeroed IPCB fields.
|
|
+ */
|
|
+ memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len);
|
|
+
|
|
+ return length;
|
|
+}
|
|
+
|
|
+static bool mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, int reinject)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ const struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+ struct mptcp_cb *mpcb = tp->mpcb;
|
|
+ struct tcp_skb_cb *tcb;
|
|
+ struct sk_buff *subskb = NULL;
|
|
+
|
|
+ if (!reinject)
|
|
+ TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ?
|
|
+ MPTCPHDR_SEQ64_INDEX : 0);
|
|
+
|
|
+ tcp_skb_tsorted_save(skb) {
|
|
+ subskb = pskb_copy_for_clone(skb, GFP_ATOMIC);
|
|
+ } tcp_skb_tsorted_restore(skb);
|
|
+ if (!subskb)
|
|
+ return false;
|
|
+
|
|
+ /* At the subflow-level we need to call again tcp_init_tso_segs. We
|
|
+ * force this, by setting pcount to 0. It has been set to 1 prior to
|
|
+ * the call to mptcp_skb_entail.
|
|
+ */
|
|
+ tcp_skb_pcount_set(subskb, 0);
|
|
+
|
|
+ TCP_SKB_CB(skb)->path_mask |= mptcp_pi_to_flag(tp->mptcp->path_index);
|
|
+
|
|
+ /* Compute checksum */
|
|
+ if (tp->mpcb->dss_csum)
|
|
+ subskb->csum = skb->csum = skb_checksum(skb, 0, skb->len, 0);
|
|
+
|
|
+ tcb = TCP_SKB_CB(subskb);
|
|
+
|
|
+ if (tp->mpcb->send_infinite_mapping &&
|
|
+ !tp->mpcb->infinite_mapping_snd &&
|
|
+ !before(tcb->seq, mptcp_meta_tp(tp)->snd_nxt)) {
|
|
+ tp->mptcp->fully_established = 1;
|
|
+ tp->mpcb->infinite_mapping_snd = 1;
|
|
+ tp->mptcp->infinite_cutoff_seq = tp->write_seq;
|
|
+ tcb->mptcp_flags |= MPTCPHDR_INF;
|
|
+ }
|
|
+
|
|
+ if (mptcp_is_data_fin(subskb))
|
|
+ mptcp_combine_dfin(subskb, meta_sk, sk);
|
|
+
|
|
+ mptcp_save_dss_data_seq(tp, subskb);
|
|
+
|
|
+ if (mpcb->send_mptcpv1_mpcapable) {
|
|
+ TCP_SKB_CB(subskb)->mptcp_flags |= MPTCPHDR_MPC_DATA;
|
|
+ mpcb->send_mptcpv1_mpcapable = 0;
|
|
+ }
|
|
+
|
|
+ tcb->seq = tp->write_seq;
|
|
+
|
|
+ /* Take into account seg len */
|
|
+ tp->write_seq += subskb->len + ((tcb->tcp_flags & TCPHDR_FIN) ? 1 : 0);
|
|
+ tcb->end_seq = tp->write_seq;
|
|
+
|
|
+ /* txstamp_ack is handled at the meta-level */
|
|
+ tcb->txstamp_ack = 0;
|
|
+
|
|
+ /* If it's a non-payload DATA_FIN (also no subflow-fin), the
|
|
+ * segment is not part of the subflow but on a meta-only-level.
|
|
+ */
|
|
+ if (!mptcp_is_data_fin(subskb) || tcb->end_seq != tcb->seq) {
|
|
+ /* Make sure that this list is clean */
|
|
+ INIT_LIST_HEAD(&subskb->tcp_tsorted_anchor);
|
|
+
|
|
+ tcp_add_write_queue_tail(sk, subskb);
|
|
+ sk->sk_wmem_queued += subskb->truesize;
|
|
+ sk_mem_charge(sk, subskb->truesize);
|
|
+ } else {
|
|
+ /* Necessary to initialize for tcp_transmit_skb. mss of 1, as
|
|
+ * skb->len = 0 will force tso_segs to 1.
|
|
+ */
|
|
+ tcp_init_tso_segs(subskb, 1);
|
|
+
|
|
+		/* Empty data-fins are sent immediately on the subflow */
|
|
+ if (tcp_transmit_skb(sk, subskb, 0, GFP_ATOMIC))
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ if (!tp->mptcp->fully_established) {
|
|
+ tp->mptcp->second_packet = 1;
|
|
+ tp->mptcp->last_end_data_seq = TCP_SKB_CB(skb)->end_seq;
|
|
+ }
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/* Fragment an skb and update the mptcp meta-data. Due to reinject, we
|
|
+ * might need to undo some operations done by tcp_fragment.
|
|
+ *
|
|
+ * Be careful, the skb may come from 3 different places:
|
|
+ * - The send-queue (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
|
|
+ * - The retransmit-queue (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
|
|
+ * - The reinject-queue (reinject == -1)
|
|
+ */
|
|
+static int mptcp_fragment(struct sock *meta_sk, enum tcp_queue tcp_queue,
|
|
+ struct sk_buff *skb, u32 len,
|
|
+ gfp_t gfp, int reinject)
|
|
+{
|
|
+ int ret, diff, old_factor;
|
|
+ struct sk_buff *buff;
|
|
+ u8 flags;
|
|
+
|
|
+ if (skb_headlen(skb) < len)
|
|
+ diff = skb->len - len;
|
|
+ else
|
|
+ diff = skb->data_len;
|
|
+ old_factor = tcp_skb_pcount(skb);
|
|
+
|
|
+ /* The mss_now in tcp_fragment is used to set the tso_segs of the skb.
|
|
+ * At the MPTCP-level we do not care about the absolute value. All we
|
|
+ * care about is that it is set to 1 for accurate packets_out
|
|
+ * accounting.
|
|
+ */
|
|
+ ret = tcp_fragment(meta_sk, tcp_queue, skb, len, UINT_MAX, gfp);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
|
|
+ buff = skb->next;
|
|
+ else
|
|
+ buff = skb_rb_next(skb);
|
|
+
|
|
+ flags = TCP_SKB_CB(skb)->mptcp_flags;
|
|
+ TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN);
|
|
+ TCP_SKB_CB(buff)->mptcp_flags = flags;
|
|
+ TCP_SKB_CB(buff)->path_mask = TCP_SKB_CB(skb)->path_mask;
|
|
+
|
|
+ /* If reinject == 1, the buff will be added to the reinject
|
|
+ * queue, which is currently not part of memory accounting. So
|
|
+ * undo the changes done by tcp_fragment and update the
|
|
+ * reinject queue. Also, undo changes to the packet counters.
|
|
+ */
|
|
+ if (reinject == 1) {
|
|
+ int undo = buff->truesize - diff;
|
|
+ meta_sk->sk_wmem_queued -= undo;
|
|
+ sk_mem_uncharge(meta_sk, undo);
|
|
+
|
|
+ tcp_sk(meta_sk)->mpcb->reinject_queue.qlen++;
|
|
+ if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
|
|
+ meta_sk->sk_write_queue.qlen--;
|
|
+
|
|
+ if (!before(tcp_sk(meta_sk)->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
|
|
+ undo = old_factor - tcp_skb_pcount(skb) -
|
|
+ tcp_skb_pcount(buff);
|
|
+ if (undo)
|
|
+ tcp_adjust_pcount(meta_sk, skb, -undo);
|
|
+ }
|
|
+
|
|
+ /* tcp_fragment's call to sk_stream_alloc_skb initializes the
|
|
+ * tcp_tsorted_anchor. We need to revert this as it clashes
|
|
+ * with the refdst pointer.
|
|
+ */
|
|
+ tcp_skb_tsorted_anchor_cleanup(buff);
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* Inspired by tcp_write_wakeup */
|
|
+int mptcp_write_wakeup(struct sock *meta_sk, int mib)
|
|
+{
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ struct sk_buff *skb;
|
|
+ int ans = 0;
|
|
+
|
|
+ if (meta_sk->sk_state == TCP_CLOSE)
|
|
+ return -1;
|
|
+
|
|
+ skb = tcp_send_head(meta_sk);
|
|
+ if (skb &&
|
|
+ before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(meta_tp))) {
|
|
+ unsigned int mss;
|
|
+ unsigned int seg_size = tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq;
|
|
+ struct sock *subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, true);
|
|
+ struct tcp_sock *subtp;
|
|
+
|
|
+ WARN_ON(TCP_SKB_CB(skb)->sacked);
|
|
+
|
|
+ if (!subsk)
|
|
+ goto window_probe;
|
|
+ subtp = tcp_sk(subsk);
|
|
+ mss = tcp_current_mss(subsk);
|
|
+
|
|
+ seg_size = min(tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq,
|
|
+ tcp_wnd_end(subtp) - subtp->write_seq);
|
|
+
|
|
+ if (before(meta_tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
|
|
+ meta_tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
|
|
+
|
|
+		/* We are probing the opening of a window
+		 * but the window size is != 0 -
+		 * this must have been a result of sender-side SWS avoidance.
+		 */
|
|
+ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
|
|
+ skb->len > mss) {
|
|
+ seg_size = min(seg_size, mss);
|
|
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
|
|
+ if (mptcp_fragment(meta_sk, TCP_FRAG_IN_WRITE_QUEUE,
|
|
+ skb, seg_size, GFP_ATOMIC, 0))
|
|
+ return -1;
|
|
+ } else if (!tcp_skb_pcount(skb)) {
|
|
+ /* see mptcp_write_xmit on why we use UINT_MAX */
|
|
+ tcp_set_skb_tso_segs(skb, UINT_MAX);
|
|
+ }
|
|
+
|
|
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
|
|
+ if (!mptcp_skb_entail(subsk, skb, 0))
|
|
+ return -1;
|
|
+
|
|
+ mptcp_check_sndseq_wrap(meta_tp, TCP_SKB_CB(skb)->end_seq -
|
|
+ TCP_SKB_CB(skb)->seq);
|
|
+ tcp_event_new_data_sent(meta_sk, skb);
|
|
+
|
|
+ __tcp_push_pending_frames(subsk, mss, TCP_NAGLE_PUSH);
|
|
+ tcp_update_skb_after_send(meta_sk, skb, meta_tp->tcp_wstamp_ns);
|
|
+ meta_tp->lsndtime = tcp_jiffies32;
|
|
+
|
|
+ return 0;
|
|
+ } else {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+window_probe:
|
|
+ if (between(meta_tp->snd_up, meta_tp->snd_una + 1,
|
|
+ meta_tp->snd_una + 0xFFFF)) {
|
|
+ mptcp_for_each_sub(meta_tp->mpcb, mptcp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (mptcp_sk_can_send_ack(sk_it))
|
|
+ tcp_xmit_probe_skb(sk_it, 1, mib);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* At least one of the tcp_xmit_probe_skb's has to succeed */
|
|
+ mptcp_for_each_sub(meta_tp->mpcb, mptcp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+ int ret;
|
|
+
|
|
+ if (!mptcp_sk_can_send_ack(sk_it))
|
|
+ continue;
|
|
+
|
|
+ ret = tcp_xmit_probe_skb(sk_it, 0, mib);
|
|
+ if (unlikely(ret > 0))
|
|
+ ans = ret;
|
|
+ }
|
|
+ return ans;
|
|
+ }
|
|
+}
|
|
+
|
|
+bool mptcp_write_xmit(struct sock *meta_sk, unsigned int mss_now, int nonagle,
|
|
+ int push_one, gfp_t gfp)
|
|
+{
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *subtp;
|
|
+ bool is_rwnd_limited = false;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct sock *subsk = NULL;
|
|
+ struct mptcp_cb *mpcb = meta_tp->mpcb;
|
|
+ struct sk_buff *skb;
|
|
+ int reinject = 0;
|
|
+ unsigned int sublimit;
|
|
+ __u32 path_mask = 0;
|
|
+
|
|
+ tcp_mstamp_refresh(meta_tp);
|
|
+
|
|
+ if (inet_csk(meta_sk)->icsk_retransmits) {
|
|
+ /* If the timer already once fired, retransmit the head of the
|
|
+ * queue to unblock us ASAP.
|
|
+ */
|
|
+ if (meta_tp->packets_out && !mpcb->infinite_mapping_snd)
|
|
+ mptcp_retransmit_skb(meta_sk, tcp_rtx_queue_head(meta_sk));
|
|
+ }
|
|
+
|
|
+ while ((skb = mpcb->sched_ops->next_segment(meta_sk, &reinject, &subsk,
|
|
+ &sublimit))) {
|
|
+ enum tcp_queue tcp_queue = TCP_FRAG_IN_WRITE_QUEUE;
|
|
+ unsigned int limit;
|
|
+
|
|
+ WARN(TCP_SKB_CB(skb)->sacked, "sacked: %u reinject: %u",
|
|
+ TCP_SKB_CB(skb)->sacked, reinject);
|
|
+
|
|
+ subtp = tcp_sk(subsk);
|
|
+ mss_now = tcp_current_mss(subsk);
|
|
+
|
|
+ if (reinject == 1) {
|
|
+ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
|
|
+ /* Segment already reached the peer, take the next one */
|
|
+ __skb_unlink(skb, &mpcb->reinject_queue);
|
|
+ __kfree_skb(skb);
|
|
+ continue;
|
|
+ }
|
|
+ } else if (reinject == -1) {
|
|
+ tcp_queue = TCP_FRAG_IN_RTX_QUEUE;
|
|
+ }
|
|
+
|
|
+ /* If the segment was cloned (e.g. a meta retransmission),
|
|
+ * the header must be expanded/copied so that there is no
|
|
+ * corruption of TSO information.
|
|
+ */
|
|
+ if (skb_unclone(skb, GFP_ATOMIC))
|
|
+ break;
|
|
+
|
|
+ if (unlikely(!tcp_snd_wnd_test(meta_tp, skb, mss_now))) {
|
|
+ is_rwnd_limited = true;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ /* Force tso_segs to 1 by using UINT_MAX.
|
|
+ * We actually don't care about the exact number of segments
|
|
+ * emitted on the subflow. We just need to set tso_segs, because
|
|
+ * we still need an accurate packets_out count in
|
|
+ * tcp_event_new_data_sent.
|
|
+ */
|
|
+ tcp_set_skb_tso_segs(skb, UINT_MAX);
|
|
+
|
|
+ /* Check for Nagle, regardless of tso_segs. If the segment is
|
|
+ * actually larger than mss_now (TSO segment), then
|
|
+ * tcp_nagle_check will have partial == false and always trigger
|
|
+ * the transmission.
|
|
+ * tcp_write_xmit has a TSO-level nagle check which is not
|
|
+ * subject to the MPTCP-level. It is based on the properties of
|
|
+ * the subflow, not the MPTCP-level.
|
|
+ * When the segment is a reinjection or redundant scheduled
|
|
+ * segment, nagle check at meta-level may prevent
|
|
+ * sending. This could hurt with certain schedulers, as they rely
|
|
+ * on reinjection to recover from a window-stall or to reduce latency.
|
|
+ * Therefore, Nagle check should be disabled in that case.
|
|
+ */
|
|
+ if (!reinject &&
|
|
+ unlikely(!tcp_nagle_test(meta_tp, skb, mss_now,
|
|
+ (tcp_skb_is_last(meta_sk, skb) ?
|
|
+ nonagle : TCP_NAGLE_PUSH))))
|
|
+ break;
|
|
+
|
|
+ limit = mss_now;
|
|
+ /* skb->len > mss_now is the equivalent of tso_segs > 1 in
|
|
+ * tcp_write_xmit. Otherwise split-point would return 0.
|
|
+ */
|
|
+ if (skb->len > mss_now && !tcp_urg_mode(meta_tp))
|
|
+ /* We limit the size of the skb so that it fits into the
|
|
+ * window. Call tcp_mss_split_point to avoid duplicating
|
|
+ * code.
|
|
+ * We really only care about fitting the skb into the
|
|
+ * window. That's why we use UINT_MAX. If the skb does
|
|
+ * not fit into the cwnd_quota or the NIC's max-segs
|
|
+ * limitation, it will be split by the subflow's
|
|
+ * tcp_write_xmit which does the appropriate call to
|
|
+ * tcp_mss_split_point.
|
|
+ */
|
|
+ limit = tcp_mss_split_point(meta_sk, skb, mss_now,
|
|
+ UINT_MAX / mss_now,
|
|
+ nonagle);
|
|
+
|
|
+ if (sublimit)
|
|
+ limit = min(limit, sublimit);
|
|
+
|
|
+ if (skb->len > limit &&
|
|
+ unlikely(mptcp_fragment(meta_sk, tcp_queue,
|
|
+ skb, limit, gfp, reinject)))
|
|
+ break;
|
|
+
|
|
+ if (!mptcp_skb_entail(subsk, skb, reinject))
|
|
+ break;
|
|
+
|
|
+ if (reinject <= 0)
|
|
+ tcp_update_skb_after_send(meta_sk, skb, meta_tp->tcp_wstamp_ns);
|
|
+ meta_tp->lsndtime = tcp_jiffies32;
|
|
+
|
|
+ path_mask |= mptcp_pi_to_flag(subtp->mptcp->path_index);
|
|
+
|
|
+ if (!reinject) {
|
|
+ mptcp_check_sndseq_wrap(meta_tp,
|
|
+ TCP_SKB_CB(skb)->end_seq -
|
|
+ TCP_SKB_CB(skb)->seq);
|
|
+ tcp_event_new_data_sent(meta_sk, skb);
|
|
+ }
|
|
+
|
|
+ tcp_minshall_update(meta_tp, mss_now, skb);
|
|
+
|
|
+ if (reinject > 0) {
|
|
+ __skb_unlink(skb, &mpcb->reinject_queue);
|
|
+ kfree_skb(skb);
|
|
+ }
|
|
+
|
|
+ if (push_one)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (is_rwnd_limited)
|
|
+ tcp_chrono_start(meta_sk, TCP_CHRONO_RWND_LIMITED);
|
|
+ else
|
|
+ tcp_chrono_stop(meta_sk, TCP_CHRONO_RWND_LIMITED);
|
|
+
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ subsk = mptcp_to_sock(mptcp);
|
|
+ subtp = tcp_sk(subsk);
|
|
+
|
|
+ if (!(path_mask & mptcp_pi_to_flag(subtp->mptcp->path_index)))
|
|
+ continue;
|
|
+
|
|
+ mss_now = tcp_current_mss(subsk);
|
|
+
|
|
+ /* Nagle is handled at the MPTCP-layer, so
|
|
+ * always push on the subflow
|
|
+ */
|
|
+ __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH);
|
|
+ }
|
|
+
|
|
+ return !meta_tp->packets_out && tcp_send_head(meta_sk);
|
|
+}
|
|
+
|
|
+void mptcp_write_space(struct sock *sk)
|
|
+{
|
|
+ mptcp_push_pending_frames(mptcp_meta_sk(sk));
|
|
+}
|
|
+
|
|
+u32 __mptcp_select_window(struct sock *sk)
|
|
+{
|
|
+ struct inet_connection_sock *icsk = inet_csk(sk);
|
|
+ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+ int mss, free_space, full_space, window;
|
|
+
|
|
+ /* MSS for the peer's data. Previous versions used mss_clamp
|
|
+ * here. I don't know if the value based on our guesses
|
|
+ * of peer's MSS is better for the performance. It's more correct
|
|
+ * but may be worse for the performance because of rcv_mss
|
|
+ * fluctuations. --SAW 1998/11/1
|
|
+ */
|
|
+ mss = icsk->icsk_ack.rcv_mss;
|
|
+ free_space = tcp_space(meta_sk);
|
|
+ full_space = min_t(int, meta_tp->window_clamp,
|
|
+ tcp_full_space(meta_sk));
|
|
+
|
|
+ if (mss > full_space)
|
|
+ mss = full_space;
|
|
+
|
|
+ if (free_space < (full_space >> 1)) {
|
|
+ /* If free_space is decreasing due to mostly meta-level
|
|
+ * out-of-order packets, don't turn off the quick-ack mode.
|
|
+ */
|
|
+ if (meta_tp->rcv_nxt - meta_tp->copied_seq > ((full_space - free_space) >> 1))
|
|
+ icsk->icsk_ack.quick = 0;
|
|
+
|
|
+ if (tcp_memory_pressure)
|
|
+ /* TODO this has to be adapted when we support different
|
|
+ * MSS's among the subflows.
|
|
+ */
|
|
+ meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh,
|
|
+ 4U * meta_tp->advmss);
|
|
+
|
|
+ if (free_space < mss)
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ if (free_space > meta_tp->rcv_ssthresh)
|
|
+ free_space = meta_tp->rcv_ssthresh;
|
|
+
|
|
+ /* Don't do rounding if we are using window scaling, since the
|
|
+ * scaled window will not line up with the MSS boundary anyway.
|
|
+ */
|
|
+ window = meta_tp->rcv_wnd;
|
|
+ if (tp->rx_opt.rcv_wscale) {
|
|
+ window = free_space;
|
|
+
|
|
+ /* Advertise enough space so that it won't get scaled away.
|
|
+ * Important case: prevent zero window announcement if
|
|
+ * 1<<rcv_wscale > mss.
|
|
+ */
|
|
+ if (((window >> tp->rx_opt.rcv_wscale) << tp->
|
|
+ rx_opt.rcv_wscale) != window)
|
|
+ window = (((window >> tp->rx_opt.rcv_wscale) + 1)
|
|
+ << tp->rx_opt.rcv_wscale);
|
|
+ } else {
|
|
+ /* Get the largest window that is a nice multiple of mss.
|
|
+ * Window clamp already applied above.
|
|
+ * If our current window offering is within 1 mss of the
|
|
+ * free space we just keep it. This prevents the divide
|
|
+ * and multiply from happening most of the time.
|
|
+ * We also don't do any window rounding when the free space
|
|
+ * is too small.
|
|
+ */
|
|
+ if (window <= free_space - mss || window > free_space)
|
|
+ window = (free_space / mss) * mss;
|
|
+ else if (mss == full_space &&
|
|
+ free_space > window + (full_space >> 1))
|
|
+ window = free_space;
|
|
+ }
|
|
+
|
|
+ return window;
|
|
+}
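For illustration, a minimal userspace sketch (hypothetical helper name, and simplified: the real __mptcp_select_window() above also keeps the current window when it is within one MSS of the free space) of the two rounding rules used above. With window scaling, the advertised value is rounded up to the next multiple of 1 << rcv_wscale so a non-zero window cannot be scaled away to zero; without scaling, it is rounded down to a multiple of the MSS.

#include <stdio.h>

/* Hypothetical helper mirroring the rounding logic in __mptcp_select_window() */
static unsigned int round_window(unsigned int free_space, unsigned int mss,
                                 unsigned int rcv_wscale)
{
    unsigned int window = free_space;

    if (rcv_wscale) {
        /* Round up to a multiple of 1 << rcv_wscale so the value does
         * not become zero after right-shifting by rcv_wscale.
         */
        if (((window >> rcv_wscale) << rcv_wscale) != window)
            window = ((window >> rcv_wscale) + 1) << rcv_wscale;
    } else {
        /* Without window scaling, round down to a multiple of the MSS. */
        window = (free_space / mss) * mss;
    }
    return window;
}

int main(void)
{
    printf("%u\n", round_window(10000, 1460, 7)); /* 10112, a multiple of 128 */
    printf("%u\n", round_window(10000, 1460, 0)); /* 8760, i.e. 6 * 1460 */
    return 0;
}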
|
|
+
|
|
+void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts,
|
|
+ unsigned *remaining)
|
|
+{
|
|
+ const struct tcp_sock *tp = tcp_sk(sk);
|
|
+
|
|
+ opts->options |= OPTION_MPTCP;
|
|
+ if (is_master_tp(tp)) {
|
|
+ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYN;
|
|
+ opts->mptcp_ver = tp->mptcp_ver;
|
|
+
|
|
+ if (tp->mptcp_ver >= MPTCP_VERSION_1)
|
|
+ *remaining -= MPTCPV1_SUB_LEN_CAPABLE_SYN_ALIGN;
|
|
+ else
|
|
+ *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
|
|
+
|
|
+ opts->mp_capable.sender_key = tp->mptcp_loc_key;
|
|
+ opts->dss_csum = !!sysctl_mptcp_checksum;
|
|
+ } else {
|
|
+ const struct mptcp_cb *mpcb = tp->mpcb;
|
|
+
|
|
+ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYN;
|
|
+ *remaining -= MPTCP_SUB_LEN_JOIN_SYN_ALIGN;
|
|
+ opts->mp_join_syns.token = mpcb->mptcp_rem_token;
|
|
+ opts->mp_join_syns.low_prio = tp->mptcp->low_prio;
|
|
+ opts->addr_id = tp->mptcp->loc_id;
|
|
+ opts->mp_join_syns.sender_nonce = tp->mptcp->mptcp_loc_nonce;
|
|
+ }
|
|
+}
|
|
+
|
|
+void mptcp_synack_options(struct request_sock *req,
|
|
+ struct tcp_out_options *opts, unsigned *remaining)
|
|
+{
|
|
+ struct mptcp_request_sock *mtreq;
|
|
+ mtreq = mptcp_rsk(req);
|
|
+
|
|
+ opts->options |= OPTION_MPTCP;
|
|
+ /* MPCB not yet set - thus it's a new MPTCP-session */
|
|
+ if (!mtreq->is_sub) {
|
|
+ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYNACK;
|
|
+ opts->mptcp_ver = mtreq->mptcp_ver;
|
|
+ opts->mp_capable.sender_key = mtreq->mptcp_loc_key;
|
|
+ opts->dss_csum = !!sysctl_mptcp_checksum || mtreq->dss_csum;
|
|
+ if (mtreq->mptcp_ver >= MPTCP_VERSION_1) {
|
|
+ *remaining -= MPTCPV1_SUB_LEN_CAPABLE_SYNACK_ALIGN;
|
|
+ } else {
|
|
+ *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
|
|
+ }
|
|
+ } else {
|
|
+ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYNACK;
|
|
+ opts->mp_join_syns.sender_truncated_mac =
|
|
+ mtreq->mptcp_hash_tmac;
|
|
+ opts->mp_join_syns.sender_nonce = mtreq->mptcp_loc_nonce;
|
|
+ opts->mp_join_syns.low_prio = mtreq->low_prio;
|
|
+ opts->addr_id = mtreq->loc_id;
|
|
+ *remaining -= MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN;
|
|
+ }
|
|
+}
|
|
+
|
|
+void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
|
|
+ struct tcp_out_options *opts, unsigned *size)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct mptcp_cb *mpcb = tp->mpcb;
|
|
+ const struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
|
|
+
|
|
+ /* We are coming from tcp_current_mss with the meta_sk as an argument.
|
|
+ * It does not make sense to check for the options, because when the
|
|
+ * segment gets sent, another subflow will be chosen.
|
|
+ */
|
|
+ if (!skb && is_meta_sk(sk))
|
|
+ return;
|
|
+
|
|
+ if (unlikely(tp->send_mp_fclose)) {
|
|
+ opts->options |= OPTION_MPTCP;
|
|
+ opts->mptcp_options |= OPTION_MP_FCLOSE;
|
|
+ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
|
|
+ *size += MPTCP_SUB_LEN_FCLOSE_ALIGN;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ /* 1. If we are the sender of the infinite-mapping, we need the
|
|
+ * MPTCPHDR_INF-flag, because a retransmission of the
|
|
+ * infinite-announcement still needs the mptcp-option.
|
|
+ *
|
|
+ * We need infinite_cutoff_seq, because retransmissions from before
|
|
+ * the infinite-cutoff-moment still need the MPTCP-signalling to stay
|
|
+ * consistent.
|
|
+ *
|
|
+ * 2. If we are the receiver of the infinite-mapping, we always skip
|
|
+ * mptcp-options, because acknowledgments from before the
|
|
+ * infinite-mapping point have already been sent out.
|
|
+ *
|
|
+ * I know, the whole infinite-mapping stuff is ugly...
|
|
+ *
|
|
+ * TODO: Handle wrapped data-sequence numbers
|
|
+ * (even if it's very unlikely)
|
|
+ */
|
|
+ if (unlikely(mpcb->infinite_mapping_snd) &&
|
|
+ ((mpcb->send_infinite_mapping && tcb &&
|
|
+ mptcp_is_data_seq(skb) &&
|
|
+ !(tcb->mptcp_flags & MPTCPHDR_INF) &&
|
|
+ !before(tcb->seq, tp->mptcp->infinite_cutoff_seq)) ||
|
|
+ !mpcb->send_infinite_mapping))
|
|
+ return;
|
|
+
|
|
+ if (unlikely(tp->mptcp->include_mpc)) {
|
|
+ opts->options |= OPTION_MPTCP;
|
|
+ opts->mptcp_options |= OPTION_MP_CAPABLE |
|
|
+ OPTION_TYPE_ACK;
|
|
+
|
|
+ if (mpcb->mptcp_ver >= MPTCP_VERSION_1)
|
|
+ *size += MPTCPV1_SUB_LEN_CAPABLE_ACK_ALIGN;
|
|
+ else
|
|
+ *size += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN;
|
|
+
|
|
+ opts->mptcp_ver = mpcb->mptcp_ver;
|
|
+ opts->mp_capable.sender_key = mpcb->mptcp_loc_key;
|
|
+ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
|
|
+ opts->dss_csum = mpcb->dss_csum;
|
|
+
|
|
+ if (skb)
|
|
+ tp->mptcp->include_mpc = 0;
|
|
+ }
|
|
+ if (unlikely(tp->mptcp->pre_established) &&
|
|
+ (!skb || !(tcb->tcp_flags & (TCPHDR_FIN | TCPHDR_RST)))) {
|
|
+ opts->options |= OPTION_MPTCP;
|
|
+ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_ACK;
|
|
+ *size += MPTCP_SUB_LEN_JOIN_ACK_ALIGN;
|
|
+ }
|
|
+
|
|
+ if (unlikely(mpcb->addr_signal) && mpcb->pm_ops->addr_signal &&
|
|
+ mpcb->mptcp_ver >= MPTCP_VERSION_1 && skb && !mptcp_is_data_seq(skb)) {
|
|
+ mpcb->pm_ops->addr_signal(sk, size, opts, skb);
|
|
+
|
|
+ if (opts->add_addr_v6)
|
|
+ /* Skip subsequent options */
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (!tp->mptcp->include_mpc && !tp->mptcp->pre_established) {
|
|
+ opts->options |= OPTION_MPTCP;
|
|
+ opts->mptcp_options |= OPTION_DATA_ACK;
|
|
+ /* If !skb, we come from tcp_current_mss and thus we always
|
|
+ * assume that the DSS-option will be set for the data-packet.
|
|
+ */
|
|
+ if (skb && !mptcp_is_data_seq(skb) && mpcb->rem_key_set) {
|
|
+ *size += MPTCP_SUB_LEN_ACK_ALIGN;
|
|
+ } else if ((skb && mptcp_is_data_mpcapable(skb)) ||
|
|
+ (!skb && tp->mpcb->send_mptcpv1_mpcapable)) {
|
|
+ *size += MPTCPV1_SUB_LEN_CAPABLE_DATA_ALIGN;
|
|
+ } else {
|
|
+ /* It doesn't matter whether the csum is included or not. It will be
|
|
+ * either 10 or 12, and thus aligned = 12
|
|
+ */
|
|
+ if (mpcb->rem_key_set)
|
|
+ *size += MPTCP_SUB_LEN_ACK_ALIGN +
|
|
+ MPTCP_SUB_LEN_SEQ_ALIGN;
|
|
+ else
|
|
+ *size += MPTCP_SUB_LEN_SEQ_ALIGN;
|
|
+ }
|
|
+
|
|
+ *size += MPTCP_SUB_LEN_DSS_ALIGN;
|
|
+ }
|
|
+
|
|
+ /* In fallback mp_fail-mode, we have to repeat it until the fallback
|
|
+ * has been done by the sender
|
|
+ */
|
|
+ if (unlikely(tp->mptcp->send_mp_fail) && skb &&
|
|
+ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_FAIL) {
|
|
+ opts->options |= OPTION_MPTCP;
|
|
+ opts->mptcp_options |= OPTION_MP_FAIL;
|
|
+ *size += MPTCP_SUB_LEN_FAIL;
|
|
+ }
|
|
+
|
|
+ if (unlikely(mpcb->addr_signal) && mpcb->pm_ops->addr_signal &&
|
|
+ mpcb->mptcp_ver < MPTCP_VERSION_1)
|
|
+ mpcb->pm_ops->addr_signal(sk, size, opts, skb);
|
|
+
|
|
+ if (unlikely(tp->mptcp->send_mp_prio) &&
|
|
+ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_PRIO_ALIGN) {
|
|
+ opts->options |= OPTION_MPTCP;
|
|
+ opts->mptcp_options |= OPTION_MP_PRIO;
|
|
+ if (skb)
|
|
+ tp->mptcp->send_mp_prio = 0;
|
|
+ *size += MPTCP_SUB_LEN_PRIO_ALIGN;
|
|
+ }
|
|
+
|
|
+ return;
|
|
+}
|
|
+
|
|
+u16 mptcp_select_window(struct sock *sk)
|
|
+{
|
|
+ u16 new_win = tcp_select_window(sk);
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct tcp_sock *meta_tp = mptcp_meta_tp(tp);
|
|
+
|
|
+ meta_tp->rcv_wnd = tp->rcv_wnd;
|
|
+ meta_tp->rcv_wup = meta_tp->rcv_nxt;
|
|
+ /* no need to use tcp_update_rcv_right_edge, because at the meta level
|
|
+ * the right edge cannot go back
|
|
+ */
|
|
+ meta_tp->rcv_right_edge = meta_tp->rcv_wnd + meta_tp->rcv_wup;
|
|
+
|
|
+ return new_win;
|
|
+}
|
|
+
|
|
+void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
|
|
+ const struct tcp_out_options *opts,
|
|
+ struct sk_buff *skb)
|
|
+{
|
|
+ if (unlikely(OPTION_MP_CAPABLE & opts->mptcp_options)) {
|
|
+ struct mp_capable *mpc = (struct mp_capable *)ptr;
|
|
+
|
|
+ mpc->kind = TCPOPT_MPTCP;
|
|
+
|
|
+ if (OPTION_TYPE_SYN & opts->mptcp_options) {
|
|
+ mpc->ver = opts->mptcp_ver;
|
|
+
|
|
+ if (mpc->ver >= MPTCP_VERSION_1) {
|
|
+ mpc->len = MPTCPV1_SUB_LEN_CAPABLE_SYN;
|
|
+ ptr += MPTCPV1_SUB_LEN_CAPABLE_SYN_ALIGN >> 2;
|
|
+ } else {
|
|
+ mpc->sender_key = opts->mp_capable.sender_key;
|
|
+ mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN;
|
|
+ ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2;
|
|
+ }
|
|
+ } else if (OPTION_TYPE_SYNACK & opts->mptcp_options) {
|
|
+ mpc->ver = opts->mptcp_ver;
|
|
+
|
|
+ if (mpc->ver >= MPTCP_VERSION_1) {
|
|
+ mpc->len = MPTCPV1_SUB_LEN_CAPABLE_SYNACK;
|
|
+ ptr += MPTCPV1_SUB_LEN_CAPABLE_SYNACK_ALIGN >> 2;
|
|
+ } else {
|
|
+ mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN;
|
|
+ ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2;
|
|
+ }
|
|
+
|
|
+ mpc->sender_key = opts->mp_capable.sender_key;
|
|
+ } else if (OPTION_TYPE_ACK & opts->mptcp_options) {
|
|
+ mpc->len = MPTCP_SUB_LEN_CAPABLE_ACK;
|
|
+ mpc->ver = opts->mptcp_ver;
|
|
+ ptr += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN >> 2;
|
|
+
|
|
+ mpc->sender_key = opts->mp_capable.sender_key;
|
|
+ mpc->receiver_key = opts->mp_capable.receiver_key;
|
|
+ }
|
|
+
|
|
+ mpc->sub = MPTCP_SUB_CAPABLE;
|
|
+ mpc->a = opts->dss_csum;
|
|
+ mpc->b = 0;
|
|
+ mpc->rsv = 0;
|
|
+ mpc->h = 1;
|
|
+ }
|
|
+ if (unlikely(OPTION_MP_JOIN & opts->mptcp_options)) {
|
|
+ struct mp_join *mpj = (struct mp_join *)ptr;
|
|
+
|
|
+ mpj->kind = TCPOPT_MPTCP;
|
|
+ mpj->sub = MPTCP_SUB_JOIN;
|
|
+ mpj->rsv = 0;
|
|
+
|
|
+ if (OPTION_TYPE_SYN & opts->mptcp_options) {
|
|
+ mpj->len = MPTCP_SUB_LEN_JOIN_SYN;
|
|
+ mpj->u.syn.token = opts->mp_join_syns.token;
|
|
+ mpj->u.syn.nonce = opts->mp_join_syns.sender_nonce;
|
|
+ mpj->b = opts->mp_join_syns.low_prio;
|
|
+ mpj->addr_id = opts->addr_id;
|
|
+ ptr += MPTCP_SUB_LEN_JOIN_SYN_ALIGN >> 2;
|
|
+ } else if (OPTION_TYPE_SYNACK & opts->mptcp_options) {
|
|
+ mpj->len = MPTCP_SUB_LEN_JOIN_SYNACK;
|
|
+ mpj->u.synack.mac =
|
|
+ opts->mp_join_syns.sender_truncated_mac;
|
|
+ mpj->u.synack.nonce = opts->mp_join_syns.sender_nonce;
|
|
+ mpj->b = opts->mp_join_syns.low_prio;
|
|
+ mpj->addr_id = opts->addr_id;
|
|
+ ptr += MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN >> 2;
|
|
+ } else if (OPTION_TYPE_ACK & opts->mptcp_options) {
|
|
+ mpj->len = MPTCP_SUB_LEN_JOIN_ACK;
|
|
+ mpj->addr_id = 0; /* addr_id is rsv (RFC 6824, p. 21) */
|
|
+ memcpy(mpj->u.ack.mac, &tp->mptcp->sender_mac[0], 20);
|
|
+ ptr += MPTCP_SUB_LEN_JOIN_ACK_ALIGN >> 2;
|
|
+ }
|
|
+ }
|
|
+ if (unlikely(OPTION_ADD_ADDR & opts->mptcp_options)) {
|
|
+ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
|
|
+ struct mptcp_cb *mpcb = tp->mpcb;
|
|
+
|
|
+ mpadd->kind = TCPOPT_MPTCP;
|
|
+ if (opts->add_addr_v4) {
|
|
+ mpadd->addr_id = opts->add_addr4.addr_id;
|
|
+ mpadd->u.v4.addr = opts->add_addr4.addr;
|
|
+ if (mpcb->mptcp_ver < MPTCP_VERSION_1) {
|
|
+ mpadd->u_bit.v0.sub = MPTCP_SUB_ADD_ADDR;
|
|
+ mpadd->u_bit.v0.ipver = 4;
|
|
+ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4;
|
|
+ ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN >> 2;
|
|
+ } else {
|
|
+ mpadd->u_bit.v1.sub = MPTCP_SUB_ADD_ADDR;
|
|
+ mpadd->u_bit.v1.rsv = 0;
|
|
+ mpadd->u_bit.v1.echo = 0;
|
|
+ memcpy((char *)mpadd->u.v4.mac - 2,
|
|
+ (char *)&opts->add_addr4.trunc_mac, 8);
|
|
+ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4_VER1;
|
|
+ ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN_VER1 >> 2;
|
|
+ }
|
|
+ } else if (opts->add_addr_v6) {
|
|
+ mpadd->addr_id = opts->add_addr6.addr_id;
|
|
+ memcpy(&mpadd->u.v6.addr, &opts->add_addr6.addr,
|
|
+ sizeof(mpadd->u.v6.addr));
|
|
+ if (mpcb->mptcp_ver < MPTCP_VERSION_1) {
|
|
+ mpadd->u_bit.v0.sub = MPTCP_SUB_ADD_ADDR;
|
|
+ mpadd->u_bit.v0.ipver = 6;
|
|
+ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6;
|
|
+ ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN >> 2;
|
|
+ } else {
|
|
+ mpadd->u_bit.v1.sub = MPTCP_SUB_ADD_ADDR;
|
|
+ mpadd->u_bit.v1.rsv = 0;
|
|
+ mpadd->u_bit.v1.echo = 0;
|
|
+ memcpy((char *)mpadd->u.v6.mac - 2,
|
|
+ (char *)&opts->add_addr6.trunc_mac, 8);
|
|
+ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6_VER1;
|
|
+ ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN_VER1 >> 2;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net((struct sock *)tp), MPTCP_MIB_ADDADDRTX);
|
|
+ }
|
|
+ if (unlikely(OPTION_REMOVE_ADDR & opts->mptcp_options)) {
|
|
+ struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
|
|
+ u8 *addrs_id;
|
|
+ int id, len, len_align;
|
|
+
|
|
+ len = mptcp_sub_len_remove_addr(opts->remove_addrs);
|
|
+ len_align = mptcp_sub_len_remove_addr_align(opts->remove_addrs);
|
|
+
|
|
+ mprem->kind = TCPOPT_MPTCP;
|
|
+ mprem->len = len;
|
|
+ mprem->sub = MPTCP_SUB_REMOVE_ADDR;
|
|
+ mprem->rsv = 0;
|
|
+ addrs_id = &mprem->addrs_id;
|
|
+
|
|
+ mptcp_for_each_bit_set(opts->remove_addrs, id)
|
|
+ *(addrs_id++) = id;
|
|
+
|
|
+ /* Fill the rest with NOP's */
|
|
+ if (len_align > len) {
|
|
+ int i;
|
|
+ for (i = 0; i < len_align - len; i++)
|
|
+ *(addrs_id++) = TCPOPT_NOP;
|
|
+ }
|
|
+
|
|
+ ptr += len_align >> 2;
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net((struct sock *)tp), MPTCP_MIB_REMADDRTX);
|
|
+ }
|
|
+ if (unlikely(OPTION_MP_FAIL & opts->mptcp_options)) {
|
|
+ struct mp_fail *mpfail = (struct mp_fail *)ptr;
|
|
+
|
|
+ mpfail->kind = TCPOPT_MPTCP;
|
|
+ mpfail->len = MPTCP_SUB_LEN_FAIL;
|
|
+ mpfail->sub = MPTCP_SUB_FAIL;
|
|
+ mpfail->rsv1 = 0;
|
|
+ mpfail->rsv2 = 0;
|
|
+ mpfail->data_seq = htonll(tp->mpcb->csum_cutoff_seq);
|
|
+
|
|
+ ptr += MPTCP_SUB_LEN_FAIL_ALIGN >> 2;
|
|
+ }
|
|
+ if (unlikely(OPTION_MP_FCLOSE & opts->mptcp_options)) {
|
|
+ struct mp_fclose *mpfclose = (struct mp_fclose *)ptr;
|
|
+
|
|
+ mpfclose->kind = TCPOPT_MPTCP;
|
|
+ mpfclose->len = MPTCP_SUB_LEN_FCLOSE;
|
|
+ mpfclose->sub = MPTCP_SUB_FCLOSE;
|
|
+ mpfclose->rsv1 = 0;
|
|
+ mpfclose->rsv2 = 0;
|
|
+ mpfclose->key = opts->mp_capable.receiver_key;
|
|
+
|
|
+ ptr += MPTCP_SUB_LEN_FCLOSE_ALIGN >> 2;
|
|
+ }
|
|
+
|
|
+ if (OPTION_DATA_ACK & opts->mptcp_options) {
|
|
+ if (!mptcp_is_data_seq(skb) && tp->mpcb->rem_key_set)
|
|
+ ptr += mptcp_write_dss_data_ack(tp, skb, ptr);
|
|
+ else if (mptcp_is_data_mpcapable(skb))
|
|
+ ptr += mptcp_write_mpcapable_data(tp, skb, ptr);
|
|
+ else
|
|
+ ptr += mptcp_write_dss_data_seq(tp, skb, ptr);
|
|
+ }
|
|
+ if (unlikely(OPTION_MP_PRIO & opts->mptcp_options)) {
|
|
+ struct mp_prio *mpprio = (struct mp_prio *)ptr;
|
|
+
|
|
+ mpprio->kind = TCPOPT_MPTCP;
|
|
+ mpprio->len = MPTCP_SUB_LEN_PRIO;
|
|
+ mpprio->sub = MPTCP_SUB_PRIO;
|
|
+ mpprio->rsv = 0;
|
|
+ mpprio->b = tp->mptcp->low_prio;
|
|
+ mpprio->addr_id = TCPOPT_NOP;
|
|
+
|
|
+ ptr += MPTCP_SUB_LEN_PRIO_ALIGN >> 2;
|
|
+ }
|
|
+}
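mptcp_options_write() above advances a 32-bit write pointer by the aligned option length >> 2 and, for REMOVE_ADDR, pads the odd-sized option with TCP NOPs up to the next 4-byte boundary. A hedged, standalone sketch of that padding arithmetic; the helper name and the 6-byte option length are made up for illustration.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define TCPOPT_NOP 1

/* Illustrative only: pad an option of 'len' bytes to a 4-byte boundary
 * with NOPs and return how many 32-bit words the write pointer advances,
 * mirroring the "ptr += len_align >> 2" pattern above.
 */
static unsigned int write_padded_option(uint8_t *buf, unsigned int len)
{
    unsigned int len_align = (len + 3) & ~3U;
    unsigned int i;

    for (i = len; i < len_align; i++)
        buf[i] = TCPOPT_NOP;

    return len_align >> 2; /* number of u32 words consumed */
}

int main(void)
{
    uint8_t buf[16];

    memset(buf, 0, sizeof(buf));
    /* A 6-byte REMOVE_ADDR-style option is padded to 8 bytes,
     * so the pointer advances by 2 words.
     */
    printf("words advanced: %u\n", write_padded_option(buf, 6));
    return 0;
}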
|
|
+
|
|
+/* Sends the datafin */
|
|
+void mptcp_send_fin(struct sock *meta_sk)
|
|
+{
|
|
+ struct sk_buff *skb, *tskb = tcp_write_queue_tail(meta_sk);
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ int mss_now;
|
|
+
|
|
+ if ((1 << meta_sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
|
|
+ meta_tp->mpcb->passive_close = 1;
|
|
+
|
|
+ /* Optimization, tack on the FIN if we have a queue of
|
|
+ * unsent frames. But be careful about outgoing SACKS
|
|
+ * and IP options.
|
|
+ */
|
|
+ mss_now = mptcp_current_mss(meta_sk);
|
|
+
|
|
+ if (tskb) {
|
|
+ TCP_SKB_CB(tskb)->mptcp_flags |= MPTCPHDR_FIN;
|
|
+ TCP_SKB_CB(tskb)->end_seq++;
|
|
+ meta_tp->write_seq++;
|
|
+ } else {
|
|
+ /* Socket is locked, keep trying until memory is available. */
|
|
+ for (;;) {
|
|
+ skb = alloc_skb_fclone(MAX_TCP_HEADER,
|
|
+ meta_sk->sk_allocation);
|
|
+ if (skb)
|
|
+ break;
|
|
+ yield();
|
|
+ }
|
|
+ /* Reserve space for headers and prepare control bits. */
|
|
+ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
|
|
+ skb_reserve(skb, MAX_TCP_HEADER);
|
|
+
|
|
+ tcp_init_nondata_skb(skb, meta_tp->write_seq, TCPHDR_ACK);
|
|
+ TCP_SKB_CB(skb)->end_seq++;
|
|
+ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN;
|
|
+ tcp_queue_skb(meta_sk, skb);
|
|
+ }
|
|
+ __tcp_push_pending_frames(meta_sk, mss_now, TCP_NAGLE_OFF);
|
|
+}
|
|
+
|
|
+void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority)
|
|
+{
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ struct mptcp_cb *mpcb = meta_tp->mpcb;
|
|
+ struct sock *sk;
|
|
+
|
|
+ if (hlist_empty(&mpcb->conn_list))
|
|
+ return;
|
|
+
|
|
+ WARN_ON(meta_tp->send_mp_fclose);
|
|
+
|
|
+ /* First - select a socket */
|
|
+ sk = mptcp_select_ack_sock(meta_sk);
|
|
+
|
|
+ /* May happen if no subflow is in an appropriate state, OR
|
|
+ * we are in infinite mode or about to go there - just send a reset
|
|
+ */
|
|
+ if (!sk || mptcp_in_infinite_mapping_weak(mpcb)) {
|
|
+ /* tcp_done must be handled with bh disabled */
|
|
+ if (!in_serving_softirq())
|
|
+ local_bh_disable();
|
|
+
|
|
+ mptcp_sub_force_close_all(mpcb, NULL);
|
|
+
|
|
+ if (!in_serving_softirq())
|
|
+ local_bh_enable();
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ tcp_mstamp_refresh(meta_tp);
|
|
+
|
|
+ tcp_sk(sk)->send_mp_fclose = 1;
|
|
+ /** Reset all other subflows */
|
|
+
|
|
+ /* tcp_done must be handled with bh disabled */
|
|
+ if (!in_serving_softirq())
|
|
+ local_bh_disable();
|
|
+
|
|
+ mptcp_sub_force_close_all(mpcb, sk);
|
|
+
|
|
+ tcp_set_state(sk, TCP_RST_WAIT);
|
|
+
|
|
+ if (!in_serving_softirq())
|
|
+ local_bh_enable();
|
|
+
|
|
+ tcp_send_ack(sk);
|
|
+ tcp_clear_xmit_timers(sk);
|
|
+ inet_csk_reset_keepalive_timer(sk, inet_csk(sk)->icsk_rto);
|
|
+
|
|
+ meta_tp->send_mp_fclose = 1;
|
|
+ inet_csk(sk)->icsk_retransmits = 0;
|
|
+
|
|
+ /* Prevent exp backoff reverting on ICMP dest unreachable */
|
|
+ inet_csk(sk)->icsk_backoff = 0;
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_FASTCLOSETX);
|
|
+}
|
|
+
|
|
+static void mptcp_ack_retransmit_timer(struct sock *sk)
|
|
+{
|
|
+ struct inet_connection_sock *icsk = inet_csk(sk);
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct net *net = sock_net(sk);
|
|
+ struct sk_buff *skb;
|
|
+
|
|
+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
|
|
+ goto out; /* Routing failure or similar */
|
|
+
|
|
+ tcp_mstamp_refresh(tp);
|
|
+
|
|
+ if (tcp_write_timeout(sk)) {
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKRTO);
|
|
+ tp->mptcp->pre_established = 0;
|
|
+ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
|
|
+ tp->ops->send_active_reset(sk, GFP_ATOMIC);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
|
|
+ if (skb == NULL) {
|
|
+ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
|
|
+ jiffies + icsk->icsk_rto);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ /* Reserve space for headers and prepare control bits */
|
|
+ skb_reserve(skb, MAX_TCP_HEADER);
|
|
+ tcp_init_nondata_skb(skb, tp->snd_una, TCPHDR_ACK);
|
|
+
|
|
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKRXMIT);
|
|
+
|
|
+ if (tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC) > 0) {
|
|
+ /* Retransmission failed because of local congestion,
|
|
+ * do not backoff.
|
|
+ */
|
|
+ if (!icsk->icsk_retransmits)
|
|
+ icsk->icsk_retransmits = 1;
|
|
+ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
|
|
+ jiffies + icsk->icsk_rto);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (!tp->retrans_stamp)
|
|
+ tp->retrans_stamp = tcp_time_stamp(tp) ? : 1;
|
|
+
|
|
+ icsk->icsk_retransmits++;
|
|
+ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
|
|
+ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
|
|
+ jiffies + icsk->icsk_rto);
|
|
+ if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0))
|
|
+ __sk_dst_reset(sk);
|
|
+
|
|
+out:;
|
|
+}
|
|
+
|
|
+void mptcp_ack_handler(struct timer_list *t)
|
|
+{
|
|
+ struct mptcp_tcp_sock *mptcp = from_timer(mptcp, t, mptcp_ack_timer);
|
|
+ struct sock *sk = (struct sock *)mptcp->tp;
|
|
+ struct sock *meta_sk = mptcp_meta_sk(sk);
|
|
+
|
|
+ bh_lock_sock(meta_sk);
|
|
+ if (sock_owned_by_user(meta_sk)) {
|
|
+ /* Try again later */
|
|
+ sk_reset_timer(sk, &tcp_sk(sk)->mptcp->mptcp_ack_timer,
|
|
+ jiffies + (HZ / 20));
|
|
+ goto out_unlock;
|
|
+ }
|
|
+
|
|
+ if (sk->sk_state == TCP_CLOSE)
|
|
+ goto out_unlock;
|
|
+ if (!tcp_sk(sk)->mptcp->pre_established)
|
|
+ goto out_unlock;
|
|
+
|
|
+ mptcp_ack_retransmit_timer(sk);
|
|
+
|
|
+ sk_mem_reclaim(sk);
|
|
+
|
|
+out_unlock:
|
|
+ bh_unlock_sock(meta_sk);
|
|
+ sock_put(sk);
|
|
+}
|
|
+
|
|
+/* Similar to tcp_retransmit_skb
|
|
+ *
|
|
+ * The diff is that we handle the retransmission-stats (retrans_stamp) at the
|
|
+ * meta-level.
|
|
+ */
|
|
+int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb)
|
|
+{
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ struct sock *subsk;
|
|
+ unsigned int limit, mss_now;
|
|
+ int err = -1;
|
|
+
|
|
+ WARN_ON(TCP_SKB_CB(skb)->sacked);
|
|
+
|
|
+ /* Do not send more than we queued. 1/4 is reserved for possible
|
|
+ * copying overhead: fragmentation, tunneling, mangling etc.
|
|
+ *
|
|
+ * This is a meta-retransmission thus we check on the meta-socket.
|
|
+ */
|
|
+ if (refcount_read(&meta_sk->sk_wmem_alloc) >
|
|
+ min(meta_sk->sk_wmem_queued + (meta_sk->sk_wmem_queued >> 2), meta_sk->sk_sndbuf)) {
|
|
+ return -EAGAIN;
|
|
+ }
|
|
+
|
|
+ /* We need to make sure that the retransmitted segment can be sent on a
|
|
+ * subflow right now. If it is too big, it needs to be fragmented.
|
|
+ */
|
|
+ subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, false);
|
|
+ if (!subsk) {
|
|
+ /* We want to increase icsk_retransmits, thus return 0, so that
|
|
+ * mptcp_meta_retransmit_timer enters the desired branch.
|
|
+ */
|
|
+ err = 0;
|
|
+ goto failed;
|
|
+ }
|
|
+ mss_now = tcp_current_mss(subsk);
|
|
+
|
|
+ /* If the segment was cloned (e.g. a meta retransmission), the header
|
|
+ * must be expanded/copied so that there is no corruption of TSO
|
|
+ * information.
|
|
+ */
|
|
+ if (skb_unclone(skb, GFP_ATOMIC)) {
|
|
+ err = -ENOMEM;
|
|
+ goto failed;
|
|
+ }
|
|
+
|
|
+ /* Must have been set by mptcp_write_xmit before */
|
|
+ BUG_ON(!tcp_skb_pcount(skb));
|
|
+
|
|
+ limit = mss_now;
|
|
+ /* skb->len > mss_now is the equivalent of tso_segs > 1 in
|
|
+ * tcp_write_xmit. Otherwise split-point would return 0.
|
|
+ */
|
|
+ if (skb->len > mss_now && !tcp_urg_mode(meta_tp))
|
|
+ limit = tcp_mss_split_point(meta_sk, skb, mss_now,
|
|
+ UINT_MAX / mss_now,
|
|
+ TCP_NAGLE_OFF);
|
|
+
|
|
+ limit = min(limit, tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq);
|
|
+
|
|
+ if (skb->len > limit &&
|
|
+ unlikely(mptcp_fragment(meta_sk, TCP_FRAG_IN_RTX_QUEUE, skb,
|
|
+ limit, GFP_ATOMIC, 0)))
|
|
+ goto failed;
|
|
+
|
|
+ if (!mptcp_skb_entail(subsk, skb, -1))
|
|
+ goto failed;
|
|
+
|
|
+ /* Update global TCP statistics. */
|
|
+ MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_RETRANSSEGS);
|
|
+
|
|
+ /* Diff to tcp_retransmit_skb */
|
|
+
|
|
+ /* Save stamp of the first retransmit. */
|
|
+ if (!meta_tp->retrans_stamp) {
|
|
+ tcp_mstamp_refresh(meta_tp);
|
|
+ meta_tp->retrans_stamp = tcp_time_stamp(meta_tp);
|
|
+ }
|
|
+
|
|
+ __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH);
|
|
+ tcp_update_skb_after_send(meta_sk, skb, meta_tp->tcp_wstamp_ns);
|
|
+ meta_tp->lsndtime = tcp_jiffies32;
|
|
+
|
|
+ return 0;
|
|
+
|
|
+failed:
|
|
+ NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPRETRANSFAIL);
|
|
+ return err;
|
|
+}
|
|
+
|
|
+/* Similar to tcp_retransmit_timer
|
|
+ *
|
|
+ * The diff is that we have to handle retransmissions of the FAST_CLOSE-message
|
|
+ * and that we don't have an srtt estimation at the meta-level.
|
|
+ */
|
|
+void mptcp_meta_retransmit_timer(struct sock *meta_sk)
|
|
+{
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ struct mptcp_cb *mpcb = meta_tp->mpcb;
|
|
+ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
|
|
+ int err;
|
|
+
|
|
+ /* In fallback, retransmission is handled at the subflow-level */
|
|
+ if (!meta_tp->packets_out || mpcb->infinite_mapping_snd)
|
|
+ return;
|
|
+
|
|
+ WARN_ON(tcp_rtx_queue_empty(meta_sk));
|
|
+
|
|
+ if (!meta_tp->snd_wnd && !sock_flag(meta_sk, SOCK_DEAD) &&
|
|
+ !((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
|
|
+ /* Receiver dastardly shrinks window. Our retransmits
|
|
+ * become zero probes, but we should not timeout this
|
|
+ * connection. If the socket is an orphan, time it out,
|
|
+ * we cannot allow such beasts to hang infinitely.
|
|
+ */
|
|
+ struct inet_sock *meta_inet = inet_sk(meta_sk);
|
|
+ if (meta_sk->sk_family == AF_INET) {
|
|
+ net_dbg_ratelimited("MPTCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
|
|
+ &meta_inet->inet_daddr,
|
|
+ ntohs(meta_inet->inet_dport),
|
|
+ meta_inet->inet_num, meta_tp->snd_una,
|
|
+ meta_tp->snd_nxt);
|
|
+ }
|
|
+#if IS_ENABLED(CONFIG_IPV6)
|
|
+ else if (meta_sk->sk_family == AF_INET6) {
|
|
+ net_dbg_ratelimited("MPTCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
|
|
+ &meta_sk->sk_v6_daddr,
|
|
+ ntohs(meta_inet->inet_dport),
|
|
+ meta_inet->inet_num, meta_tp->snd_una,
|
|
+ meta_tp->snd_nxt);
|
|
+ }
|
|
+#endif
|
|
+ if (tcp_jiffies32 - meta_tp->rcv_tstamp > TCP_RTO_MAX) {
|
|
+ tcp_write_err(meta_sk);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ mptcp_retransmit_skb(meta_sk, tcp_rtx_queue_head(meta_sk));
|
|
+ goto out_reset_timer;
|
|
+ }
|
|
+
|
|
+ if (tcp_write_timeout(meta_sk))
|
|
+ return;
|
|
+
|
|
+ if (meta_icsk->icsk_retransmits == 0)
|
|
+ NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPTIMEOUTS);
|
|
+
|
|
+ meta_icsk->icsk_ca_state = TCP_CA_Loss;
|
|
+
|
|
+ err = mptcp_retransmit_skb(meta_sk, tcp_rtx_queue_head(meta_sk));
|
|
+ if (err > 0) {
|
|
+ /* Retransmission failed because of local congestion,
|
|
+ * do not backoff.
|
|
+ */
|
|
+ if (!meta_icsk->icsk_retransmits)
|
|
+ meta_icsk->icsk_retransmits = 1;
|
|
+ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
|
|
+ min(meta_icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
|
|
+ TCP_RTO_MAX);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ /* Increase the timeout each time we retransmit. Note that
|
|
+ * we do not increase the rtt estimate. rto is initialized
|
|
+ * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
|
|
+ * that doubling rto each time is the least we can get away with.
|
|
+ * In KA9Q, Karn uses this for the first few times, and then
|
|
+ * goes to quadratic. netBSD doubles, but only goes up to *64,
|
|
+ * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
|
|
+ * defined in the protocol as the maximum possible RTT. I guess
|
|
+ * we'll have to use something other than TCP to talk to the
|
|
+ * University of Mars.
|
|
+ *
|
|
+ * PAWS allows us longer timeouts and large windows, so once
|
|
+ * implemented ftp to mars will work nicely. We will have to fix
|
|
+ * the 120 second clamps though!
|
|
+ */
|
|
+ meta_icsk->icsk_backoff++;
|
|
+ meta_icsk->icsk_retransmits++;
|
|
+
|
|
+out_reset_timer:
|
|
+ /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
|
|
+ * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
|
|
+ * might be increased if the stream oscillates between thin and thick,
|
|
+ * thus the old value might already be too high compared to the value
|
|
+ * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
|
|
+ * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
|
|
+ * exponential backoff behaviour to avoid continuing to hammer
|
|
+ * linear-timeout retransmissions into a black hole
|
|
+ */
|
|
+ if (meta_sk->sk_state == TCP_ESTABLISHED &&
|
|
+ (meta_tp->thin_lto || sock_net(meta_sk)->ipv4.sysctl_tcp_thin_linear_timeouts) &&
|
|
+ tcp_stream_is_thin(meta_tp) &&
|
|
+ meta_icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
|
|
+ meta_icsk->icsk_backoff = 0;
|
|
+ /* We cannot do the same as in tcp_write_timer because the
|
|
+ * srtt is not set here.
|
|
+ */
|
|
+ mptcp_set_rto(meta_sk);
|
|
+ } else {
|
|
+ /* Use normal (exponential) backoff */
|
|
+ meta_icsk->icsk_rto = min(meta_icsk->icsk_rto << 1, TCP_RTO_MAX);
|
|
+ }
|
|
+ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, meta_icsk->icsk_rto, TCP_RTO_MAX);
|
|
+
|
|
+ return;
|
|
+}
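As the comment before out_reset_timer notes, every further meta-level retransmission doubles the RTO and clamps it at TCP_RTO_MAX unless the thin-stream path keeps linear timeouts. A standalone sketch of that backoff; the millisecond constant and the 200 ms starting value are assumptions, the kernel works in jiffies.

#include <stdio.h>

/* Assumed value for illustration; the kernel defines TCP_RTO_MAX in jiffies. */
#define TCP_RTO_MAX_MS 120000

/* Exponential backoff as used in the else-branch of out_reset_timer. */
static unsigned int next_rto(unsigned int rto_ms)
{
    unsigned int doubled = rto_ms << 1;

    return doubled < TCP_RTO_MAX_MS ? doubled : TCP_RTO_MAX_MS;
}

int main(void)
{
    unsigned int rto = 200; /* e.g. an initial 200 ms RTO */
    int i;

    for (i = 0; i < 12; i++) {
        printf("retransmit %2d: rto = %u ms\n", i, rto);
        rto = next_rto(rto);
    }
    return 0;
}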
|
|
+
|
|
+void mptcp_sub_retransmit_timer(struct sock *sk)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+
|
|
+ tcp_retransmit_timer(sk);
|
|
+
|
|
+ if (!tp->fastopen_rsk) {
|
|
+ mptcp_reinject_data(sk, 1);
|
|
+ mptcp_set_rto(sk);
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Modify values to an mptcp-level for the initial window of new subflows */
|
|
+void mptcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
|
|
+ __u32 *rcv_wnd, __u32 *window_clamp,
|
|
+ int wscale_ok, __u8 *rcv_wscale,
|
|
+ __u32 init_rcv_wnd)
|
|
+{
|
|
+ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
|
|
+
|
|
+ *window_clamp = mpcb->orig_window_clamp;
|
|
+ __space = tcp_win_from_space(sk, mpcb->orig_sk_rcvbuf);
|
|
+
|
|
+ tcp_select_initial_window(sk, __space, mss, rcv_wnd, window_clamp,
|
|
+ wscale_ok, rcv_wscale, init_rcv_wnd);
|
|
+}
|
|
+
|
|
+static inline u64 mptcp_calc_rate(const struct sock *meta_sk, unsigned int mss)
|
|
+{
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ u64 rate = 0;
|
|
+
|
|
+ mptcp_for_each_sub(tcp_sk(meta_sk)->mpcb, mptcp) {
|
|
+ struct sock *sk = mptcp_to_sock(mptcp);
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ int this_mss;
|
|
+ u64 this_rate;
|
|
+
|
|
+ if (!mptcp_sk_can_send(sk))
|
|
+ continue;
|
|
+
|
|
+ /* Do not consider subflows without a RTT estimation yet
|
|
+ * otherwise this_rate >>> rate.
|
|
+ */
|
|
+ if (unlikely(!tp->srtt_us))
|
|
+ continue;
|
|
+
|
|
+ this_mss = tcp_current_mss(sk);
|
|
+
|
|
+ /* If this_mss is smaller than mss, it means that a segment will
|
|
+ * be split in two (or more) when pushed on this subflow. If
|
|
+ * you consider that mss = 1428 and this_mss = 1420 then two
|
|
+ * segments will be generated: a 1420-byte and 8-byte segment.
|
|
+ * The latter will introduce a large overhead as for a single
|
|
+ * data segment 2 slots will be used in the congestion window.
|
|
+ * This roughly halves the potential throughput of this
|
|
+ * subflow. Indeed, only 1428 bytes will be sent while 2840 could
|
|
+ * have been sent if mss == 1420, reducing the throughput by 2840 / 1428.
|
|
+ *
|
|
+ * The following algorithm take into account this overhead
|
|
+ * when computing the potential throughput that MPTCP can
|
|
+ * achieve when generating mss-byte segments.
|
|
+ *
|
|
+ * The formulae is the following:
|
|
+ * \sum_{\forall sub} ratio * \frac{mss * cwnd_sub}{rtt_sub}
|
|
+ * Where ratio is computed as follows:
|
|
+ * \frac{mss}{\ceil{mss / mss_sub} * mss_sub}
|
|
+ *
|
|
+ * ratio gives the reduction factor of the theoretical
|
|
+ * throughput a subflow can achieve if MPTCP uses a specific
|
|
+ * MSS value.
|
|
+ */
|
|
+ this_rate = div64_u64((u64)mss * mss * (USEC_PER_SEC << 3) *
|
|
+ max(tp->snd_cwnd, tp->packets_out),
|
|
+ (u64)tp->srtt_us *
|
|
+ DIV_ROUND_UP(mss, this_mss) * this_mss);
|
|
+ rate += this_rate;
|
|
+ }
|
|
+
|
|
+ return rate;
|
|
+}
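The comment inside mptcp_calc_rate() states the estimate rate = sum over subflows of ratio * mss * cwnd / rtt, with ratio = mss / (ceil(mss / mss_sub) * mss_sub). A hedged userspace sketch of that computation for the 1428-vs-1420 example given above; cwnd and RTT values are invented, and floating point is used instead of the kernel's 64-bit integer arithmetic.

#include <stdio.h>

/* Per-subflow estimate of the goodput achievable when the meta level
 * generates 'mss'-byte segments, as described in the comment above.
 */
static double subflow_rate(unsigned int mss, unsigned int sub_mss,
                           unsigned int cwnd, double rtt_s)
{
    unsigned int segs_per_chunk = (mss + sub_mss - 1) / sub_mss; /* ceil */
    double ratio = (double)mss / (double)(segs_per_chunk * sub_mss);

    return ratio * mss * cwnd / rtt_s;
}

int main(void)
{
    /* mss = 1428 on a subflow with mss_sub = 1420: every chunk needs two
     * subflow segments, so only 1428 of the 2840 bytes of "slots" carry
     * useful data and the estimate roughly halves.
     */
    printf("mss 1428: %.0f bytes/s\n", subflow_rate(1428, 1420, 10, 0.05));
    printf("mss 1420: %.0f bytes/s\n", subflow_rate(1420, 1420, 10, 0.05));
    return 0;
}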
|
|
+
|
|
+static unsigned int __mptcp_current_mss(const struct sock *meta_sk)
|
|
+{
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ unsigned int mss = 0;
|
|
+ u64 rate = 0;
|
|
+
|
|
+ mptcp_for_each_sub(tcp_sk(meta_sk)->mpcb, mptcp) {
|
|
+ struct sock *sk = mptcp_to_sock(mptcp);
|
|
+ int this_mss;
|
|
+ u64 this_rate;
|
|
+
|
|
+ if (!mptcp_sk_can_send(sk))
|
|
+ continue;
|
|
+
|
|
+ this_mss = tcp_current_mss(sk);
|
|
+
|
|
+ /* Same mss values will produce the same throughput. */
|
|
+ if (this_mss == mss)
|
|
+ continue;
|
|
+
|
|
+ /* See whether using this mss value can theoretically improve
|
|
+ * the performances.
|
|
+ */
|
|
+ this_rate = mptcp_calc_rate(meta_sk, this_mss);
|
|
+ if (this_rate >= rate) {
|
|
+ mss = this_mss;
|
|
+ rate = this_rate;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return mss;
|
|
+}
|
|
+
|
|
+unsigned int mptcp_current_mss(struct sock *meta_sk)
|
|
+{
|
|
+ unsigned int mss = __mptcp_current_mss(meta_sk);
|
|
+
|
|
+ /* If no subflow is available, we take a default-mss from the
|
|
+ * meta-socket.
|
|
+ */
|
|
+ return !mss ? tcp_current_mss(meta_sk) : mss;
|
|
+}
|
|
+
|
|
+int mptcp_check_snd_buf(const struct tcp_sock *tp)
|
|
+{
|
|
+ const struct mptcp_tcp_sock *mptcp;
|
|
+ u32 rtt_max = tp->srtt_us;
|
|
+ u64 bw_est;
|
|
+
|
|
+ if (!tp->srtt_us)
|
|
+ return tp->reordering + 1;
|
|
+
|
|
+ mptcp_for_each_sub(tp->mpcb, mptcp) {
|
|
+ const struct sock *sk = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (!mptcp_sk_can_send(sk))
|
|
+ continue;
|
|
+
|
|
+ if (rtt_max < tcp_sk(sk)->srtt_us)
|
|
+ rtt_max = tcp_sk(sk)->srtt_us;
|
|
+ }
|
|
+
|
|
+ bw_est = div64_u64(((u64)tp->snd_cwnd * rtt_max) << 16,
|
|
+ (u64)tp->srtt_us);
|
|
+
|
|
+ return max_t(unsigned int, (u32)(bw_est >> 16),
|
|
+ tp->reordering + 1);
|
|
+}
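mptcp_check_snd_buf() scales the subflow's congestion window by rtt_max / srtt in Q16 fixed point, presumably so the send buffer can cover the slowest subflow's RTT. A small sketch of that fixed-point step; the cwnd and RTT inputs are assumptions.

#include <stdio.h>
#include <stdint.h>

/* Scale 'cwnd' by rtt_max / srtt in Q16 fixed point, mirroring the
 * bw_est computation in mptcp_check_snd_buf(). Inputs are made up.
 */
static uint32_t scaled_cwnd(uint32_t cwnd, uint64_t rtt_max_us, uint64_t srtt_us)
{
    uint64_t bw_est = (((uint64_t)cwnd * rtt_max_us) << 16) / srtt_us;

    return (uint32_t)(bw_est >> 16);
}

int main(void)
{
    /* A subflow with a 20 ms srtt and a cwnd of 40 packets, while the
     * slowest subflow has an 80 ms RTT: the result is ~160 packets.
     */
    printf("%u packets\n", scaled_cwnd(40, 80000, 20000));
    return 0;
}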
|
|
+
|
|
+unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now,
|
|
+ int large_allowed)
|
|
+{
|
|
+ u32 xmit_size_goal = 0;
|
|
+
|
|
+ if (large_allowed && !tcp_sk(meta_sk)->mpcb->dss_csum) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ mptcp_for_each_sub(tcp_sk(meta_sk)->mpcb, mptcp) {
|
|
+ struct sock *sk = mptcp_to_sock(mptcp);
|
|
+ int this_size_goal;
|
|
+
|
|
+ if (!mptcp_sk_can_send(sk))
|
|
+ continue;
|
|
+
|
|
+ this_size_goal = tcp_xmit_size_goal(sk, mss_now, 1);
|
|
+ if (this_size_goal > xmit_size_goal)
|
|
+ xmit_size_goal = this_size_goal;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return max(xmit_size_goal, mss_now);
|
|
+}
|
|
+
|
|
diff --git a/net/mptcp/mptcp_pm.c b/net/mptcp/mptcp_pm.c
|
|
new file mode 100644
|
|
index 000000000000..0e24e0aaa70a
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mptcp_pm.c
|
|
@@ -0,0 +1,226 @@
|
|
+/*
|
|
+ * MPTCP implementation - MPTCP-subflow-management
|
|
+ *
|
|
+ * Initial Design & Implementation:
|
|
+ * Sébastien Barré <sebastien.barre@uclouvain.be>
|
|
+ *
|
|
+ * Current Maintainer & Author:
|
|
+ * Christoph Paasch <christoph.paasch@uclouvain.be>
|
|
+ *
|
|
+ * Additional authors:
|
|
+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
|
|
+ * Gregory Detal <gregory.detal@uclouvain.be>
|
|
+ * Fabien Duchêne <fabien.duchene@uclouvain.be>
|
|
+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
|
|
+ * Lavkesh Lahngir <lavkesh51@gmail.com>
|
|
+ * Andreas Ripke <ripke@neclab.eu>
|
|
+ * Vlad Dogaru <vlad.dogaru@intel.com>
|
|
+ * Octavian Purdila <octavian.purdila@intel.com>
|
|
+ * John Ronan <jronan@tssg.org>
|
|
+ * Catalin Nicutar <catalin.nicutar@gmail.com>
|
|
+ * Brandon Heller <brandonh@stanford.edu>
|
|
+ *
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU General Public License
|
|
+ * as published by the Free Software Foundation; either version
|
|
+ * 2 of the License, or (at your option) any later version.
|
|
+ */
|
|
+
|
|
+
|
|
+#include <linux/module.h>
|
|
+#include <net/mptcp.h>
|
|
+
|
|
+static DEFINE_SPINLOCK(mptcp_pm_list_lock);
|
|
+static LIST_HEAD(mptcp_pm_list);
|
|
+
|
|
+static int mptcp_default_id(const struct sock *meta_sk, sa_family_t family,
|
|
+ union inet_addr *addr, bool *low_prio)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+struct mptcp_pm_ops mptcp_pm_default = {
|
|
+ .get_local_id = mptcp_default_id, /* We do not care */
|
|
+ .name = "default",
|
|
+ .owner = THIS_MODULE,
|
|
+};
|
|
+
|
|
+static struct mptcp_pm_ops *mptcp_pm_find(const char *name)
|
|
+{
|
|
+ struct mptcp_pm_ops *e;
|
|
+
|
|
+ list_for_each_entry_rcu(e, &mptcp_pm_list, list) {
|
|
+ if (strcmp(e->name, name) == 0)
|
|
+ return e;
|
|
+ }
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+int mptcp_register_path_manager(struct mptcp_pm_ops *pm)
|
|
+{
|
|
+ int ret = 0;
|
|
+
|
|
+ if (!pm->get_local_id)
|
|
+ return -EINVAL;
|
|
+
|
|
+ spin_lock(&mptcp_pm_list_lock);
|
|
+ if (mptcp_pm_find(pm->name)) {
|
|
+ pr_notice("%s already registered\n", pm->name);
|
|
+ ret = -EEXIST;
|
|
+ } else {
|
|
+ list_add_tail_rcu(&pm->list, &mptcp_pm_list);
|
|
+ pr_info("%s registered\n", pm->name);
|
|
+ }
|
|
+ spin_unlock(&mptcp_pm_list_lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(mptcp_register_path_manager);
|
|
+
|
|
+void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm)
|
|
+{
|
|
+ spin_lock(&mptcp_pm_list_lock);
|
|
+ list_del_rcu(&pm->list);
|
|
+ spin_unlock(&mptcp_pm_list_lock);
|
|
+
|
|
+ /* Wait for outstanding readers to complete before the
|
|
+ * module gets removed entirely.
|
|
+ *
|
|
+ * A try_module_get() should fail by now as our module is
|
|
+ * in "going" state since no refs are held anymore and
|
|
+ * module_exit() handler being called.
|
|
+ */
|
|
+ synchronize_rcu();
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(mptcp_unregister_path_manager);
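The path-manager framework above is a name-keyed ops registry: modules add an ops struct to a global list under a spinlock, and lookups walk the list comparing names. A minimal userspace sketch of the same pattern with hypothetical types and no locking, RCU or module refcounting.

#include <stdio.h>
#include <string.h>

/* Hypothetical, simplified stand-in for struct mptcp_pm_ops. */
struct pm_ops {
    const char *name;
    struct pm_ops *next;
};

static struct pm_ops *pm_list;

static struct pm_ops *pm_find(const char *name)
{
    struct pm_ops *e;

    for (e = pm_list; e; e = e->next)
        if (strcmp(e->name, name) == 0)
            return e;
    return NULL;
}

static int pm_register(struct pm_ops *pm)
{
    if (pm_find(pm->name))
        return -1;          /* already registered, like -EEXIST above */
    pm->next = pm_list;     /* the kernel appends at the tail instead */
    pm_list = pm;
    return 0;
}

int main(void)
{
    static struct pm_ops def = { .name = "default" };

    pm_register(&def);
    printf("found: %s\n", pm_find("default")->name);
    return 0;
}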
|
|
+
|
|
+void mptcp_get_default_path_manager(char *name)
|
|
+{
|
|
+ struct mptcp_pm_ops *pm;
|
|
+
|
|
+ BUG_ON(list_empty(&mptcp_pm_list));
|
|
+
|
|
+ rcu_read_lock();
|
|
+ pm = list_entry(mptcp_pm_list.next, struct mptcp_pm_ops, list);
|
|
+ strncpy(name, pm->name, MPTCP_PM_NAME_MAX);
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+
|
|
+int mptcp_set_default_path_manager(const char *name)
|
|
+{
|
|
+ struct mptcp_pm_ops *pm;
|
|
+ int ret = -ENOENT;
|
|
+
|
|
+ spin_lock(&mptcp_pm_list_lock);
|
|
+ pm = mptcp_pm_find(name);
|
|
+#ifdef CONFIG_MODULES
|
|
+ if (!pm && capable(CAP_NET_ADMIN)) {
|
|
+ spin_unlock(&mptcp_pm_list_lock);
|
|
+
|
|
+ request_module("mptcp_%s", name);
|
|
+ spin_lock(&mptcp_pm_list_lock);
|
|
+ pm = mptcp_pm_find(name);
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ if (pm) {
|
|
+ list_move(&pm->list, &mptcp_pm_list);
|
|
+ ret = 0;
|
|
+ } else {
|
|
+ pr_info("%s is not available\n", name);
|
|
+ }
|
|
+ spin_unlock(&mptcp_pm_list_lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static struct mptcp_pm_ops *__mptcp_pm_find_autoload(const char *name)
|
|
+{
|
|
+ struct mptcp_pm_ops *pm = mptcp_pm_find(name);
|
|
+#ifdef CONFIG_MODULES
|
|
+ if (!pm && capable(CAP_NET_ADMIN)) {
|
|
+ rcu_read_unlock();
|
|
+ request_module("mptcp_%s", name);
|
|
+ rcu_read_lock();
|
|
+ pm = mptcp_pm_find(name);
|
|
+ }
|
|
+#endif
|
|
+ return pm;
|
|
+}
|
|
+
|
|
+void mptcp_init_path_manager(struct mptcp_cb *mpcb)
|
|
+{
|
|
+ struct mptcp_pm_ops *pm;
|
|
+ struct sock *meta_sk = mpcb->meta_sk;
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+
|
|
+ rcu_read_lock();
|
|
+ /* if path manager was set using socket option */
|
|
+ if (meta_tp->mptcp_pm_setsockopt) {
|
|
+ pm = __mptcp_pm_find_autoload(meta_tp->mptcp_pm_name);
|
|
+ if (pm && try_module_get(pm->owner)) {
|
|
+ mpcb->pm_ops = pm;
|
|
+ goto out;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ list_for_each_entry_rcu(pm, &mptcp_pm_list, list) {
|
|
+ if (try_module_get(pm->owner)) {
|
|
+ mpcb->pm_ops = pm;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+out:
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+
|
|
+/* Change path manager for socket */
|
|
+int mptcp_set_path_manager(struct sock *sk, const char *name)
|
|
+{
|
|
+ struct mptcp_pm_ops *pm;
|
|
+ int err = 0;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ pm = __mptcp_pm_find_autoload(name);
|
|
+
|
|
+ if (!pm) {
|
|
+ err = -ENOENT;
|
|
+ } else if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
|
|
+ err = -EPERM;
|
|
+ } else {
|
|
+ strcpy(tcp_sk(sk)->mptcp_pm_name, name);
|
|
+ tcp_sk(sk)->mptcp_pm_setsockopt = 1;
|
|
+ }
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ return err;
|
|
+}
|
|
+
|
|
+/* Manage refcounts on socket close. */
|
|
+void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb)
|
|
+{
|
|
+ module_put(mpcb->pm_ops->owner);
|
|
+}
|
|
+
|
|
+/* Fallback to the default path-manager. */
|
|
+void mptcp_fallback_default(struct mptcp_cb *mpcb)
|
|
+{
|
|
+ struct mptcp_pm_ops *pm;
|
|
+
|
|
+ mptcp_cleanup_path_manager(mpcb);
|
|
+ pm = mptcp_pm_find("default");
|
|
+
|
|
+ /* Cannot fail - it's the default module */
|
|
+ try_module_get(pm->owner);
|
|
+ mpcb->pm_ops = pm;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(mptcp_fallback_default);
|
|
+
|
|
+/* Set default value from kernel configuration at bootup */
|
|
+static int __init mptcp_path_manager_default(void)
|
|
+{
|
|
+ return mptcp_set_default_path_manager(CONFIG_DEFAULT_MPTCP_PM);
|
|
+}
|
|
+late_initcall(mptcp_path_manager_default);
|
|
diff --git a/net/mptcp/mptcp_redundant.c b/net/mptcp/mptcp_redundant.c
|
|
new file mode 100644
|
|
index 000000000000..3db4e69acef2
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mptcp_redundant.c
|
|
@@ -0,0 +1,395 @@
|
|
+/*
|
|
+ * MPTCP Scheduler to reduce latency and jitter.
|
|
+ *
|
|
+ * This scheduler sends all packets redundantly on all available subflows.
|
|
+ *
|
|
+ * Initial Design & Implementation:
|
|
+ * Tobias Erbshaeusser <erbshauesser@dvs.tu-darmstadt.de>
|
|
+ * Alexander Froemmgen <froemmge@dvs.tu-darmstadt.de>
|
|
+ *
|
|
+ * Initial corrections & modifications:
|
|
+ * Christian Pinedo <christian.pinedo@ehu.eus>
|
|
+ * Igor Lopez <igor.lopez@ehu.eus>
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU General Public License
|
|
+ * as published by the Free Software Foundation; either version
|
|
+ * 2 of the License, or (at your option) any later version.
|
|
+ */
|
|
+
|
|
+#include <linux/module.h>
|
|
+#include <net/mptcp.h>
|
|
+
|
|
+/* Struct to store the data of a single subflow */
|
|
+struct redsched_priv {
|
|
+ /* The skb or NULL */
|
|
+ struct sk_buff *skb;
|
|
+ /* Start/end sequence number of the skb. This number should be checked
|
|
+ * to be valid before the skb field is used
|
|
+ */
|
|
+ u32 skb_start_seq;
|
|
+ u32 skb_end_seq;
|
|
+};
|
|
+
|
|
+/* Struct to store the data of the control block */
|
|
+struct redsched_cb {
|
|
+ /* The next subflow where a skb should be sent or NULL */
|
|
+ struct tcp_sock *next_subflow;
|
|
+};
|
|
+
|
|
+/* Returns the socket data from a given subflow socket */
|
|
+static struct redsched_priv *redsched_get_priv(struct tcp_sock *tp)
|
|
+{
|
|
+ return (struct redsched_priv *)&tp->mptcp->mptcp_sched[0];
|
|
+}
|
|
+
|
|
+/* Returns the control block data from a given meta socket */
|
|
+static struct redsched_cb *redsched_get_cb(struct tcp_sock *tp)
|
|
+{
|
|
+ return (struct redsched_cb *)&tp->mpcb->mptcp_sched[0];
|
|
+}
|
|
+
|
|
+static int redsched_get_active_valid_sks(struct sock *meta_sk)
|
|
+{
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ struct mptcp_cb *mpcb = meta_tp->mpcb;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ int active_valid_sks = 0;
|
|
+
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ struct sock *sk = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (subflow_is_active((struct tcp_sock *)sk) &&
|
|
+ !mptcp_is_def_unavailable(sk))
|
|
+ active_valid_sks++;
|
|
+ }
|
|
+
|
|
+ return active_valid_sks;
|
|
+}
|
|
+
|
|
+static bool redsched_use_subflow(struct sock *meta_sk,
|
|
+ int active_valid_sks,
|
|
+ struct tcp_sock *tp,
|
|
+ struct sk_buff *skb)
|
|
+{
|
|
+ if (!skb || !mptcp_is_available((struct sock *)tp, skb, false))
|
|
+ return false;
|
|
+
|
|
+ if (TCP_SKB_CB(skb)->path_mask != 0)
|
|
+ return subflow_is_active(tp);
|
|
+
|
|
+ if (TCP_SKB_CB(skb)->path_mask == 0) {
|
|
+ if (active_valid_sks == -1)
|
|
+ active_valid_sks = redsched_get_active_valid_sks(meta_sk);
|
|
+
|
|
+ if (subflow_is_backup(tp) && active_valid_sks > 0)
|
|
+ return false;
|
|
+ else
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+#define mptcp_entry_next_rcu(__mptcp) \
|
|
+ hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
|
|
+ &(__mptcp)->node)), struct mptcp_tcp_sock, node)
|
|
+
|
|
+static void redsched_update_next_subflow(struct tcp_sock *tp,
|
|
+ struct redsched_cb *red_cb)
|
|
+{
|
|
+ struct mptcp_tcp_sock *mptcp = mptcp_entry_next_rcu(tp->mptcp);
|
|
+
|
|
+ if (mptcp)
|
|
+ red_cb->next_subflow = mptcp->tp;
|
|
+ else
|
|
+ red_cb->next_subflow = NULL;
|
|
+}
|
|
+
|
|
+static struct sock *red_get_available_subflow(struct sock *meta_sk,
|
|
+ struct sk_buff *skb,
|
|
+ bool zero_wnd_test)
|
|
+{
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ struct mptcp_cb *mpcb = meta_tp->mpcb;
|
|
+ struct redsched_cb *red_cb = redsched_get_cb(meta_tp);
|
|
+ struct tcp_sock *first_tp = red_cb->next_subflow, *tp;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ int found = 0;
|
|
+
|
|
+ /* Answer data_fin on same subflow */
|
|
+ if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
|
|
+ skb && mptcp_is_data_fin(skb)) {
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ struct sock *sk = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (tcp_sk(sk)->mptcp->path_index ==
|
|
+ mpcb->dfin_path_index &&
|
|
+ mptcp_is_available(sk, skb, zero_wnd_test))
|
|
+ return sk;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (!first_tp && !hlist_empty(&mpcb->conn_list)) {
|
|
+ first_tp = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&mpcb->conn_list)),
|
|
+ struct mptcp_tcp_sock, node)->tp;
|
|
+ }
|
|
+ tp = first_tp;
|
|
+
|
|
+ /* still NULL (no subflow in conn_list?) */
|
|
+ if (!first_tp)
|
|
+ return NULL;
|
|
+
|
|
+ /* Search for a subflow to send it.
|
|
+ *
|
|
+ * We want to pick a subflow that is after 'first_tp' in the list of subflows.
|
|
+ * Thus, the first mptcp_for_each_sub()-loop tries to walk the list up
|
|
+ * to the subflow 'tp' and then checks whether any one of the remaining
|
|
+ * ones is eligible to send.
|
|
+ * The second mptcp_for_each-sub()-loop is then iterating from the
|
|
+ * beginning of the list up to 'first_tp'.
|
|
+ */
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ /* We go up to the subflow 'tp' and start from there */
|
|
+ if (tp == mptcp->tp)
|
|
+ found = 1;
|
|
+
|
|
+ if (!found)
|
|
+ continue;
|
|
+ tp = mptcp->tp;
|
|
+
|
|
+ if (mptcp_is_available((struct sock *)tp, skb,
|
|
+ zero_wnd_test)) {
|
|
+ redsched_update_next_subflow(tp, red_cb);
|
|
+ return (struct sock *)tp;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ tp = mptcp->tp;
|
|
+
|
|
+ if (tp == first_tp)
|
|
+ break;
|
|
+
|
|
+ if (mptcp_is_available((struct sock *)tp, skb,
|
|
+ zero_wnd_test)) {
|
|
+ redsched_update_next_subflow(tp, red_cb);
|
|
+ return (struct sock *)tp;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* No space */
|
|
+ return NULL;
|
|
+}
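The two loops in red_get_available_subflow() perform a round-robin walk: start at next_subflow, scan to the end of the subflow list, then wrap around from the head back to the starting point. A simplified sketch of that traversal over an array, purely illustrative and with no MPTCP state.

#include <stdio.h>
#include <stdbool.h>

/* Round-robin pick: try entries start..n-1, then 0..start-1, and return
 * the first usable one. Mirrors the two-loop pattern above.
 */
static int pick_round_robin(const bool usable[], int n, int start)
{
    int i;

    for (i = 0; i < n; i++) {
        int idx = (start + i) % n;

        if (usable[idx])
            return idx;
    }
    return -1; /* nothing available */
}

int main(void)
{
    bool usable[4] = { true, false, false, true };

    /* Starting at index 1, the walk visits 1, 2, 3 and picks 3. */
    printf("picked subflow %d\n", pick_round_robin(usable, 4, 1));
    return 0;
}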
|
|
+
|
|
+/* Corrects the stored skb pointers if they are invalid */
|
|
+static void redsched_correct_skb_pointers(struct sock *meta_sk,
|
|
+ struct redsched_priv *red_p)
|
|
+{
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+
|
|
+ if (red_p->skb &&
|
|
+ (!after(red_p->skb_start_seq, meta_tp->snd_una) ||
|
|
+ after(red_p->skb_end_seq, meta_tp->snd_nxt)))
|
|
+ red_p->skb = NULL;
|
|
+}
|
|
+
|
|
+/* Returns the next skb from the queue */
|
|
+static struct sk_buff *redsched_next_skb_from_queue(struct sk_buff_head *queue,
|
|
+ struct sk_buff *previous,
|
|
+ struct sock *meta_sk)
|
|
+{
|
|
+ struct sk_buff *skb;
|
|
+
|
|
+ if (!previous)
|
|
+ return tcp_rtx_queue_head(meta_sk) ? : skb_peek(queue);
|
|
+
|
|
+ /* sk_data->skb stores the last scheduled packet for this subflow.
|
|
+ * If sk_data->skb was scheduled but not sent (e.g., due to nagle),
|
|
+ * we have to schedule it again.
|
|
+ *
|
|
+ * For the redundant scheduler, there are two cases:
|
|
+ * 1. sk_data->skb was not sent on another subflow:
|
|
+ * we have to schedule it again to ensure that we do not
|
|
+ * skip this packet.
|
|
+ * 2. sk_data->skb was already sent on another subflow:
|
|
+ * with regard to the redundant semantic, we have to
|
|
+ * schedule it again. However, we keep it simple and ignore it,
|
|
+ * as it was already sent by another subflow.
|
|
+ * This might be changed in the future.
|
|
+ *
|
|
+ * For case 1, send_head is equal to previous, as only a single
|
|
+ * packet can be skipped.
|
|
+ */
|
|
+ if (tcp_send_head(meta_sk) == previous)
|
|
+ return tcp_send_head(meta_sk);
|
|
+
|
|
+ skb = skb_rb_next(previous);
|
|
+ if (skb)
|
|
+ return skb;
|
|
+
|
|
+ return tcp_send_head(meta_sk);
|
|
+}
|
|
+
|
|
+static struct sk_buff *mptcp_red_next_segment(struct sock *meta_sk,
|
|
+ int *reinject,
|
|
+ struct sock **subsk,
|
|
+ unsigned int *limit)
|
|
+{
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+ struct mptcp_cb *mpcb = meta_tp->mpcb;
|
|
+ struct redsched_cb *red_cb = redsched_get_cb(meta_tp);
|
|
+ struct tcp_sock *first_tp = red_cb->next_subflow, *tp;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ int active_valid_sks = -1;
|
|
+ struct sk_buff *skb;
|
|
+ int found = 0;
|
|
+
|
|
+ /* As we set it, we have to reset it as well. */
|
|
+ *limit = 0;
|
|
+
|
|
+ if (skb_queue_empty(&mpcb->reinject_queue) &&
|
|
+ skb_queue_empty(&meta_sk->sk_write_queue) &&
|
|
+ tcp_rtx_queue_empty(meta_sk))
|
|
+ /* Nothing to send */
|
|
+ return NULL;
|
|
+
|
|
+ /* First try reinjections */
|
|
+ skb = skb_peek(&mpcb->reinject_queue);
|
|
+ if (skb) {
|
|
+ *subsk = get_available_subflow(meta_sk, skb, false);
|
|
+ if (!*subsk)
|
|
+ return NULL;
|
|
+ *reinject = 1;
|
|
+ return skb;
|
|
+ }
|
|
+
|
|
+ /* Then try indistinctly redundant and normal skbs */
|
|
+
|
|
+ if (!first_tp && !hlist_empty(&mpcb->conn_list)) {
|
|
+ first_tp = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&mpcb->conn_list)),
|
|
+ struct mptcp_tcp_sock, node)->tp;
|
|
+ }
|
|
+
|
|
+ /* still NULL (no subflow in conn_list?) */
|
|
+ if (!first_tp)
|
|
+ return NULL;
|
|
+
|
|
+ tp = first_tp;
|
|
+
|
|
+ *reinject = 0;
|
|
+ active_valid_sks = redsched_get_active_valid_sks(meta_sk);
|
|
+
|
|
+ /* We want to pick a subflow that is after 'first_tp' in the list of subflows.
|
|
+ * Thus, the first mptcp_for_each_sub()-loop tries to walk the list up
|
|
+ * to the subflow 'tp' and then checks whether any one of the remaining
|
|
+ * ones can send a segment.
|
|
+ * The second mptcp_for_each-sub()-loop is then iterating from the
|
|
+ * beginning of the list up to 'first_tp'.
|
|
+ */
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ struct redsched_priv *red_p;
|
|
+
|
|
+ if (tp == mptcp->tp)
|
|
+ found = 1;
|
|
+
|
|
+ if (!found)
|
|
+ continue;
|
|
+
|
|
+ tp = mptcp->tp;
|
|
+
|
|
+ /* Correct the skb pointers of the current subflow */
|
|
+ red_p = redsched_get_priv(tp);
|
|
+ redsched_correct_skb_pointers(meta_sk, red_p);
|
|
+
|
|
+ skb = redsched_next_skb_from_queue(&meta_sk->sk_write_queue,
|
|
+ red_p->skb, meta_sk);
|
|
+ if (skb && redsched_use_subflow(meta_sk, active_valid_sks, tp,
|
|
+ skb)) {
|
|
+ red_p->skb = skb;
|
|
+ red_p->skb_start_seq = TCP_SKB_CB(skb)->seq;
|
|
+ red_p->skb_end_seq = TCP_SKB_CB(skb)->end_seq;
|
|
+ redsched_update_next_subflow(tp, red_cb);
|
|
+ *subsk = (struct sock *)tp;
|
|
+
|
|
+ if (TCP_SKB_CB(skb)->path_mask)
|
|
+ *reinject = -1;
|
|
+ return skb;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ struct redsched_priv *red_p;
|
|
+
|
|
+ tp = mptcp->tp;
|
|
+
|
|
+ if (tp == first_tp)
|
|
+ break;
|
|
+
|
|
+ /* Correct the skb pointers of the current subflow */
|
|
+ red_p = redsched_get_priv(tp);
|
|
+ redsched_correct_skb_pointers(meta_sk, red_p);
|
|
+
|
|
+ skb = redsched_next_skb_from_queue(&meta_sk->sk_write_queue,
|
|
+ red_p->skb, meta_sk);
|
|
+ if (skb && redsched_use_subflow(meta_sk, active_valid_sks, tp,
|
|
+ skb)) {
|
|
+ red_p->skb = skb;
|
|
+ red_p->skb_start_seq = TCP_SKB_CB(skb)->seq;
|
|
+ red_p->skb_end_seq = TCP_SKB_CB(skb)->end_seq;
|
|
+ redsched_update_next_subflow(tp, red_cb);
|
|
+ *subsk = (struct sock *)tp;
|
|
+
|
|
+ if (TCP_SKB_CB(skb)->path_mask)
|
|
+ *reinject = -1;
|
|
+ return skb;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Nothing to send */
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static void redsched_release(struct sock *sk)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct redsched_cb *red_cb = redsched_get_cb(tp);
|
|
+
|
|
+ /* Check if the next subflow would be the released one. If yes correct
|
|
+ * the pointer
|
|
+ */
|
|
+ if (red_cb->next_subflow == tp)
|
|
+ redsched_update_next_subflow(tp, red_cb);
|
|
+}
|
|
+
|
|
+static struct mptcp_sched_ops mptcp_sched_red = {
|
|
+ .get_subflow = red_get_available_subflow,
|
|
+ .next_segment = mptcp_red_next_segment,
|
|
+ .release = redsched_release,
|
|
+ .name = "redundant",
|
|
+ .owner = THIS_MODULE,
|
|
+};
|
|
+
|
|
+static int __init red_register(void)
|
|
+{
|
|
+ BUILD_BUG_ON(sizeof(struct redsched_priv) > MPTCP_SCHED_SIZE);
|
|
+ BUILD_BUG_ON(sizeof(struct redsched_cb) > MPTCP_SCHED_DATA_SIZE);
|
|
+
|
|
+ if (mptcp_register_scheduler(&mptcp_sched_red))
|
|
+ return -1;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void red_unregister(void)
|
|
+{
|
|
+ mptcp_unregister_scheduler(&mptcp_sched_red);
|
|
+}
|
|
+
|
|
+module_init(red_register);
|
|
+module_exit(red_unregister);
|
|
+
|
|
+MODULE_AUTHOR("Tobias Erbshaeusser, Alexander Froemmgen");
|
|
+MODULE_LICENSE("GPL");
|
|
+MODULE_DESCRIPTION("REDUNDANT MPTCP");
|
|
+MODULE_VERSION("0.90");
|
|
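The pointer-correction logic above is compact but easy to misread. Below is a minimal userspace-style sketch of the same bookkeeping, under simplified assumptions: sketch_skb, sketch_subflow, seq_after() and correct_skb_pointer() are illustrative stand-ins, not the kernel types or helpers. Each subflow caches the last meta-level segment it scheduled together with its sequence range, and the cache is dropped as soon as that range no longer lies inside the meta socket's [snd_una, snd_nxt] window, so the scheduler never reuses a pointer the meta level may already have freed.

#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

struct sketch_skb { uint32_t seq, end_seq; };

struct sketch_subflow {
	struct sketch_skb *skb;			/* last segment scheduled on this subflow */
	uint32_t skb_start_seq, skb_end_seq;	/* its meta-level sequence range */
};

/* Same wrap-around comparison idea as the kernel's after() */
static bool seq_after(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) > 0;
}

/* Mirrors redsched_correct_skb_pointers(): forget the cached segment once its
 * range is no longer within [snd_una, snd_nxt] of the meta socket.
 */
static void correct_skb_pointer(struct sketch_subflow *sub,
				uint32_t snd_una, uint32_t snd_nxt)
{
	if (sub->skb &&
	    (!seq_after(sub->skb_start_seq, snd_una) ||
	     seq_after(sub->skb_end_seq, snd_nxt)))
		sub->skb = NULL;
}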
diff --git a/net/mptcp/mptcp_rr.c b/net/mptcp/mptcp_rr.c
|
|
new file mode 100644
|
|
index 000000000000..396e8aaf4762
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mptcp_rr.c
|
|
@@ -0,0 +1,309 @@
|
|
+/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */
|
|
+
|
|
+#include <linux/module.h>
|
|
+#include <net/mptcp.h>
|
|
+
|
|
+static unsigned char num_segments __read_mostly = 1;
|
|
+module_param(num_segments, byte, 0644);
|
|
+MODULE_PARM_DESC(num_segments, "The number of consecutive segments that are part of a burst");
|
|
+
|
|
+static bool cwnd_limited __read_mostly = 1;
|
|
+module_param(cwnd_limited, bool, 0644);
|
|
+MODULE_PARM_DESC(cwnd_limited, "if set to 1, the scheduler tries to fill the congestion-window on all subflows");
|
|
+
|
|
+struct rrsched_priv {
|
|
+ unsigned char quota;
|
|
+};
|
|
+
|
|
+static struct rrsched_priv *rrsched_get_priv(const struct tcp_sock *tp)
|
|
+{
|
|
+ return (struct rrsched_priv *)&tp->mptcp->mptcp_sched[0];
|
|
+}
|
|
+
|
|
+/* Is the sub-socket sk available to send the skb? */
|
|
+static bool mptcp_rr_is_available(const struct sock *sk, const struct sk_buff *skb,
|
|
+ bool zero_wnd_test, bool cwnd_test)
|
|
+{
|
|
+ const struct tcp_sock *tp = tcp_sk(sk);
|
|
+ unsigned int space, in_flight;
|
|
+
|
|
+ /* Set of states for which we are allowed to send data */
|
|
+ if (!mptcp_sk_can_send(sk))
|
|
+ return false;
|
|
+
|
|
+ /* We do not send data on this subflow unless it is
|
|
+ * fully established, i.e. the 4th ack has been received.
|
|
+ */
|
|
+ if (tp->mptcp->pre_established)
|
|
+ return false;
|
|
+
|
|
+ if (tp->pf)
|
|
+ return false;
|
|
+
|
|
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
|
|
+ /* If SACK is disabled, and we got a loss, TCP does not exit
|
|
+ * the loss-state until something above high_seq has been acked.
|
|
+ * (see tcp_try_undo_recovery)
|
|
+ *
|
|
+ * high_seq is the snd_nxt at the moment of the RTO. As soon
|
|
+ * as we have an RTO, we won't push data on the subflow.
|
|
+ * Thus, snd_una can never go beyond high_seq.
|
|
+ */
|
|
+ if (!tcp_is_reno(tp))
|
|
+ return false;
|
|
+ else if (tp->snd_una != tp->high_seq)
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ if (!tp->mptcp->fully_established) {
|
|
+ /* Make sure that we send in-order data */
|
|
+ if (skb && tp->mptcp->second_packet &&
|
|
+ tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ if (!cwnd_test)
|
|
+ goto zero_wnd_test;
|
|
+
|
|
+ in_flight = tcp_packets_in_flight(tp);
|
|
+ /* Not even a single spot in the cwnd */
|
|
+ if (in_flight >= tp->snd_cwnd)
|
|
+ return false;
|
|
+
|
|
+ /* Now, check if what is queued in the subflow's send-queue
|
|
+ * already fills the cwnd.
|
|
+ */
|
|
+ space = (tp->snd_cwnd - in_flight) * tp->mss_cache;
|
|
+
|
|
+ if (tp->write_seq - tp->snd_nxt > space)
|
|
+ return false;
|
|
+
|
|
+zero_wnd_test:
|
|
+ if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
|
|
+ return false;
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/* Are we not allowed to reinject this skb on tp? */
|
|
+static int mptcp_rr_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb)
|
|
+{
|
|
+ /* If the skb has already been enqueued in this sk, try to find
|
|
+ * another one.
|
|
+ */
|
|
+ return skb &&
|
|
+ /* Has the skb already been enqueued into this subsocket? */
|
|
+ mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
|
|
+}
|
|
+
|
|
+/* We just look for any subflow that is available */
|
|
+static struct sock *rr_get_available_subflow(struct sock *meta_sk,
|
|
+ struct sk_buff *skb,
|
|
+ bool zero_wnd_test)
|
|
+{
|
|
+ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ struct sock *sk = NULL, *bestsk = NULL, *backupsk = NULL;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ /* Answer data_fin on same subflow!!! */
|
|
+ if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
|
|
+ skb && mptcp_is_data_fin(skb)) {
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ sk = mptcp_to_sock(mptcp);
|
|
+ if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
|
|
+ mptcp_rr_is_available(sk, skb, zero_wnd_test, true))
|
|
+ return sk;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* First, find the best subflow */
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ struct tcp_sock *tp;
|
|
+
|
|
+ sk = mptcp_to_sock(mptcp);
|
|
+ tp = tcp_sk(sk);
|
|
+
|
|
+ if (!mptcp_rr_is_available(sk, skb, zero_wnd_test, true))
|
|
+ continue;
|
|
+
|
|
+ if (mptcp_rr_dont_reinject_skb(tp, skb)) {
|
|
+ backupsk = sk;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ bestsk = sk;
|
|
+ }
|
|
+
|
|
+ if (bestsk) {
|
|
+ sk = bestsk;
|
|
+ } else if (backupsk) {
|
|
+ /* It has been sent on all subflows once - let's give it a
|
|
+ * chance again by restarting its pathmask.
|
|
+ */
|
|
+ if (skb)
|
|
+ TCP_SKB_CB(skb)->path_mask = 0;
|
|
+ sk = backupsk;
|
|
+ }
|
|
+
|
|
+ return sk;
|
|
+}
|
|
+
|
|
+/* Returns the next segment to be sent from the mptcp meta-queue.
|
|
+ * (chooses the reinject queue if any segment is waiting in it, otherwise,
|
|
+ * chooses the normal write queue).
|
|
+ * Sets *@reinject to 1 if the returned segment comes from the
|
|
+ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
|
|
+ * and sets it to -1 if it is a meta-level retransmission to optimize the
|
|
+ * receive-buffer.
|
|
+ */
|
|
+static struct sk_buff *__mptcp_rr_next_segment(const struct sock *meta_sk, int *reinject)
|
|
+{
|
|
+ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ struct sk_buff *skb = NULL;
|
|
+
|
|
+ *reinject = 0;
|
|
+
|
|
+ /* If we are in fallback-mode, just take from the meta-send-queue */
|
|
+ if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
|
|
+ return tcp_send_head(meta_sk);
|
|
+
|
|
+ skb = skb_peek(&mpcb->reinject_queue);
|
|
+
|
|
+ if (skb)
|
|
+ *reinject = 1;
|
|
+ else
|
|
+ skb = tcp_send_head(meta_sk);
|
|
+ return skb;
|
|
+}
|
|
+
|
|
+static struct sk_buff *mptcp_rr_next_segment(struct sock *meta_sk,
|
|
+ int *reinject,
|
|
+ struct sock **subsk,
|
|
+ unsigned int *limit)
|
|
+{
|
|
+ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ struct sock *choose_sk = NULL;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct sk_buff *skb = __mptcp_rr_next_segment(meta_sk, reinject);
|
|
+ unsigned char split = num_segments;
|
|
+ unsigned char iter = 0, full_subs = 0;
|
|
+
|
|
+ /* As we set it, we have to reset it as well. */
|
|
+ *limit = 0;
|
|
+
|
|
+ if (!skb)
|
|
+ return NULL;
|
|
+
|
|
+ if (*reinject) {
|
|
+ *subsk = rr_get_available_subflow(meta_sk, skb, false);
|
|
+ if (!*subsk)
|
|
+ return NULL;
|
|
+
|
|
+ return skb;
|
|
+ }
|
|
+
|
|
+retry:
|
|
+
|
|
+ /* First, we look for a subflow who is currently being used */
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+ struct tcp_sock *tp_it = tcp_sk(sk_it);
|
|
+ struct rrsched_priv *rr_p = rrsched_get_priv(tp_it);
|
|
+
|
|
+ if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited))
|
|
+ continue;
|
|
+
|
|
+ iter++;
|
|
+
|
|
+ /* Is this subflow currently being used? */
|
|
+ if (rr_p->quota > 0 && rr_p->quota < num_segments) {
|
|
+ split = num_segments - rr_p->quota;
|
|
+ choose_sk = sk_it;
|
|
+ goto found;
|
|
+ }
|
|
+
|
|
+ /* Or, it's totally unused */
|
|
+ if (!rr_p->quota) {
|
|
+ split = num_segments;
|
|
+ choose_sk = sk_it;
|
|
+ }
|
|
+
|
|
+ /* Or, it must then be fully used */
|
|
+ if (rr_p->quota >= num_segments)
|
|
+ full_subs++;
|
|
+ }
|
|
+
|
|
+ /* All considered subflows have a full quota, and we considered at
|
|
+ * least one.
|
|
+ */
|
|
+ if (iter && iter == full_subs) {
|
|
+ /* So, we restart this round by setting quota to 0 and retry
|
|
+ * to find a subflow.
|
|
+ */
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ struct sock *sk_it = mptcp_to_sock(mptcp);
|
|
+ struct tcp_sock *tp_it = tcp_sk(sk_it);
|
|
+ struct rrsched_priv *rr_p = rrsched_get_priv(tp_it);
|
|
+
|
|
+ if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited))
|
|
+ continue;
|
|
+
|
|
+ rr_p->quota = 0;
|
|
+ }
|
|
+
|
|
+ goto retry;
|
|
+ }
|
|
+
|
|
+found:
|
|
+ if (choose_sk) {
|
|
+ unsigned int mss_now;
|
|
+ struct tcp_sock *choose_tp = tcp_sk(choose_sk);
|
|
+ struct rrsched_priv *rr_p = rrsched_get_priv(choose_tp);
|
|
+
|
|
+ if (!mptcp_rr_is_available(choose_sk, skb, false, true))
|
|
+ return NULL;
|
|
+
|
|
+ *subsk = choose_sk;
|
|
+ mss_now = tcp_current_mss(*subsk);
|
|
+ *limit = split * mss_now;
|
|
+
|
|
+ if (skb->len > mss_now)
|
|
+ rr_p->quota += DIV_ROUND_UP(skb->len, mss_now);
|
|
+ else
|
|
+ rr_p->quota++;
|
|
+
|
|
+ return skb;
|
|
+ }
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static struct mptcp_sched_ops mptcp_sched_rr = {
|
|
+ .get_subflow = rr_get_available_subflow,
|
|
+ .next_segment = mptcp_rr_next_segment,
|
|
+ .name = "roundrobin",
|
|
+ .owner = THIS_MODULE,
|
|
+};
|
|
+
|
|
+static int __init rr_register(void)
|
|
+{
|
|
+ BUILD_BUG_ON(sizeof(struct rrsched_priv) > MPTCP_SCHED_SIZE);
|
|
+
|
|
+ if (mptcp_register_scheduler(&mptcp_sched_rr))
|
|
+ return -1;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void rr_unregister(void)
|
|
+{
|
|
+ mptcp_unregister_scheduler(&mptcp_sched_rr);
|
|
+}
|
|
+
|
|
+module_init(rr_register);
|
|
+module_exit(rr_unregister);
|
|
+
|
|
+MODULE_AUTHOR("Christoph Paasch");
|
|
+MODULE_LICENSE("GPL");
|
|
+MODULE_DESCRIPTION("ROUNDROBIN MPTCP");
|
|
+MODULE_VERSION("0.89");
|
|
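To make the burst accounting in mptcp_rr_next_segment() concrete, here is a small standalone simulation; a sketch only, with hypothetical names (struct sub, pick_subflow), availability tests and the reinject path left out, and every segment assumed to be one MSS. With num_segments set to 3 and two subflows, the trace shows full bursts alternating between the subflows and the quota reset once every subflow has used up its burst.

#include <stdio.h>

#define NUM_SEGMENTS 3	/* kernel default is 1; set via the num_segments module parameter */

struct sub { unsigned char quota; };

/* Mirrors the selection loop of mptcp_rr_next_segment(): finish a running
 * burst first, otherwise start a burst on an idle subflow, and reset all
 * quotas once every subflow has used its burst. Availability checks omitted.
 */
static int pick_subflow(struct sub *subs, int n, unsigned char *split)
{
	int i, full = 0, chosen = -1;

	for (i = 0; i < n; i++) {
		if (subs[i].quota > 0 && subs[i].quota < NUM_SEGMENTS) {
			*split = NUM_SEGMENTS - subs[i].quota;
			return i;		/* finish the running burst */
		}
		if (!subs[i].quota) {
			*split = NUM_SEGMENTS;
			chosen = i;		/* idle subflow: candidate for a new burst */
		}
		if (subs[i].quota >= NUM_SEGMENTS)
			full++;
	}
	if (full == n) {			/* every subflow is full: new round */
		for (i = 0; i < n; i++)
			subs[i].quota = 0;
		return pick_subflow(subs, n, split);
	}
	return chosen;
}

int main(void)
{
	struct sub subs[2] = { {0}, {0} };
	unsigned char split;
	int seg;

	for (seg = 0; seg < 8; seg++) {
		int i = pick_subflow(subs, 2, &split);

		subs[i].quota++;		/* one MSS-sized segment scheduled */
		printf("segment %d -> subflow %d (burst room %u)\n",
		       seg, i, (unsigned int)split);
	}
	return 0;
}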
diff --git a/net/mptcp/mptcp_sched.c b/net/mptcp/mptcp_sched.c
|
|
new file mode 100644
|
|
index 000000000000..eed9bfb44b59
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mptcp_sched.c
|
|
@@ -0,0 +1,677 @@
|
|
+/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */
|
|
+
|
|
+#include <linux/bug.h>
|
|
+#include <linux/module.h>
|
|
+#include <net/mptcp.h>
|
|
+#include <trace/events/tcp.h>
|
|
+
|
|
+static DEFINE_SPINLOCK(mptcp_sched_list_lock);
|
|
+static LIST_HEAD(mptcp_sched_list);
|
|
+
|
|
+struct defsched_priv {
|
|
+ u32 last_rbuf_opti;
|
|
+};
|
|
+
|
|
+static struct defsched_priv *defsched_get_priv(const struct tcp_sock *tp)
|
|
+{
|
|
+ return (struct defsched_priv *)&tp->mptcp->mptcp_sched[0];
|
|
+}
|
|
+
|
|
+bool mptcp_is_def_unavailable(struct sock *sk)
|
|
+{
|
|
+ const struct tcp_sock *tp = tcp_sk(sk);
|
|
+
|
|
+ /* Set of states for which we are allowed to send data */
|
|
+ if (!mptcp_sk_can_send(sk))
|
|
+ return true;
|
|
+
|
|
+ /* We do not send data on this subflow unless it is
|
|
+ * fully established, i.e. the 4th ack has been received.
|
|
+ */
|
|
+ if (tp->mptcp->pre_established)
|
|
+ return true;
|
|
+
|
|
+ if (tp->pf)
|
|
+ return true;
|
|
+
|
|
+ return false;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(mptcp_is_def_unavailable);
|
|
+
|
|
+/* estimate number of segments currently in flight + unsent in
|
|
+ * the subflow socket.
|
|
+ */
|
|
+static int mptcp_subflow_queued(struct sock *sk, u32 max_tso_segs)
|
|
+{
|
|
+ const struct tcp_sock *tp = tcp_sk(sk);
|
|
+ unsigned int queued;
|
|
+
|
|
+ /* estimate the max number of segments in the write queue
|
|
+ * this is an overestimation; we avoid iterating over the queue,
|
|
+ * which would give a more precise estimate.
|
|
+ * Having only one skb in the queue however might trigger tso deferral,
|
|
+ * delaying the sending of a tso segment in the hope that skb_entail
|
|
+ * will append more data to the skb soon.
|
|
+ * Therefore, in the case only one skb is in the queue, we choose to
|
|
+ * potentially underestimate, at the risk of scheduling one skb too many onto
|
|
+ * the subflow rather than not enough.
|
|
+ */
|
|
+ if (sk->sk_write_queue.qlen > 1)
|
|
+ queued = sk->sk_write_queue.qlen * max_tso_segs;
|
|
+ else
|
|
+ queued = sk->sk_write_queue.qlen;
|
|
+
|
|
+ return queued + tcp_packets_in_flight(tp);
|
|
+}
|
|
+
|
|
+static bool mptcp_is_temp_unavailable(struct sock *sk,
|
|
+ const struct sk_buff *skb,
|
|
+ bool zero_wnd_test)
|
|
+{
|
|
+ const struct tcp_sock *tp = tcp_sk(sk);
|
|
+ unsigned int mss_now;
|
|
+
|
|
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
|
|
+ /* If SACK is disabled, and we got a loss, TCP does not exit
|
|
+ * the loss-state until something above high_seq has been
|
|
+ * acked. (see tcp_try_undo_recovery)
|
|
+ *
|
|
+ * high_seq is the snd_nxt at the moment of the RTO. As soon
|
|
+ * as we have an RTO, we won't push data on the subflow.
|
|
+ * Thus, snd_una can never go beyond high_seq.
|
|
+ */
|
|
+ if (!tcp_is_reno(tp))
|
|
+ return true;
|
|
+ else if (tp->snd_una != tp->high_seq)
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ if (!tp->mptcp->fully_established) {
|
|
+ /* Make sure that we send in-order data */
|
|
+ if (skb && tp->mptcp->second_packet &&
|
|
+ tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ mss_now = tcp_current_mss(sk);
|
|
+
|
|
+ /* Not even a single spot in the cwnd */
|
|
+ if (mptcp_subflow_queued(sk, tcp_tso_segs(sk, mss_now)) >= tp->snd_cwnd)
|
|
+ return true;
|
|
+
|
|
+ if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
|
|
+ return true;
|
|
+
|
|
+ /* Don't send on this subflow if we bypass the allowed send-window at
|
|
+ * the per-subflow level. Similar to tcp_snd_wnd_test, but manually
|
|
+ * calculated end_seq (because here at this point end_seq is still at
|
|
+ * the meta-level).
|
|
+ */
|
|
+ if (skb && zero_wnd_test &&
|
|
+ after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp)))
|
|
+ return true;
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+/* Is the sub-socket sk available to send the skb? */
|
|
+bool mptcp_is_available(struct sock *sk, const struct sk_buff *skb,
|
|
+ bool zero_wnd_test)
|
|
+{
|
|
+ return !mptcp_is_def_unavailable(sk) &&
|
|
+ !mptcp_is_temp_unavailable(sk, skb, zero_wnd_test);
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(mptcp_is_available);
|
|
+
|
|
+/* Are we not allowed to reinject this skb on tp? */
|
|
+static int mptcp_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb)
|
|
+{
|
|
+ /* If the skb has already been enqueued in this sk, try to find
|
|
+ * another one.
|
|
+ */
|
|
+ return skb &&
|
|
+ /* Has the skb already been enqueued into this subsocket? */
|
|
+ mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
|
|
+}
|
|
+
|
|
+bool subflow_is_backup(const struct tcp_sock *tp)
|
|
+{
|
|
+ return tp->mptcp->rcv_low_prio || tp->mptcp->low_prio;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(subflow_is_backup);
|
|
+
|
|
+bool subflow_is_active(const struct tcp_sock *tp)
|
|
+{
|
|
+ return !tp->mptcp->rcv_low_prio && !tp->mptcp->low_prio;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(subflow_is_active);
|
|
+
|
|
+/* Generic function to iterate over used and unused subflows and to select the
|
|
+ * best one
|
|
+ */
|
|
+static struct sock
|
|
+*get_subflow_from_selectors(struct mptcp_cb *mpcb, struct sk_buff *skb,
|
|
+ bool (*selector)(const struct tcp_sock *),
|
|
+ bool zero_wnd_test, bool *force)
|
|
+{
|
|
+ struct sock *bestsk = NULL;
|
|
+ u32 min_srtt = 0xffffffff;
|
|
+ bool found_unused = false;
|
|
+ bool found_unused_una = false;
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ struct sock *sk = mptcp_to_sock(mptcp);
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ bool unused = false;
|
|
+
|
|
+ /* First, we choose only the wanted sks */
|
|
+ if (!(*selector)(tp))
|
|
+ continue;
|
|
+
|
|
+ if (!mptcp_dont_reinject_skb(tp, skb))
|
|
+ unused = true;
|
|
+ else if (found_unused)
|
|
+ /* If an unused sk was found previously, we continue -
|
|
+ * no need to check used sks anymore.
|
|
+ */
|
|
+ continue;
|
|
+
|
|
+ if (mptcp_is_def_unavailable(sk))
|
|
+ continue;
|
|
+
|
|
+ if (mptcp_is_temp_unavailable(sk, skb, zero_wnd_test)) {
|
|
+ if (unused)
|
|
+ found_unused_una = true;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (unused) {
|
|
+ if (!found_unused) {
|
|
+ /* It's the first time we encounter an unused
|
|
+ * sk - thus we reset the bestsk (which might
|
|
+ * have been set to a used sk).
|
|
+ */
|
|
+ min_srtt = 0xffffffff;
|
|
+ bestsk = NULL;
|
|
+ }
|
|
+ found_unused = true;
|
|
+ }
|
|
+
|
|
+ if (tp->srtt_us < min_srtt) {
|
|
+ min_srtt = tp->srtt_us;
|
|
+ bestsk = sk;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (bestsk) {
|
|
+ /* The force variable is used to mark the returned sk as
|
|
+ * previously used or not-used.
|
|
+ */
|
|
+ if (found_unused)
|
|
+ *force = true;
|
|
+ else
|
|
+ *force = false;
|
|
+ } else {
|
|
+ /* The force variable is used to mark if there are temporarily
|
|
+ * unavailable not-used sks.
|
|
+ */
|
|
+ if (found_unused_una)
|
|
+ *force = true;
|
|
+ else
|
|
+ *force = false;
|
|
+ }
|
|
+
|
|
+ return bestsk;
|
|
+}
|
|
+
|
|
+/* This is the scheduler. This function decides on which flow to send
|
|
+ * a given MSS. If all subflows are found to be busy, NULL is returned
|
|
+ * The flow is selected based on the shortest RTT.
|
|
+ * If all paths have full cong windows, we simply return NULL.
|
|
+ *
|
|
+ * Additionally, this function is aware of the backup-subflows.
|
|
+ */
|
|
+struct sock *get_available_subflow(struct sock *meta_sk, struct sk_buff *skb,
|
|
+ bool zero_wnd_test)
|
|
+{
|
|
+ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ struct sock *sk;
|
|
+ bool looping = false, force;
|
|
+
|
|
+ /* Answer data_fin on same subflow!!! */
|
|
+ if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
|
|
+ skb && mptcp_is_data_fin(skb)) {
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ sk = mptcp_to_sock(mptcp);
|
|
+
|
|
+ if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
|
|
+ mptcp_is_available(sk, skb, zero_wnd_test))
|
|
+ return sk;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Find the best subflow */
|
|
+restart:
|
|
+ sk = get_subflow_from_selectors(mpcb, skb, &subflow_is_active,
|
|
+ zero_wnd_test, &force);
|
|
+ if (force)
|
|
+ /* one unused active sk or one NULL sk when there is at least
|
|
+ * one temporarily unavailable unused active sk
|
|
+ */
|
|
+ return sk;
|
|
+
|
|
+ sk = get_subflow_from_selectors(mpcb, skb, &subflow_is_backup,
|
|
+ zero_wnd_test, &force);
|
|
+ if (!force && skb) {
|
|
+ /* one used backup sk, or one NULL sk when there is no
|
|
+ * temporarily unavailable unused backup sk
|
|
+ *
|
|
+ * the skb passed through all the available active and backups
|
|
+ * sks, so clean the path mask
|
|
+ */
|
|
+ TCP_SKB_CB(skb)->path_mask = 0;
|
|
+
|
|
+ if (!looping) {
|
|
+ looping = true;
|
|
+ goto restart;
|
|
+ }
|
|
+ }
|
|
+ return sk;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(get_available_subflow);
|
|
+
|
|
+static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal)
|
|
+{
|
|
+ struct sock *meta_sk;
|
|
+ const struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+ struct sk_buff *skb_head;
|
|
+ struct defsched_priv *def_p = defsched_get_priv(tp);
|
|
+
|
|
+ meta_sk = mptcp_meta_sk(sk);
|
|
+ skb_head = tcp_rtx_queue_head(meta_sk);
|
|
+
|
|
+ if (!skb_head)
|
|
+ return NULL;
|
|
+
|
|
+ /* If penalization is optional (coming from mptcp_next_segment()) and
|
|
+ * we are not send-buffer-limited, we do not penalize. The retransmission
|
|
+ * is just an optimization to fix the idle-time due to the delay before
|
|
+ * we wake up the application.
|
|
+ */
|
|
+ if (!penal && sk_stream_memory_free(meta_sk))
|
|
+ goto retrans;
|
|
+
|
|
+ /* Only penalize again after an RTT has elapsed */
|
|
+ if (tcp_jiffies32 - def_p->last_rbuf_opti < usecs_to_jiffies(tp->srtt_us >> 3))
|
|
+ goto retrans;
|
|
+
|
|
+ /* Half the cwnd of the slow flows */
|
|
+ mptcp_for_each_sub(tp->mpcb, mptcp) {
|
|
+ struct tcp_sock *tp_it = mptcp->tp;
|
|
+
|
|
+ if (tp_it != tp &&
|
|
+ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
|
|
+ if (tp->srtt_us < tp_it->srtt_us && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) {
|
|
+ u32 prior_cwnd = tp_it->snd_cwnd;
|
|
+
|
|
+ tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U);
|
|
+
|
|
+ /* If in slow start, do not reduce the ssthresh */
|
|
+ if (prior_cwnd >= tp_it->snd_ssthresh)
|
|
+ tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U);
|
|
+
|
|
+ def_p->last_rbuf_opti = tcp_jiffies32;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+retrans:
|
|
+
|
|
+ /* Segment not yet injected into this path? Take it!!! */
|
|
+ if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) {
|
|
+ bool do_retrans = false;
|
|
+ mptcp_for_each_sub(tp->mpcb, mptcp) {
|
|
+ struct tcp_sock *tp_it = mptcp->tp;
|
|
+
|
|
+ if (tp_it != tp &&
|
|
+ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
|
|
+ if (tp_it->snd_cwnd <= 4) {
|
|
+ do_retrans = true;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (4 * tp->srtt_us >= tp_it->srtt_us) {
|
|
+ do_retrans = false;
|
|
+ break;
|
|
+ } else {
|
|
+ do_retrans = true;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (do_retrans && mptcp_is_available(sk, skb_head, false)) {
|
|
+ trace_mptcp_retransmit(sk, skb_head);
|
|
+ return skb_head;
|
|
+ }
|
|
+ }
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+/* Returns the next segment to be sent from the mptcp meta-queue.
|
|
+ * (chooses the reinject queue if any segment is waiting in it, otherwise,
|
|
+ * chooses the normal write queue).
|
|
+ * Sets *@reinject to 1 if the returned segment comes from the
|
|
+ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
|
|
+ * and sets it to -1 if it is a meta-level retransmission to optimize the
|
|
+ * receive-buffer.
|
|
+ */
|
|
+static struct sk_buff *__mptcp_next_segment(struct sock *meta_sk, int *reinject)
|
|
+{
|
|
+ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
|
|
+ struct sk_buff *skb = NULL;
|
|
+
|
|
+ *reinject = 0;
|
|
+
|
|
+ /* If we are in fallback-mode, just take from the meta-send-queue */
|
|
+ if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
|
|
+ return tcp_send_head(meta_sk);
|
|
+
|
|
+ skb = skb_peek(&mpcb->reinject_queue);
|
|
+
|
|
+ if (skb) {
|
|
+ *reinject = 1;
|
|
+ } else {
|
|
+ skb = tcp_send_head(meta_sk);
|
|
+
|
|
+ if (!skb && meta_sk->sk_socket &&
|
|
+ test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) &&
|
|
+ sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) {
|
|
+ struct sock *subsk;
|
|
+
|
|
+ /* meta is send buffer limited */
|
|
+ tcp_chrono_start(meta_sk, TCP_CHRONO_SNDBUF_LIMITED);
|
|
+
|
|
+ subsk = mpcb->sched_ops->get_subflow(meta_sk,
|
|
+ NULL, false);
|
|
+ if (!subsk)
|
|
+ return NULL;
|
|
+
|
|
+ skb = mptcp_rcv_buf_optimization(subsk, 0);
|
|
+ if (skb)
|
|
+ *reinject = -1;
|
|
+ else
|
|
+ tcp_chrono_start(subsk,
|
|
+ TCP_CHRONO_SNDBUF_LIMITED);
|
|
+ }
|
|
+ }
|
|
+ return skb;
|
|
+}
|
|
+
|
|
+struct sk_buff *mptcp_next_segment(struct sock *meta_sk,
|
|
+ int *reinject,
|
|
+ struct sock **subsk,
|
|
+ unsigned int *limit)
|
|
+{
|
|
+ struct sk_buff *skb = __mptcp_next_segment(meta_sk, reinject);
|
|
+ unsigned int mss_now;
|
|
+ u32 max_len, gso_max_segs, max_segs, max_tso_segs, window;
|
|
+ struct tcp_sock *subtp;
|
|
+ int queued;
|
|
+
|
|
+ /* As we set it, we have to reset it as well. */
|
|
+ *limit = 0;
|
|
+
|
|
+ if (!skb)
|
|
+ return NULL;
|
|
+
|
|
+ *subsk = tcp_sk(meta_sk)->mpcb->sched_ops->get_subflow(meta_sk, skb, false);
|
|
+ if (!*subsk)
|
|
+ return NULL;
|
|
+
|
|
+ subtp = tcp_sk(*subsk);
|
|
+ mss_now = tcp_current_mss(*subsk);
|
|
+
|
|
+ if (!*reinject && unlikely(!tcp_snd_wnd_test(tcp_sk(meta_sk), skb, mss_now))) {
|
|
+ /* an active flow is selected, but segment will not be sent due
|
|
+ * to no more space in send window
|
|
+ * this means the meta is receive window limited
|
|
+ * the subflow might also be, if we have nothing to reinject
|
|
+ */
|
|
+ tcp_chrono_start(meta_sk, TCP_CHRONO_RWND_LIMITED);
|
|
+ skb = mptcp_rcv_buf_optimization(*subsk, 1);
|
|
+ if (skb)
|
|
+ *reinject = -1;
|
|
+ else
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ if (!*reinject) {
|
|
+ /* this will stop any other chronos on the meta */
|
|
+ tcp_chrono_start(meta_sk, TCP_CHRONO_BUSY);
|
|
+ }
|
|
+
|
|
+ /* No splitting required, as we will only send one single segment */
|
|
+ if (skb->len <= mss_now)
|
|
+ return skb;
|
|
+
|
|
+ max_tso_segs = tcp_tso_segs(*subsk, tcp_current_mss(*subsk));
|
|
+ queued = mptcp_subflow_queued(*subsk, max_tso_segs);
|
|
+
|
|
+ /* this condition should already have been established in
|
|
+ * mptcp_is_temp_unavailable when selecting available flows
|
|
+ */
|
|
+ WARN_ONCE(subtp->snd_cwnd <= queued, "Selected subflow no cwnd room");
|
|
+
|
|
+ gso_max_segs = (*subsk)->sk_gso_max_segs;
|
|
+ if (!gso_max_segs) /* No gso supported on the subflow's NIC */
|
|
+ gso_max_segs = 1;
|
|
+
|
|
+ max_segs = min_t(unsigned int, subtp->snd_cwnd - queued, gso_max_segs);
|
|
+ if (!max_segs)
|
|
+ return NULL;
|
|
+
|
|
+ /* if there is room for a segment, schedule up to a complete TSO
|
|
+ * segment to avoid TSO splitting. Even if it is more than allowed by
|
|
+ * the congestion window.
|
|
+ */
|
|
+ max_segs = max_t(unsigned int, max_tso_segs, max_segs);
|
|
+
|
|
+ max_len = min(mss_now * max_segs, skb->len);
|
|
+
|
|
+ window = tcp_wnd_end(subtp) - subtp->write_seq;
|
|
+
|
|
+ /* max_len now also respects the announced receive-window */
|
|
+ max_len = min(max_len, window);
|
|
+
|
|
+ *limit = max_len;
|
|
+
|
|
+ return skb;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(mptcp_next_segment);
|
|
+
|
|
+static void defsched_init(struct sock *sk)
|
|
+{
|
|
+ struct defsched_priv *def_p = defsched_get_priv(tcp_sk(sk));
|
|
+
|
|
+ def_p->last_rbuf_opti = tcp_jiffies32;
|
|
+}
|
|
+
|
|
+struct mptcp_sched_ops mptcp_sched_default = {
|
|
+ .get_subflow = get_available_subflow,
|
|
+ .next_segment = mptcp_next_segment,
|
|
+ .init = defsched_init,
|
|
+ .name = "default",
|
|
+ .owner = THIS_MODULE,
|
|
+};
|
|
+
|
|
+static struct mptcp_sched_ops *mptcp_sched_find(const char *name)
|
|
+{
|
|
+ struct mptcp_sched_ops *e;
|
|
+
|
|
+ list_for_each_entry_rcu(e, &mptcp_sched_list, list) {
|
|
+ if (strcmp(e->name, name) == 0)
|
|
+ return e;
|
|
+ }
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
|
|
+{
|
|
+ int ret = 0;
|
|
+
|
|
+ if (!sched->get_subflow || !sched->next_segment)
|
|
+ return -EINVAL;
|
|
+
|
|
+ spin_lock(&mptcp_sched_list_lock);
|
|
+ if (mptcp_sched_find(sched->name)) {
|
|
+ pr_notice("%s already registered\n", sched->name);
|
|
+ ret = -EEXIST;
|
|
+ } else {
|
|
+ list_add_tail_rcu(&sched->list, &mptcp_sched_list);
|
|
+ pr_info("%s registered\n", sched->name);
|
|
+ }
|
|
+ spin_unlock(&mptcp_sched_list_lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(mptcp_register_scheduler);
|
|
+
|
|
+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
|
|
+{
|
|
+ spin_lock(&mptcp_sched_list_lock);
|
|
+ list_del_rcu(&sched->list);
|
|
+ spin_unlock(&mptcp_sched_list_lock);
|
|
+
|
|
+ /* Wait for outstanding readers to complete before the
|
|
+ * module gets removed entirely.
|
|
+ *
|
|
+ * A try_module_get() should fail by now as our module is
|
|
+ * in "going" state since no refs are held anymore and
|
|
+ * the module_exit() handler is about to be called.
|
|
+ */
|
|
+ synchronize_rcu();
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(mptcp_unregister_scheduler);
|
|
+
|
|
+void mptcp_get_default_scheduler(char *name)
|
|
+{
|
|
+ struct mptcp_sched_ops *sched;
|
|
+
|
|
+ BUG_ON(list_empty(&mptcp_sched_list));
|
|
+
|
|
+ rcu_read_lock();
|
|
+ sched = list_entry(mptcp_sched_list.next, struct mptcp_sched_ops, list);
|
|
+ strncpy(name, sched->name, MPTCP_SCHED_NAME_MAX);
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+
|
|
+int mptcp_set_default_scheduler(const char *name)
|
|
+{
|
|
+ struct mptcp_sched_ops *sched;
|
|
+ int ret = -ENOENT;
|
|
+
|
|
+ spin_lock(&mptcp_sched_list_lock);
|
|
+ sched = mptcp_sched_find(name);
|
|
+#ifdef CONFIG_MODULES
|
|
+ if (!sched && capable(CAP_NET_ADMIN)) {
|
|
+ spin_unlock(&mptcp_sched_list_lock);
|
|
+
|
|
+ request_module("mptcp_%s", name);
|
|
+ spin_lock(&mptcp_sched_list_lock);
|
|
+ sched = mptcp_sched_find(name);
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ if (sched) {
|
|
+ list_move(&sched->list, &mptcp_sched_list);
|
|
+ ret = 0;
|
|
+ } else {
|
|
+ pr_info("%s is not available\n", name);
|
|
+ }
|
|
+ spin_unlock(&mptcp_sched_list_lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* Must be called with rcu lock held */
|
|
+static struct mptcp_sched_ops *__mptcp_sched_find_autoload(const char *name)
|
|
+{
|
|
+ struct mptcp_sched_ops *sched = mptcp_sched_find(name);
|
|
+#ifdef CONFIG_MODULES
|
|
+ if (!sched && capable(CAP_NET_ADMIN)) {
|
|
+ rcu_read_unlock();
|
|
+ request_module("mptcp_%s", name);
|
|
+ rcu_read_lock();
|
|
+ sched = mptcp_sched_find(name);
|
|
+ }
|
|
+#endif
|
|
+ return sched;
|
|
+}
|
|
+
|
|
+void mptcp_init_scheduler(struct mptcp_cb *mpcb)
|
|
+{
|
|
+ struct mptcp_sched_ops *sched;
|
|
+ struct sock *meta_sk = mpcb->meta_sk;
|
|
+ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
|
|
+
|
|
+ rcu_read_lock();
|
|
+ /* if scheduler was set using socket option */
|
|
+ if (meta_tp->mptcp_sched_setsockopt) {
|
|
+ sched = __mptcp_sched_find_autoload(meta_tp->mptcp_sched_name);
|
|
+ if (sched && try_module_get(sched->owner)) {
|
|
+ mpcb->sched_ops = sched;
|
|
+ goto out;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
|
|
+ if (try_module_get(sched->owner)) {
|
|
+ mpcb->sched_ops = sched;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+out:
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+
|
|
+/* Change scheduler for socket */
|
|
+int mptcp_set_scheduler(struct sock *sk, const char *name)
|
|
+{
|
|
+ struct mptcp_sched_ops *sched;
|
|
+ int err = 0;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ sched = __mptcp_sched_find_autoload(name);
|
|
+
|
|
+ if (!sched) {
|
|
+ err = -ENOENT;
|
|
+ } else if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
|
|
+ err = -EPERM;
|
|
+ } else {
|
|
+ strcpy(tcp_sk(sk)->mptcp_sched_name, name);
|
|
+ tcp_sk(sk)->mptcp_sched_setsockopt = 1;
|
|
+ }
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ return err;
|
|
+}
|
|
+
|
|
+/* Manage refcounts on socket close. */
|
|
+void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb)
|
|
+{
|
|
+ module_put(mpcb->sched_ops->owner);
|
|
+}
|
|
+
|
|
+/* Set default value from kernel configuration at bootup */
|
|
+static int __init mptcp_scheduler_default(void)
|
|
+{
|
|
+ BUILD_BUG_ON(sizeof(struct defsched_priv) > MPTCP_SCHED_SIZE);
|
|
+
|
|
+ return mptcp_set_default_scheduler(CONFIG_DEFAULT_MPTCP_SCHED);
|
|
+}
|
|
+late_initcall(mptcp_scheduler_default);
|
|
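The registration API above is all a scheduler module needs. As a hypothetical illustration (not part of this patch; the name "mysched" and the function names are invented), the skeleton below registers a scheduler that simply reuses the exported default helpers get_available_subflow() and mptcp_next_segment(); mptcp_register_scheduler() only insists on .get_subflow and .next_segment being set.

#include <linux/module.h>
#include <net/mptcp.h>

static struct mptcp_sched_ops mptcp_sched_mysched = {
	.get_subflow	= get_available_subflow,	/* exported by mptcp_sched.c above */
	.next_segment	= mptcp_next_segment,		/* exported by mptcp_sched.c above */
	.name		= "mysched",
	.owner		= THIS_MODULE,
};

static int __init mysched_register(void)
{
	/* Fails with -EEXIST if a scheduler of this name already exists. */
	return mptcp_register_scheduler(&mptcp_sched_mysched);
}

static void __exit mysched_unregister(void)
{
	mptcp_unregister_scheduler(&mptcp_sched_mysched);
}

module_init(mysched_register);
module_exit(mysched_unregister);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Example MPTCP scheduler skeleton");

Once loaded, such a module sits in mptcp_sched_list next to "default", "roundrobin" and "redundant"; it can be made the default through mptcp_set_default_scheduler() or selected per socket via mptcp_set_scheduler(), both defined above.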
diff --git a/net/mptcp/mptcp_wvegas.c b/net/mptcp/mptcp_wvegas.c
|
|
new file mode 100644
|
|
index 000000000000..787ddaab98a2
|
|
--- /dev/null
|
|
+++ b/net/mptcp/mptcp_wvegas.c
|
|
@@ -0,0 +1,271 @@
|
|
+/*
|
|
+ * MPTCP implementation - WEIGHTED VEGAS
|
|
+ *
|
|
+ * Algorithm design:
|
|
+ * Yu Cao <cyAnalyst@126.com>
|
|
+ * Mingwei Xu <xmw@csnet1.cs.tsinghua.edu.cn>
|
|
+ * Xiaoming Fu <fu@cs.uni-goettinggen.de>
|
|
+ *
|
|
+ * Implementation:
|
|
+ * Yu Cao <cyAnalyst@126.com>
|
|
+ * Enhuan Dong <deh13@mails.tsinghua.edu.cn>
|
|
+ *
|
|
+ * Ported to the official MPTCP-kernel:
|
|
+ * Christoph Paasch <christoph.paasch@uclouvain.be>
|
|
+ *
|
|
+ * This program is free software; you can redistribute it and/or
|
|
+ * modify it under the terms of the GNU General Public License
|
|
+ * as published by the Free Software Foundation; either version
|
|
+ * 2 of the License, or (at your option) any later version.
|
|
+ */
|
|
+
|
|
+#include <linux/skbuff.h>
|
|
+#include <net/tcp.h>
|
|
+#include <net/mptcp.h>
|
|
+#include <linux/module.h>
|
|
+#include <linux/tcp.h>
|
|
+
|
|
+static int initial_alpha = 2;
|
|
+static int total_alpha = 10;
|
|
+static int gamma = 1;
|
|
+
|
|
+module_param(initial_alpha, int, 0644);
|
|
+MODULE_PARM_DESC(initial_alpha, "initial alpha for all subflows");
|
|
+module_param(total_alpha, int, 0644);
|
|
+MODULE_PARM_DESC(total_alpha, "total alpha for all subflows");
|
|
+module_param(gamma, int, 0644);
|
|
+MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
|
|
+
|
|
+#define MPTCP_WVEGAS_SCALE 16
|
|
+
|
|
+/* wVegas variables */
|
|
+struct wvegas {
|
|
+ u32 beg_snd_nxt; /* right edge during last RTT */
|
|
+ u8 doing_wvegas_now; /* if true, do wvegas for this RTT */
|
|
+
|
|
+ u16 cnt_rtt; /* # of RTTs measured within last RTT */
|
|
+ u32 sampled_rtt; /* cumulative RTTs measured within last RTT (in usec) */
|
|
+ u32 base_rtt; /* the min of all wVegas RTT measurements seen (in usec) */
|
|
+
|
|
+ u64 instant_rate; /* cwnd / srtt_us, unit: pkts/us * 2^16 */
|
|
+ u64 weight; /* the ratio of subflow's rate to the total rate, * 2^16 */
|
|
+ int alpha; /* alpha for each subflows */
|
|
+
|
|
+ u32 queue_delay; /* queue delay */
|
|
+};
|
|
+
|
|
+
|
|
+static inline u64 mptcp_wvegas_scale(u32 val, int scale)
|
|
+{
|
|
+ return (u64) val << scale;
|
|
+}
|
|
+
|
|
+static void wvegas_enable(const struct sock *sk)
|
|
+{
|
|
+ const struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct wvegas *wvegas = inet_csk_ca(sk);
|
|
+
|
|
+ wvegas->doing_wvegas_now = 1;
|
|
+
|
|
+ wvegas->beg_snd_nxt = tp->snd_nxt;
|
|
+
|
|
+ wvegas->cnt_rtt = 0;
|
|
+ wvegas->sampled_rtt = 0;
|
|
+
|
|
+ wvegas->instant_rate = 0;
|
|
+ wvegas->alpha = initial_alpha;
|
|
+ wvegas->weight = mptcp_wvegas_scale(1, MPTCP_WVEGAS_SCALE);
|
|
+
|
|
+ wvegas->queue_delay = 0;
|
|
+}
|
|
+
|
|
+static inline void wvegas_disable(const struct sock *sk)
|
|
+{
|
|
+ struct wvegas *wvegas = inet_csk_ca(sk);
|
|
+
|
|
+ wvegas->doing_wvegas_now = 0;
|
|
+}
|
|
+
|
|
+static void mptcp_wvegas_init(struct sock *sk)
|
|
+{
|
|
+ struct wvegas *wvegas = inet_csk_ca(sk);
|
|
+
|
|
+ wvegas->base_rtt = 0x7fffffff;
|
|
+ wvegas_enable(sk);
|
|
+}
|
|
+
|
|
+static inline u64 mptcp_wvegas_rate(u32 cwnd, u32 rtt_us)
|
|
+{
|
|
+ return div_u64(mptcp_wvegas_scale(cwnd, MPTCP_WVEGAS_SCALE), rtt_us);
|
|
+}
|
|
+
|
|
+static void mptcp_wvegas_pkts_acked(struct sock *sk,
|
|
+ const struct ack_sample *sample)
|
|
+{
|
|
+ struct wvegas *wvegas = inet_csk_ca(sk);
|
|
+ u32 vrtt;
|
|
+
|
|
+ if (sample->rtt_us < 0)
|
|
+ return;
|
|
+
|
|
+ vrtt = sample->rtt_us + 1;
|
|
+
|
|
+ if (vrtt < wvegas->base_rtt)
|
|
+ wvegas->base_rtt = vrtt;
|
|
+
|
|
+ wvegas->sampled_rtt += vrtt;
|
|
+ wvegas->cnt_rtt++;
|
|
+}
|
|
+
|
|
+static void mptcp_wvegas_state(struct sock *sk, u8 ca_state)
|
|
+{
|
|
+ if (ca_state == TCP_CA_Open)
|
|
+ wvegas_enable(sk);
|
|
+ else
|
|
+ wvegas_disable(sk);
|
|
+}
|
|
+
|
|
+static void mptcp_wvegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
|
|
+{
|
|
+ if (event == CA_EVENT_CWND_RESTART) {
|
|
+ mptcp_wvegas_init(sk);
|
|
+ } else if (event == CA_EVENT_LOSS) {
|
|
+ struct wvegas *wvegas = inet_csk_ca(sk);
|
|
+ wvegas->instant_rate = 0;
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline u32 mptcp_wvegas_ssthresh(const struct tcp_sock *tp)
|
|
+{
|
|
+ return min(tp->snd_ssthresh, tp->snd_cwnd);
|
|
+}
|
|
+
|
|
+static u64 mptcp_wvegas_weight(const struct mptcp_cb *mpcb, const struct sock *sk)
|
|
+{
|
|
+ u64 total_rate = 0;
|
|
+ const struct wvegas *wvegas = inet_csk_ca(sk);
|
|
+ struct mptcp_tcp_sock *mptcp;
|
|
+
|
|
+ if (!mpcb)
|
|
+ return wvegas->weight;
|
|
+
|
|
+
|
|
+ mptcp_for_each_sub(mpcb, mptcp) {
|
|
+ struct sock *sub_sk = mptcp_to_sock(mptcp);
|
|
+ struct wvegas *sub_wvegas = inet_csk_ca(sub_sk);
|
|
+
|
|
+ /* sampled_rtt is initialized to 0 */
|
|
+ if (mptcp_sk_can_send(sub_sk) && (sub_wvegas->sampled_rtt > 0))
|
|
+ total_rate += sub_wvegas->instant_rate;
|
|
+ }
|
|
+
|
|
+ if (total_rate && wvegas->instant_rate)
|
|
+ return div64_u64(mptcp_wvegas_scale(wvegas->instant_rate, MPTCP_WVEGAS_SCALE), total_rate);
|
|
+ else
|
|
+ return wvegas->weight;
|
|
+}
|
|
+
|
|
+static void mptcp_wvegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
|
|
+{
|
|
+ struct tcp_sock *tp = tcp_sk(sk);
|
|
+ struct wvegas *wvegas = inet_csk_ca(sk);
|
|
+
|
|
+ if (!wvegas->doing_wvegas_now) {
|
|
+ tcp_reno_cong_avoid(sk, ack, acked);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (after(ack, wvegas->beg_snd_nxt)) {
|
|
+ wvegas->beg_snd_nxt = tp->snd_nxt;
|
|
+
|
|
+ if (wvegas->cnt_rtt <= 2) {
|
|
+ tcp_reno_cong_avoid(sk, ack, acked);
|
|
+ } else {
|
|
+ u32 rtt, diff, q_delay;
|
|
+ u64 target_cwnd;
|
|
+
|
|
+ rtt = wvegas->sampled_rtt / wvegas->cnt_rtt;
|
|
+ target_cwnd = div_u64(((u64)tp->snd_cwnd * wvegas->base_rtt), rtt);
|
|
+
|
|
+ diff = div_u64((u64)tp->snd_cwnd * (rtt - wvegas->base_rtt), rtt);
|
|
+
|
|
+ if (diff > gamma && tcp_in_slow_start(tp)) {
|
|
+ tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
|
|
+ tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
|
|
+
|
|
+ } else if (tcp_in_slow_start(tp)) {
|
|
+ tcp_slow_start(tp, acked);
|
|
+ } else {
|
|
+ if (diff >= wvegas->alpha) {
|
|
+ wvegas->instant_rate = mptcp_wvegas_rate(tp->snd_cwnd, rtt);
|
|
+ wvegas->weight = mptcp_wvegas_weight(tp->mpcb, sk);
|
|
+ wvegas->alpha = max(2U, (u32)((wvegas->weight * total_alpha) >> MPTCP_WVEGAS_SCALE));
|
|
+ }
|
|
+ if (diff > wvegas->alpha) {
|
|
+ tp->snd_cwnd--;
|
|
+ tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
|
|
+ } else if (diff < wvegas->alpha) {
|
|
+ tp->snd_cwnd++;
|
|
+ }
|
|
+
|
|
+ /* Try to drain the link queue if needed */
|
|
+ q_delay = rtt - wvegas->base_rtt;
|
|
+ if ((wvegas->queue_delay == 0) || (wvegas->queue_delay > q_delay))
|
|
+ wvegas->queue_delay = q_delay;
|
|
+
|
|
+ if (q_delay >= 2 * wvegas->queue_delay) {
|
|
+ u32 backoff_factor = div_u64(mptcp_wvegas_scale(wvegas->base_rtt, MPTCP_WVEGAS_SCALE), 2 * rtt);
|
|
+ tp->snd_cwnd = ((u64)tp->snd_cwnd * backoff_factor) >> MPTCP_WVEGAS_SCALE;
|
|
+ wvegas->queue_delay = 0;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (tp->snd_cwnd < 2)
|
|
+ tp->snd_cwnd = 2;
|
|
+ else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
|
|
+ tp->snd_cwnd = tp->snd_cwnd_clamp;
|
|
+
|
|
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
|
|
+ }
|
|
+
|
|
+ wvegas->cnt_rtt = 0;
|
|
+ wvegas->sampled_rtt = 0;
|
|
+ }
|
|
+ /* Use normal slow start */
|
|
+ else if (tcp_in_slow_start(tp))
|
|
+ tcp_slow_start(tp, acked);
|
|
+}
|
|
+
|
|
+
|
|
+static struct tcp_congestion_ops mptcp_wvegas __read_mostly = {
|
|
+ .init = mptcp_wvegas_init,
|
|
+ .ssthresh = tcp_reno_ssthresh,
|
|
+ .cong_avoid = mptcp_wvegas_cong_avoid,
|
|
+ .undo_cwnd = tcp_reno_undo_cwnd,
|
|
+ .pkts_acked = mptcp_wvegas_pkts_acked,
|
|
+ .set_state = mptcp_wvegas_state,
|
|
+ .cwnd_event = mptcp_wvegas_cwnd_event,
|
|
+
|
|
+ .owner = THIS_MODULE,
|
|
+ .name = "wvegas",
|
|
+};
|
|
+
|
|
+static int __init mptcp_wvegas_register(void)
|
|
+{
|
|
+ BUILD_BUG_ON(sizeof(struct wvegas) > ICSK_CA_PRIV_SIZE);
|
|
+ tcp_register_congestion_control(&mptcp_wvegas);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void __exit mptcp_wvegas_unregister(void)
|
|
+{
|
|
+ tcp_unregister_congestion_control(&mptcp_wvegas);
|
|
+}
|
|
+
|
|
+module_init(mptcp_wvegas_register);
|
|
+module_exit(mptcp_wvegas_unregister);
|
|
+
|
|
+MODULE_AUTHOR("Yu Cao, Enhuan Dong");
|
|
+MODULE_LICENSE("GPL");
|
|
+MODULE_DESCRIPTION("MPTCP wVegas");
|
|
+MODULE_VERSION("0.1");
|
|
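The congestion-avoidance step of wVegas is easiest to follow with numbers. The sketch below uses illustrative values only and plain floating point in place of the kernel's 2^16 fixed-point weight: diff estimates how many of this subflow's packets are sitting in network queues, and alpha is this subflow's share of total_alpha scaled by its rate share, as in mptcp_wvegas_cong_avoid() above.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t cwnd = 20;		/* packets currently allowed in flight */
	uint64_t base_rtt = 40000;	/* us, minimum RTT ever sampled */
	uint64_t rtt = 60000;		/* us, mean RTT over the last RTT */
	unsigned int total_alpha = 10;	/* module parameter */
	double weight = 0.4;		/* this subflow carries 40% of the total rate */

	uint64_t target_cwnd = cwnd * base_rtt / rtt;		/* 13 */
	uint64_t diff = cwnd * (rtt - base_rtt) / rtt;		/* 6: pkts queued in the network */
	unsigned int alpha = (unsigned int)(weight * total_alpha); /* 4 */

	if (alpha < 2)
		alpha = 2;		/* the kernel clamps alpha to at least 2 */

	printf("target_cwnd=%llu diff=%llu alpha=%u -> %s\n",
	       (unsigned long long)target_cwnd, (unsigned long long)diff, alpha,
	       diff > alpha ? "decrease cwnd by 1" :
	       diff < alpha ? "increase cwnd by 1" : "hold");
	return 0;
}

With these numbers diff (6) exceeds alpha (4), so the window is reduced by one packet; had diff been below alpha it would have grown by one, matching the snd_cwnd--/snd_cwnd++ branches above.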
diff --git a/net/socket.c b/net/socket.c
|
|
index 94358566c9d1..a26eeeda2b4d 100644
|
|
--- a/net/socket.c
|
|
+++ b/net/socket.c
|
|
@@ -91,6 +91,7 @@
|
|
#include <asm/unistd.h>
|
|
|
|
#include <net/compat.h>
|
|
+#include <net/mptcp.h>
|
|
#include <net/wext.h>
|
|
#include <net/cls_cgroup.h>
|
|
|
|
@@ -1339,6 +1340,7 @@ int __sock_create(struct net *net, int family, int type, int protocol,
|
|
int err;
|
|
struct socket *sock;
|
|
const struct net_proto_family *pf;
|
|
+ int old_protocol = protocol;
|
|
|
|
/*
|
|
* Check protocol is in range
|
|
@@ -1359,6 +1361,9 @@ int __sock_create(struct net *net, int family, int type, int protocol,
|
|
family = PF_PACKET;
|
|
}
|
|
|
|
+ if (old_protocol == IPPROTO_MPTCP)
|
|
+ protocol = IPPROTO_TCP;
|
|
+
|
|
err = security_socket_create(family, type, protocol, kern);
|
|
if (err)
|
|
return err;
|
|
@@ -1408,6 +1413,10 @@ int __sock_create(struct net *net, int family, int type, int protocol,
|
|
if (err < 0)
|
|
goto out_module_put;
|
|
|
|
+ if (sysctl_mptcp_enabled && old_protocol == IPPROTO_MPTCP &&
|
|
+ type == SOCK_STREAM && (family == AF_INET || family == AF_INET6))
|
|
+ mptcp_enable_sock(sock->sk);
|
|
+
|
|
/*
|
|
* Now to bump the refcnt of the [loadable] module that owns this
|
|
* socket at sock_release time we decrement its refcnt.
|
|
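From userspace, the effect of the two hunks above is that a SOCK_STREAM socket requested with IPPROTO_MPTCP comes back as a TCP socket with MPTCP enabled, subject to sysctl_mptcp_enabled. A minimal sketch, assuming headers that define IPPROTO_MPTCP; the fallback value below is the mainline uapi one and may need to match this tree's definition.

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262	/* mainline uapi value; adjust if this tree defines it differently */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);

	if (fd < 0) {
		/* Unpatched kernels reject the protocol (EPROTONOSUPPORT). */
		fprintf(stderr, "MPTCP socket not available: %s\n", strerror(errno));
		return 1;
	}

	/* From here on the descriptor is used exactly like a TCP socket. */
	close(fd);
	return 0;
}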
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
|
|
index 63038eb23560..7150eb62db86 100644
|
|
--- a/tools/include/uapi/linux/bpf.h
|
|
+++ b/tools/include/uapi/linux/bpf.h
|
|
@@ -3438,6 +3438,7 @@ enum {
|
|
BPF_TCP_LISTEN,
|
|
BPF_TCP_CLOSING, /* Now a valid state */
|
|
BPF_TCP_NEW_SYN_RECV,
|
|
+ BPF_TCP_RST_WAIT,
|
|
|
|
BPF_TCP_MAX_STATES /* Leave at the end! */
|
|
};
|