diff --git a/build.sh b/build.sh old mode 100755 new mode 100644 index e18af1cc..aa5ba54f --- a/build.sh +++ b/build.sh @@ -1,7 +1,7 @@ #!/bin/sh # # Copyright (C) 2017 OVH OverTheBox -# Copyright (C) 2017-2020 Ycarus (Yannick Chabanois) for OpenMPTCProuter project +# Copyright (C) 2017-2022 Ycarus (Yannick Chabanois) for OpenMPTCProuter project # # This is free software, licensed under the GNU General Public License v3. # See /LICENSE for more information. @@ -132,7 +132,7 @@ if [ "$OMR_KEEPBIN" = "no" ]; then fi rm -rf "$OMR_TARGET/${OMR_KERNEL}/source/files" "$OMR_TARGET/${OMR_KERNEL}/source/tmp" #rm -rf "$OMR_TARGET/${OMR_KERNEL}/source/target/linux/mediatek/patches-4.14" -rm -rf "$OMR_TARGET/${OMR_KERNEL}/source/target/linux/mediatek/patches-5.4" +#rm -rf "$OMR_TARGET/${OMR_KERNEL}/source/target/linux/mediatek/patches-5.4" rm -rf "$OMR_TARGET/${OMR_KERNEL}/source/package/boot/uboot-mediatek" rm -rf "$OMR_TARGET/${OMR_KERNEL}/source/package/boot/arm-trusted-firmware-mediatek" rm -rf "$OMR_TARGET/${OMR_KERNEL}/source/tools/firmware-utils" @@ -365,7 +365,7 @@ echo "Done" #echo "Done" # Add BBR2 patch, only working on 64bits images for now -if [ "$OMR_KERNEL" = "5.4" ] && ([ "$OMR_TARGET" = "x86_64" ] || [ "$OMR_TARGET" = "bpi-r64" ] || [ "$OMR_TARGET" = "rpi4" ] || [ "$OMR_TARGET" = "espressobin" ] || [ "$OMR_TARGET" = "r2s" ] || [ "$OMR_TARGET" = "r4s" ] || [ "$OMR_TARGET" = "rpi3" ]); then +if ([ "$OMR_KERNEL" = "5.4" ] || [ "$OMR_KERNEL" = "5.4" ]) && ([ "$OMR_TARGET" = "x86_64" ] || [ "$OMR_TARGET" = "bpi-r64" ] || [ "$OMR_TARGET" = "rpi4" ] || [ "$OMR_TARGET" = "espressobin" ] || [ "$OMR_TARGET" = "r2s" ] || [ "$OMR_TARGET" = "r4s" ] || [ "$OMR_TARGET" = "rpi3" ]); then echo "Checking if BBRv2 patch is set or not" if ! patch -Rf -N -p1 -s --dry-run < ../../../patches/bbr2.patch; then echo "apply..." @@ -373,6 +373,11 @@ if [ "$OMR_KERNEL" = "5.4" ] && ([ "$OMR_TARGET" = "x86_64" ] || [ "$OMR_TARGET" fi echo "Done" fi +if [ "$OMR_KERNEL" = "5.15" ] && ([ "$OMR_TARGET" = "x86_64" ] || [ "$OMR_TARGET" = "bpi-r64" ] || [ "$OMR_TARGET" = "rpi4" ] || [ "$OMR_TARGET" = "espressobin" ] || [ "$OMR_TARGET" = "r2s" ] || [ "$OMR_TARGET" = "r4s" ] || [ "$OMR_TARGET" = "rpi3" ]); then + echo "Checking if BBRv2 patch is set or not" + cp ../../../patches/bbr2-5.15.patch target/linux/generic/hack-5.15/693-tcp_bbr2.patch + echo "Done" +fi echo "Checking if smsc75xx patch is set or not" if ! patch -Rf -N -p1 -s --dry-run < ../../../patches/smsc75xx.patch; then @@ -478,6 +483,10 @@ fi #if [ -f target/linux/generic/pending-5.4/770-16-net-ethernet-mediatek-mtk_eth_soc-add-flow-offloadin.patch ]; then # rm -f target/linux/generic/pending-5.4/770-16-net-ethernet-mediatek-mtk_eth_soc-add-flow-offloadin.patch #fi +#if [ -f target/linux/generic/pending-5.15/850-0023-PCI-aardvark-Make-main-irq_chip-structure-a-static-d.patch ]; then +# rm -f target/linux/generic/pending-5.15/850-0023-PCI-aardvark-Make-main-irq_chip-structure-a-static-d.patch +#fi + if [ "$OMR_KERNEL" = "5.4" ]; then echo "Set to kernel 5.4 for rpi arch" diff --git a/patches/bbr2-5.15.patch b/patches/bbr2-5.15.patch new file mode 100644 index 00000000..ae33db4b --- /dev/null +++ b/patches/bbr2-5.15.patch @@ -0,0 +1,3408 @@ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index b06c2d02ec84e..2cdc5a0709fe0 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -134,8 +134,9 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++/* XXX inflated by temporary internal debugging info */ ++#define ICSK_CA_PRIV_SIZE (216) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 82389ead6304e..fa924be339a98 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -371,6 +371,7 @@ static inline void tcp_dec_quickack_mode(struct sock *sk, + #define TCP_ECN_QUEUE_CWR 2 + #define TCP_ECN_DEMAND_CWR 4 + #define TCP_ECN_SEEN 8 ++#define TCP_ECN_ECT_PERMANENT 16 + + enum tcp_tw_status { + TCP_TW_SUCCESS = 0, +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 20ee93f0f8761..96d52dd9c48ac 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -231,9 +231,42 @@ struct tcp_bbr_info { + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ + }; + ++/* Phase as reported in netlink/ss stats. */ ++enum tcp_bbr2_phase { ++ BBR2_PHASE_INVALID = 0, ++ BBR2_PHASE_STARTUP = 1, ++ BBR2_PHASE_DRAIN = 2, ++ BBR2_PHASE_PROBE_RTT = 3, ++ BBR2_PHASE_PROBE_BW_UP = 4, ++ BBR2_PHASE_PROBE_BW_DOWN = 5, ++ BBR2_PHASE_PROBE_BW_CRUISE = 6, ++ BBR2_PHASE_PROBE_BW_REFILL = 7 ++}; ++ ++struct tcp_bbr2_info { ++ /* u64 bw: bandwidth (app throughput) estimate in Byte per sec: */ ++ __u32 bbr_bw_lsb; /* lower 32 bits of bw */ ++ __u32 bbr_bw_msb; /* upper 32 bits of bw */ ++ __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ ++ __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ ++ __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* MUST be at this offset in struct */ ++ __u32 bbr_inflight_lo; /* lower/short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher/long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ + union tcp_cc_info { + struct tcpvegas_info vegas; + struct tcp_dctcp_info dctcp; + struct tcp_bbr_info bbr; ++ struct tcp_bbr2_info bbr2; + }; + #endif /* _UAPI_INET_DIAG_H_ */ +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 87983e70f03f3..a833a7a67ce79 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -669,6 +669,24 @@ config TCP_CONG_BBR + AQM schemes that do not provide a delay signal. It requires the fq + ("Fair Queue") pacing packet scheduler. + ++config TCP_CONG_BBR2 ++ tristate "BBR2 TCP" ++ default n ++ help ++ ++ BBR2 TCP congestion control is a model-based congestion control ++ algorithm that aims to maximize network utilization, keep queues and ++ retransmit rates low, and to be able to coexist with Reno/CUBIC in ++ common scenarios. It builds an explicit model of the network path. It ++ tolerates a targeted degree of random packet loss and delay that are ++ unrelated to congestion. It can operate over LAN, WAN, cellular, wifi, ++ or cable modem links, and can use DCTCP-L4S-style ECN signals. It can ++ coexist with flows that use loss-based congestion control, and can ++ operate with shallow buffers, deep buffers, bufferbloat, policers, or ++ AQM schemes that do not provide a delay signal. It requires pacing, ++ using either TCP internal pacing or the fq ("Fair Queue") pacing packet ++ scheduler. ++ + choice + prompt "Default TCP congestion control" + default DEFAULT_CUBIC +@@ -706,6 +724,9 @@ choice + config DEFAULT_BBR + bool "BBR" if TCP_CONG_BBR=y + ++ config DEFAULT_BBR2 ++ bool "BBR2" if TCP_CONG_BBR2=y ++ + config DEFAULT_RENO + bool "Reno" + endchoice +@@ -730,6 +751,7 @@ config DEFAULT_TCP_CONG + default "dctcp" if DEFAULT_DCTCP + default "cdg" if DEFAULT_CDG + default "bbr" if DEFAULT_BBR ++ default "bbr2" if DEFAULT_BBR2 + default "cubic" + + config TCP_MD5SIG +diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile +index bbdd9c44f14e3..8dee1547d8202 100644 +--- a/net/ipv4/Makefile ++++ b/net/ipv4/Makefile +@@ -46,6 +46,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o + obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o + obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o + obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o ++obj-$(CONFIG_TCP_CONG_BBR2) += tcp_bbr2.o + obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o + obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o + obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o +diff --git a/net/ipv4/tcp_bbr2.c b/net/ipv4/tcp_bbr2.c +new file mode 100644 +index 0000000000000..fa49e17c47ca9 +--- /dev/null ++++ b/net/ipv4/tcp_bbr2.c +@@ -0,0 +1,2674 @@ ++/* BBR (Bottleneck Bandwidth and RTT) congestion control, v2 ++ * ++ * BBRv2 is a model-based congestion control algorithm that aims for low ++ * queues, low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model ++ * of the network path, it uses measurements of bandwidth and RTT, as well as ++ * (if they occur) packet loss and/or DCTCP/L4S-style ECN signals. Note that ++ * although it can use ECN or loss signals explicitly, it does not require ++ * either; it can bound its in-flight data based on its estimate of the BDP. ++ * ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. ++ * ++ * Here is a state transition diagram for BBR: ++ * ++ * | ++ * V ++ * +---> STARTUP ----+ ++ * | | | ++ * | V | ++ * | DRAIN ----+ ++ * | | | ++ * | V | ++ * +---> PROBE_BW ----+ ++ * | ^ | | ++ * | | | | ++ * | +----+ | ++ * | | ++ * +---- PROBE_RTT <--+ ++ * ++ * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. ++ * When it estimates the pipe is full, it enters DRAIN to drain the queue. ++ * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. ++ * A long-lived BBR flow spends the vast majority of its time remaining ++ * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth ++ * in a fair manner, with a small, bounded queue. *If* a flow has been ++ * continuously sending for the entire min_rtt window, and hasn't seen an RTT ++ * sample that matches or decreases its min_rtt estimate for 10 seconds, then ++ * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe ++ * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if ++ * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; ++ * otherwise we enter STARTUP to try to fill the pipe. ++ * ++ * BBR is described in detail in: ++ * "BBR: Congestion-Based Congestion Control", ++ * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, ++ * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. ++ * ++ * There is a public e-mail list for discussing BBR development and testing: ++ * https://groups.google.com/forum/#!forum/bbr-dev ++ * ++ * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled, ++ * otherwise TCP stack falls back to an internal pacing using one high ++ * resolution timer per TCP socket and may use more resources. ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++#include "tcp_dctcp.h" ++ ++/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth ++ * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. ++ * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. ++ * Since the minimum window is >=4 packets, the lower bound isn't ++ * an issue. The upper bound isn't an issue with existing technologies. ++ */ ++#define BW_SCALE 24 ++#define BW_UNIT (1 << BW_SCALE) ++ ++#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */ ++#define BBR_UNIT (1 << BBR_SCALE) ++ ++#define FLAG_DEBUG_VERBOSE 0x1 /* Verbose debugging messages */ ++#define FLAG_DEBUG_LOOPBACK 0x2 /* Do NOT skip loopback addr */ ++ ++#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++ ++/* BBR has the following modes for deciding how fast to send: */ ++enum bbr_mode { ++ BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ ++ BBR_DRAIN, /* drain any queue created during startup */ ++ BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ ++ BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ ++}; ++ ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ ++/* BBR congestion control block */ ++struct bbr { ++ u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ ++ u32 min_rtt_stamp; /* timestamp of min_rtt_us */ ++ u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ ++ u32 probe_rtt_min_us; /* min RTT in bbr_probe_rtt_win_ms window */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ ++ u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u64 cycle_mstamp; /* time of this cycle phase start */ ++ u32 mode:3, /* current bbr_mode in state machine */ ++ prev_ca_state:3, /* CA state on previous ACK */ ++ packet_conservation:1, /* use packet conservation? */ ++ round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ ++ unused2:11, ++ idle_restart:1, /* restarting after idle? */ ++ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ ++ cycle_idx:3, /* current index in pacing_gain cycle array */ ++ has_seen_rtt:1; /* have we seen an RTT sample yet? */ ++ u32 pacing_gain:11, /* current gain for setting pacing rate */ ++ cwnd_gain:11, /* current gain for setting cwnd */ ++ full_bw_reached:1, /* reached full bw in Startup? */ ++ full_bw_cnt:2, /* number of rounds without large bw gains */ ++ init_cwnd:7; /* initial cwnd */ ++ u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ ++ u32 full_bw; /* recent bw, to estimate if pipe is full */ ++ ++ /* For tracking ACK aggregation: */ ++ u64 ack_epoch_mstamp; /* start of ACK sampling epoch */ ++ u16 extra_acked[2]; /* max excess data ACKed in epoch */ ++ u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ ++ extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ ++ extra_acked_win_idx:1, /* current index in extra_acked array */ ++ /* BBR v2 state: */ ++ unused1:2, ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1; /* ECN in this cycle? */ ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* upper bound of sending bandwidth range*/ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ /* Params configurable using setsockopt. Refer to correspoding ++ * module param for detailed description of params. ++ */ ++ struct bbr_params { ++ u32 high_gain:11, /* max allowed value: 2047 */ ++ drain_gain:10, /* max allowed value: 1023 */ ++ cwnd_gain:11; /* max allowed value: 2047 */ ++ u32 cwnd_min_target:4, /* max allowed value: 15 */ ++ min_rtt_win_sec:5, /* max allowed value: 31 */ ++ probe_rtt_mode_ms:9, /* max allowed value: 511 */ ++ full_bw_cnt:3, /* max allowed value: 7 */ ++ cwnd_tso_budget:1, /* allowed values: {0, 1} */ ++ unused3:6, ++ drain_to_target:1, /* boolean */ ++ precise_ece_ack:1, /* boolean */ ++ extra_acked_in_startup:1, /* allowed values: {0, 1} */ ++ fast_path:1; /* boolean */ ++ u32 full_bw_thresh:10, /* max allowed value: 1023 */ ++ startup_cwnd_gain:11, /* max allowed value: 2047 */ ++ bw_probe_pif_gain:9, /* max allowed value: 511 */ ++ usage_based_cwnd:1, /* boolean */ ++ unused2:1; ++ u16 probe_rtt_win_ms:14, /* max allowed value: 16383 */ ++ refill_add_inc:2; /* max allowed value: 3 */ ++ u16 extra_acked_gain:11, /* max allowed value: 2047 */ ++ extra_acked_win_rtts:5; /* max allowed value: 31*/ ++ u16 pacing_gain[CYCLE_LEN]; /* max allowed value: 1023 */ ++ /* Mostly BBR v2 parameters below here: */ ++ u32 ecn_alpha_gain:8, /* max allowed value: 255 */ ++ ecn_factor:8, /* max allowed value: 255 */ ++ ecn_thresh:8, /* max allowed value: 255 */ ++ beta:8; /* max allowed value: 255 */ ++ u32 ecn_max_rtt_us:19, /* max allowed value: 524287 */ ++ bw_probe_reno_gain:9, /* max allowed value: 511 */ ++ full_loss_cnt:4; /* max allowed value: 15 */ ++ u32 probe_rtt_cwnd_gain:8, /* max allowed value: 255 */ ++ inflight_headroom:8, /* max allowed value: 255 */ ++ loss_thresh:8, /* max allowed value: 255 */ ++ bw_probe_max_rounds:8; /* max allowed value: 255 */ ++ u32 bw_probe_rand_rounds:4, /* max allowed value: 15 */ ++ bw_probe_base_us:26, /* usecs: 0..2^26-1 (67 secs) */ ++ full_ecn_cnt:2; /* max allowed value: 3 */ ++ u32 bw_probe_rand_us:26, /* usecs: 0..2^26-1 (67 secs) */ ++ undo:1, /* boolean */ ++ tso_rtt_shift:4, /* max allowed value: 15 */ ++ unused5:1; ++ u32 ecn_reprobe_gain:9, /* max allowed value: 511 */ ++ unused1:14, ++ ecn_alpha_init:9; /* max allowed value: 256 */ ++ } params; ++ ++ struct { ++ u32 snd_isn; /* Initial sequence number */ ++ u32 rs_bw; /* last valid rate sample bw */ ++ u32 target_cwnd; /* target cwnd, based on BDP */ ++ u8 undo:1, /* Undo even happened but not yet logged */ ++ unused:7; ++ char event; /* single-letter event debug codes */ ++ u16 unused2; ++ } debug; ++}; ++ ++struct bbr_context { ++ u32 sample_bw; ++ u32 target_cwnd; ++ u32 log:1; ++}; ++ ++/* Window length of min_rtt filter (in sec). Max allowed value is 31 (0x1F) */ ++static u32 bbr_min_rtt_win_sec = 10; ++/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode. ++ * Max allowed value is 511 (0x1FF). ++ */ ++static u32 bbr_probe_rtt_mode_ms = 200; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static u32 bbr_probe_rtt_win_ms = 5000; ++/* Skip TSO below the following bandwidth (bits/sec): */ ++static int bbr_min_tso_rate = 1200000; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. By default we cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static u32 bbr_tso_rtt_shift = 9; /* halve allowance per 2^9 usecs, 512us */ ++ ++/* Select cwnd TSO budget approach: ++ * 0: padding ++ * 1: flooring ++ */ ++static uint bbr_cwnd_tso_budget = 1; ++ ++/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. ++ * In order to help drive the network toward lower queues and low latency while ++ * maintaining high utilization, the average pacing rate aims to be slightly ++ * lower than the estimated bandwidth. This is an important aspect of the ++ * design. ++ */ ++static const int bbr_pacing_margin_percent = 1; ++ ++/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++ * that will allow a smoothly increasing pacing rate that will double each RTT ++ * and send the same number of packets per RTT that an un-paced, slow-starting ++ * Reno or CUBIC flow would. Max allowed value is 2047 (0x7FF). ++ */ ++static int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; ++/* The gain for deriving startup cwnd. Max allowed value is 2047 (0x7FF). */ ++static int bbr_startup_cwnd_gain = BBR_UNIT * 2885 / 1000 + 1; ++/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++ * the queue created in BBR_STARTUP in a single round. Max allowed value ++ * is 1023 (0x3FF). ++ */ ++static int bbr_drain_gain = BBR_UNIT * 1000 / 2885; ++/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs. ++ * Max allowed value is 2047 (0x7FF). ++ */ ++static int bbr_cwnd_gain = BBR_UNIT * 2; ++/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw. ++ * Max allowed value for each element is 1023 (0x3FF). ++ */ ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ ++}; ++static int bbr_pacing_gain[] = { ++ BBR_UNIT * 5 / 4, /* probe for more available bw */ ++ BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ ++ BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ ++ BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++}; ++ ++/* Try to keep at least this many packets in flight, if things go smoothly. For ++ * smooth functioning, a sliding window protocol ACKing every other packet ++ * needs at least 4 packets in flight. Max allowed value is 15 (0xF). ++ */ ++static u32 bbr_cwnd_min_target = 4; ++ ++/* Cwnd to BDP proportion in PROBE_RTT mode scaled by BBR_UNIT. Default: 50%. ++ * Use 0 to disable. Max allowed value is 255. ++ */ ++static u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* If bw has increased significantly (1.25x), there may be more bw available. ++ * Max allowed value is 1023 (0x3FF). ++ */ ++static u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; ++/* But after 3 rounds w/o significant bw growth, estimate pipe is full. ++ * Max allowed value is 7 (0x7). ++ */ ++static u32 bbr_full_bw_cnt = 3; ++ ++static u32 bbr_flags; /* Debugging related stuff */ ++ ++/* Whether to debug using printk. ++ */ ++static bool bbr_debug_with_printk; ++ ++/* Whether to debug using ftrace event tcp:tcp_bbr_event. ++ * Ignored when bbr_debug_with_printk is set. ++ */ ++static bool bbr_debug_ftrace; ++ ++/* Experiment: each cycle, try to hold sub-unity gain until inflight <= BDP. */ ++static bool bbr_drain_to_target = true; /* default: enabled */ ++ ++/* Experiment: Flags to control BBR with ECN behavior. ++ */ ++static bool bbr_precise_ece_ack = true; /* default: enabled */ ++ ++/* The max rwin scaling shift factor is 14 (RFC 1323), so the max sane rwin is ++ * (2^(16+14) B)/(1024 B/packet) = 1M packets. ++ */ ++static u32 bbr_cwnd_warn_val = 1U << 20; ++ ++static u16 bbr_debug_port_mask; ++ ++/* BBR module parameters. These are module parameters only in Google prod. ++ * Upstream these are intentionally not module parameters. ++ */ ++static int bbr_pacing_gain_size = CYCLE_LEN; ++ ++/* Gain factor for adding extra_acked to target cwnd: */ ++static int bbr_extra_acked_gain = 256; ++ ++/* Window length of extra_acked window. Max allowed val is 31. */ ++static u32 bbr_extra_acked_win_rtts = 5; ++ ++/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */ ++static u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; ++ ++/* Time period for clamping cwnd increment due to ack aggregation */ ++static u32 bbr_extra_acked_max_us = 100 * 1000; ++ ++/* Use extra acked in startup ? ++ * 0: disabled ++ * 1: use latest extra_acked value from 1-2 rtt in startup ++ */ ++static int bbr_extra_acked_in_startup = 1; /* default: enabled */ ++ ++/* Experiment: don't grow cwnd beyond twice of what we just probed. */ ++static bool bbr_usage_based_cwnd; /* default: disabled */ ++ ++/* For lab testing, researchers can enable BBRv2 ECN support with this flag, ++ * when they know that any ECN marks that the connections experience will be ++ * DCTCP/L4S-style ECN marks, rather than RFC3168 ECN marks. ++ * TODO(ncardwell): Production use of the BBRv2 ECN functionality depends on ++ * negotiation or configuration that is outside the scope of the BBRv2 ++ * alpha release. ++ */ ++static bool bbr_ecn_enable = false; ++ ++module_param_named(min_tso_rate, bbr_min_tso_rate, int, 0644); ++module_param_named(tso_rtt_shift, bbr_tso_rtt_shift, int, 0644); ++module_param_named(high_gain, bbr_high_gain, int, 0644); ++module_param_named(drain_gain, bbr_drain_gain, int, 0644); ++module_param_named(startup_cwnd_gain, bbr_startup_cwnd_gain, int, 0644); ++module_param_named(cwnd_gain, bbr_cwnd_gain, int, 0644); ++module_param_array_named(pacing_gain, bbr_pacing_gain, int, ++ &bbr_pacing_gain_size, 0644); ++module_param_named(cwnd_min_target, bbr_cwnd_min_target, uint, 0644); ++module_param_named(probe_rtt_cwnd_gain, ++ bbr_probe_rtt_cwnd_gain, uint, 0664); ++module_param_named(cwnd_warn_val, bbr_cwnd_warn_val, uint, 0664); ++module_param_named(debug_port_mask, bbr_debug_port_mask, ushort, 0644); ++module_param_named(flags, bbr_flags, uint, 0644); ++module_param_named(debug_ftrace, bbr_debug_ftrace, bool, 0644); ++module_param_named(debug_with_printk, bbr_debug_with_printk, bool, 0644); ++module_param_named(min_rtt_win_sec, bbr_min_rtt_win_sec, uint, 0644); ++module_param_named(probe_rtt_mode_ms, bbr_probe_rtt_mode_ms, uint, 0644); ++module_param_named(probe_rtt_win_ms, bbr_probe_rtt_win_ms, uint, 0644); ++module_param_named(full_bw_thresh, bbr_full_bw_thresh, uint, 0644); ++module_param_named(full_bw_cnt, bbr_full_bw_cnt, uint, 0644); ++module_param_named(cwnd_tso_bduget, bbr_cwnd_tso_budget, uint, 0664); ++module_param_named(extra_acked_gain, bbr_extra_acked_gain, int, 0664); ++module_param_named(extra_acked_win_rtts, ++ bbr_extra_acked_win_rtts, uint, 0664); ++module_param_named(extra_acked_max_us, ++ bbr_extra_acked_max_us, uint, 0664); ++module_param_named(ack_epoch_acked_reset_thresh, ++ bbr_ack_epoch_acked_reset_thresh, uint, 0664); ++module_param_named(drain_to_target, bbr_drain_to_target, bool, 0664); ++module_param_named(precise_ece_ack, bbr_precise_ece_ack, bool, 0664); ++module_param_named(extra_acked_in_startup, ++ bbr_extra_acked_in_startup, int, 0664); ++module_param_named(usage_based_cwnd, bbr_usage_based_cwnd, bool, 0664); ++module_param_named(ecn_enable, bbr_ecn_enable, bool, 0664); ++ ++static void bbr2_exit_probe_rtt(struct sock *sk); ++static void bbr2_reset_congestion_signals(struct sock *sk); ++ ++static void bbr_check_probe_rtt_done(struct sock *sk); ++ ++/* Do we estimate that STARTUP filled the pipe? */ ++static bool bbr_full_bw_reached(const struct sock *sk) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return bbr->full_bw_reached; ++} ++ ++/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ ++static u32 bbr_max_bw(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); ++} ++ ++/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ ++static u32 bbr_bw(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return min(bbr_max_bw(sk), bbr->bw_lo); ++} ++ ++/* Return maximum extra acked in past k-2k round trips, ++ * where k = bbr_extra_acked_win_rtts. ++ */ ++static u16 bbr_extra_acked(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return max(bbr->extra_acked[0], bbr->extra_acked[1]); ++} ++ ++/* Return rate in bytes per second, optionally with a gain. ++ * The order here is chosen carefully to avoid overflow of u64. This should ++ * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. ++ */ ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) ++{ ++ unsigned int mss = tcp_sk(sk)->mss_cache; ++ ++ rate *= mss; ++ rate *= gain; ++ rate >>= BBR_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); ++} ++ ++static u64 bbr_rate_kbps(struct sock *sk, u64 rate) ++{ ++ rate = bbr_bw_bytes_per_sec(sk, rate); ++ rate *= 8; ++ do_div(rate, 1000); ++ return rate; ++} ++ ++static u32 bbr_tso_segs_goal(struct sock *sk); ++static void bbr_debug(struct sock *sk, u32 acked, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ static const char ca_states[] = { ++ [TCP_CA_Open] = 'O', ++ [TCP_CA_Disorder] = 'D', ++ [TCP_CA_CWR] = 'C', ++ [TCP_CA_Recovery] = 'R', ++ [TCP_CA_Loss] = 'L', ++ }; ++ static const char mode[] = { ++ 'G', /* Growing - BBR_STARTUP */ ++ 'D', /* Drain - BBR_DRAIN */ ++ 'W', /* Window - BBR_PROBE_BW */ ++ 'M', /* Min RTT - BBR_PROBE_RTT */ ++ }; ++ static const char ack_phase[] = { /* bbr_ack_phase strings */ ++ 'I', /* BBR_ACKS_INIT - 'Init' */ ++ 'R', /* BBR_ACKS_REFILLING - 'Refilling' */ ++ 'B', /* BBR_ACKS_PROBE_STARTING - 'Before' */ ++ 'F', /* BBR_ACKS_PROBE_FEEDBACK - 'Feedback' */ ++ 'A', /* BBR_ACKS_PROBE_STOPPING - 'After' */ ++ }; ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 una = tp->snd_una - bbr->debug.snd_isn; ++ const u32 fack = tcp_highest_sack_seq(tp); ++ const u16 dport = ntohs(inet_sk(sk)->inet_dport); ++ bool is_port_match = (bbr_debug_port_mask && ++ ((dport & bbr_debug_port_mask) == 0)); ++ char debugmsg[320]; ++ ++ if (sk->sk_state == TCP_SYN_SENT) ++ return; /* no bbr_init() yet if SYN retransmit -> CA_Loss */ ++ ++ if (!tp->snd_cwnd || tp->snd_cwnd > bbr_cwnd_warn_val) { ++ char addr[INET6_ADDRSTRLEN + 10] = { 0 }; ++ ++ if (sk->sk_family == AF_INET) ++ snprintf(addr, sizeof(addr), "%pI4:%u", ++ &inet_sk(sk)->inet_daddr, dport); ++ else if (sk->sk_family == AF_INET6) ++ snprintf(addr, sizeof(addr), "%pI6:%u", ++ &sk->sk_v6_daddr, dport); ++ ++ WARN_ONCE(1, ++ "BBR %s cwnd alert: %u " ++ "snd_una: %u ca: %d pacing_gain: %u cwnd_gain: %u " ++ "bw: %u rtt: %u min_rtt: %u " ++ "acked: %u tso_segs: %u " ++ "bw: %d %ld %d pif: %u\n", ++ addr, tp->snd_cwnd, ++ una, inet_csk(sk)->icsk_ca_state, ++ bbr->pacing_gain, bbr->cwnd_gain, ++ bbr_max_bw(sk), (tp->srtt_us >> 3), bbr->min_rtt_us, ++ acked, bbr_tso_segs_goal(sk), ++ rs->delivered, rs->interval_us, rs->is_retrans, ++ tcp_packets_in_flight(tp)); ++ } ++ ++ if (likely(!bbr_debug_with_printk && !bbr_debug_ftrace)) ++ return; ++ ++ if (!sock_flag(sk, SOCK_DBG) && !is_port_match) ++ return; ++ ++ if (!ctx->log && !tp->app_limited && !(bbr_flags & FLAG_DEBUG_VERBOSE)) ++ return; ++ ++ if (ipv4_is_loopback(inet_sk(sk)->inet_daddr) && ++ !(bbr_flags & FLAG_DEBUG_LOOPBACK)) ++ return; ++ ++ snprintf(debugmsg, sizeof(debugmsg) - 1, ++ "BBR %pI4:%-5u %5u,%03u:%-7u %c " ++ "%c %2u br %2u cr %2d rtt %5ld d %2d i %5ld mrtt %d %cbw %llu " ++ "bw %llu lb %llu ib %llu qb %llu " ++ "a %u if %2u %c %c dl %u l %u al %u # %u t %u %c %c " ++ "lr %d er %d ea %d bwl %lld il %d ih %d c %d " ++ "v %d %c %u %c %s\n", ++ &inet_sk(sk)->inet_daddr, dport, ++ una / 1000, una % 1000, fack - tp->snd_una, ++ ca_states[inet_csk(sk)->icsk_ca_state], ++ bbr->debug.undo ? '@' : mode[bbr->mode], ++ tp->snd_cwnd, ++ bbr_extra_acked(sk), /* br (legacy): extra_acked */ ++ rs->tx_in_flight, /* cr (legacy): tx_inflight */ ++ rs->rtt_us, ++ rs->delivered, ++ rs->interval_us, ++ bbr->min_rtt_us, ++ rs->is_app_limited ? '_' : 'l', ++ bbr_rate_kbps(sk, ctx->sample_bw), /* lbw: latest sample bw */ ++ bbr_rate_kbps(sk, bbr_max_bw(sk)), /* bw: max bw */ ++ 0ULL, /* lb: [obsolete] */ ++ 0ULL, /* ib: [obsolete] */ ++ (u64)sk->sk_pacing_rate * 8 / 1000, ++ acked, ++ tcp_packets_in_flight(tp), ++ rs->is_ack_delayed ? 'd' : '.', ++ bbr->round_start ? '*' : '.', ++ tp->delivered, tp->lost, ++ tp->app_limited, ++ 0, /* #: [obsolete] */ ++ ctx->target_cwnd, ++ tp->reord_seen ? 'r' : '.', /* r: reordering seen? */ ++ ca_states[bbr->prev_ca_state], ++ (rs->lost + rs->delivered) > 0 ? ++ (1000 * rs->lost / ++ (rs->lost + rs->delivered)) : 0, /* lr: loss rate x1000 */ ++ (rs->delivered) > 0 ? ++ (1000 * rs->delivered_ce / ++ (rs->delivered)) : 0, /* er: ECN rate x1000 */ ++ 1000 * bbr->ecn_alpha >> BBR_SCALE, /* ea: ECN alpha x1000 */ ++ bbr->bw_lo == ~0U ? ++ -1 : (s64)bbr_rate_kbps(sk, bbr->bw_lo), /* bwl */ ++ bbr->inflight_lo, /* il */ ++ bbr->inflight_hi, /* ih */ ++ bbr->bw_probe_up_cnt, /* c */ ++ 2, /* v: version */ ++ bbr->debug.event, ++ bbr->cycle_idx, ++ ack_phase[bbr->ack_phase], ++ bbr->bw_probe_samples ? "Y" : "N"); ++ debugmsg[sizeof(debugmsg) - 1] = 0; ++ ++ /* printk takes a higher precedence. */ ++ if (bbr_debug_with_printk) ++ printk(KERN_DEBUG "%s", debugmsg); ++ ++ if (unlikely(bbr->debug.undo)) ++ bbr->debug.undo = 0; ++} ++ ++/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ ++static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) ++{ ++ u64 rate = bw; ++ ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); ++ rate = min_t(u64, rate, sk->sk_max_pacing_rate); ++ return rate; ++} ++ ++/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++static void bbr_init_pacing_rate_from_rtt(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ u32 rtt_us; ++ ++ if (tp->srtt_us) { /* any RTT sample yet? */ ++ rtt_us = max(tp->srtt_us >> 3, 1U); ++ bbr->has_seen_rtt = 1; ++ } else { /* no RTT sample yet */ ++ rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ ++ } ++ bw = (u64)tp->snd_cwnd * BW_UNIT; ++ do_div(bw, rtt_us); ++ sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr->params.high_gain); ++} ++ ++/* Pace using current bw estimate and a gain factor. */ ++static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain); ++ ++ if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) ++ bbr_init_pacing_rate_from_rtt(sk); ++ if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) ++ sk->sk_pacing_rate = rate; ++} ++ ++static u32 bbr_min_tso_segs(struct sock *sk) ++{ ++ return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; ++} ++ ++/* Return the number of segments BBR would like in a TSO/GSO skb, given ++ * a particular max gso size as a constraint. ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr->params.tso_rtt_shift) { ++ r = bbr->min_rtt_us >> bbr->params.tso_rtt_shift; ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk)); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) ++{ ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); ++} ++ ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ ++static u32 bbr_tso_segs_goal(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); ++} ++ ++/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ ++static void bbr_save_cwnd(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) ++ bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ ++ else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ ++ bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); ++} ++ ++static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (event == CA_EVENT_TX_START && tp->app_limited) { ++ bbr->idle_restart = 1; ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ /* Avoid pointless buffer overflows: pace at est. bw if we don't ++ * need more speed (we're restarting from idle and app-limited). ++ */ ++ if (bbr->mode == BBR_PROBE_BW) ++ bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); ++ else if (bbr->mode == BBR_PROBE_RTT) ++ bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_ecn_enable && ++ bbr->params.precise_ece_ack) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ if (tp->fast_ack_mode == 2 && event == CA_EVENT_ECN_IS_CE) ++ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); ++ } ++} ++ ++/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: ++ * ++ * bdp = ceil(bw * min_rtt * gain) ++ * ++ * The key factor, gain, controls the amount of queue. While a small gain ++ * builds a smaller queue, it becomes more vulnerable to noise in RTT ++ * measurements (e.g., delayed ACKs or other ACK compression effects). This ++ * noise may cause BBR to under-estimate the rate. ++ */ ++static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; ++ u64 w; ++ ++ /* If we've never had a valid RTT sample, cap cwnd at the initial ++ * default. This should only happen when the connection is not using TCP ++ * timestamps and has retransmitted all of the SYN/SYNACK/data packets ++ * ACKed so far. In this case, an RTO can cut cwnd to 1, in which ++ * case we need to slow-start up toward something safe: initial cwnd. ++ */ ++ if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ ++ ++ w = (u64)bw * bbr->min_rtt_us; ++ ++ /* Apply a gain to the given value, remove the BW_SCALE shift, and ++ * round the value up to avoid a negative feedback loop. ++ */ ++ bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; ++ ++ return bdp; ++} ++ ++/* To achieve full performance in high-speed paths, we budget enough cwnd to ++ * fit full-sized skbs in-flight on both end hosts to fully utilize the path: ++ * - one skb in sending host Qdisc, ++ * - one skb in sending host TSO/GSO engine ++ * - one skb being received by receiver host LRO/GRO/delayed-ACK engine ++ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because ++ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * which allows 2 outstanding 2-packet sequences, to try to keep pipe ++ * full even with ACK-every-other-packet delayed ACKs. ++ */ ++static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; ++ ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); ++ ++ /* Allow enough full-sized skbs in flight to utilize end systems. */ ++ if (bbr->params.cwnd_tso_budget == 1) { ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); ++ } else { ++ cwnd += tso_segs_goal; ++ cwnd = (cwnd + 1) & ~1U; ++ } ++ /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ cwnd += 2; ++ ++ return cwnd; ++} ++ ++/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */ ++static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) ++{ ++ u32 inflight; ++ ++ inflight = bbr_bdp(sk, bw, gain); ++ inflight = bbr_quantization_budget(sk, inflight); ++ ++ return inflight; ++} ++ ++/* With pacing at lower layers, there's often less data "in the network" than ++ * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), ++ * we often have several skbs queued in the pacing layer with a pre-scheduled ++ * earliest departure time (EDT). BBR adapts its pacing rate based on the ++ * inflight level that it estimates has already been "baked in" by previous ++ * departure time decisions. We calculate a rough estimate of the number of our ++ * packets that might be in the network at the earliest departure time for the ++ * next skb scheduled: ++ * in_network_at_edt = inflight_at_edt - (EDT - now) * bw ++ * If we're increasing inflight, then we want to know if the transmit of the ++ * EDT skb will push inflight above the target, so inflight_at_edt includes ++ * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight, ++ * then estimate if inflight will sink too low just before the EDT transmit. ++ */ ++static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 now_ns, edt_ns, interval_us; ++ u32 interval_delivered, inflight_at_edt; ++ ++ now_ns = tp->tcp_clock_cache; ++ edt_ns = max(tp->tcp_wstamp_ns, now_ns); ++ interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC); ++ interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE; ++ inflight_at_edt = inflight_now; ++ if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */ ++ inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */ ++ if (interval_delivered >= inflight_at_edt) ++ return 0; ++ return inflight_at_edt - interval_delivered; ++} ++ ++/* Find the cwnd increment based on estimate of ack aggregation */ ++static u32 bbr_ack_aggregation_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 max_aggr_cwnd, aggr_cwnd = 0; ++ ++ if (bbr->params.extra_acked_gain && ++ (bbr_full_bw_reached(sk) || bbr->params.extra_acked_in_startup)) { ++ max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) ++ / BW_UNIT; ++ aggr_cwnd = (bbr->params.extra_acked_gain * bbr_extra_acked(sk)) ++ >> BBR_SCALE; ++ aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); ++ } ++ ++ return aggr_cwnd; ++} ++ ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->params.probe_rtt_cwnd_gain == 0) ++ return bbr->params.cwnd_min_target; ++ return max_t(u32, bbr->params.cwnd_min_target, ++ bbr_bdp(sk, bbr_bw(sk), bbr->params.probe_rtt_cwnd_gain)); ++} ++ ++/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss ++ * has drawn us down below target), or snap down to target if we're above it. ++ */ ++static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 target_cwnd = 0, prev_cwnd = tp->snd_cwnd, max_probe; ++ ++ if (!acked) ++ goto done; /* no packet fully ACKed; just apply caps */ ++ ++ target_cwnd = bbr_bdp(sk, bw, gain); ++ ++ /* Increment the cwnd to account for excess ACKed data that seems ++ * due to aggregation (of data and/or ACKs) visible in the ACK stream. ++ */ ++ target_cwnd += bbr_ack_aggregation_cwnd(sk); ++ target_cwnd = bbr_quantization_budget(sk, target_cwnd); ++ ++ /* If we're below target cwnd, slow start cwnd toward target cwnd. */ ++ bbr->debug.target_cwnd = target_cwnd; ++ ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } ++ ++ /* When growing cwnd, don't grow beyond twice what we just probed. */ ++ if (bbr->params.usage_based_cwnd) { ++ max_probe = max(2 * tp->max_packets_out, tp->snd_cwnd); ++ cwnd = min(cwnd, max_probe); ++ } ++ ++ cwnd = max_t(u32, cwnd, bbr->params.cwnd_min_target); ++done: ++ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ ++ if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ ++ tp->snd_cwnd = min_t(u32, tp->snd_cwnd, bbr_probe_rtt_cwnd(sk)); ++ ++ ctx->target_cwnd = target_cwnd; ++ ctx->log = (tp->snd_cwnd != prev_cwnd); ++} ++ ++/* See if we have reached next round trip */ ++static void bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->round_start = 0; ++ ++ /* See if we've reached the next RTT */ ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->round_start = 1; ++ } ++} ++ ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw = 0; ++ ++ /* Divide delivered by the interval to find a (lower bound) bottleneck ++ * bandwidth sample. Delivered is in packets and interval_us in uS and ++ * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. ++ */ ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); ++ } ++ ++ ctx->sample_bw = bw; ++ bbr->debug.rs_bw = bw; ++} ++ ++/* Estimates the windowed max degree of ack aggregation. ++ * This is used to provision extra in-flight data to keep sending during ++ * inter-ACK silences. ++ * ++ * Degree of ack aggregation is estimated as extra data acked beyond expected. ++ * ++ * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval" ++ * cwnd += max_extra_acked ++ * ++ * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). ++ * Max filter is an approximate sliding window of 5-10 (packet timed) round ++ * trips for non-startup phase, and 1-2 round trips for startup. ++ */ ++static void bbr_update_ack_aggregation(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ u32 epoch_us, expected_acked, extra_acked; ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr->params.extra_acked_win_rtts; ++ ++ if (!bbr->params.extra_acked_gain || rs->acked_sacked <= 0 || ++ rs->delivered < 0 || rs->interval_us <= 0) ++ return; ++ ++ if (bbr->round_start) { ++ bbr->extra_acked_win_rtts = min(0x1F, ++ bbr->extra_acked_win_rtts + 1); ++ if (bbr->params.extra_acked_in_startup && ++ !bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? ++ 0 : 1; ++ bbr->extra_acked[bbr->extra_acked_win_idx] = 0; ++ } ++ } ++ ++ /* Compute how many packets we expected to be delivered over epoch. */ ++ epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp, ++ bbr->ack_epoch_mstamp); ++ expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT; ++ ++ /* Reset the aggregation epoch if ACK rate is below expected rate or ++ * significantly large no. of ack received since epoch (potentially ++ * quite old epoch). ++ */ ++ if (bbr->ack_epoch_acked <= expected_acked || ++ (bbr->ack_epoch_acked + rs->acked_sacked >= ++ bbr_ack_epoch_acked_reset_thresh)) { ++ bbr->ack_epoch_acked = 0; ++ bbr->ack_epoch_mstamp = tp->delivered_mstamp; ++ expected_acked = 0; ++ } ++ ++ /* Compute excess data delivered, beyond what was expected. */ ++ bbr->ack_epoch_acked = min_t(u32, 0xFFFFF, ++ bbr->ack_epoch_acked + rs->acked_sacked); ++ extra_acked = bbr->ack_epoch_acked - expected_acked; ++ extra_acked = min(extra_acked, tp->snd_cwnd); ++ if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx]) ++ bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. ++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh; ++ ++ if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) ++ return; ++ ++ bw_thresh = (u64)bbr->full_bw * bbr->params.full_bw_thresh >> BBR_SCALE; ++ if (bbr_max_bw(sk) >= bw_thresh) { ++ bbr->full_bw = bbr_max_bw(sk); ++ bbr->full_bw_cnt = 0; ++ return; ++ } ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_reached = bbr->full_bw_cnt >= bbr->params.full_bw_cnt; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static bool bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr2_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) ++ return true; /* exiting DRAIN now */ ++ return false; ++} ++ ++static void bbr_check_probe_rtt_done(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (!(bbr->probe_rtt_done_stamp && ++ after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) ++ return; ++ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ ++ tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); ++ bbr2_exit_probe_rtt(sk); ++} ++ ++/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and ++ * periodically drain the bottleneck queue, to converge to measure the true ++ * min_rtt (unloaded propagation delay). This allows the flows to keep queues ++ * small (reducing queuing delay and packet loss) and achieve fairness among ++ * BBR flows. ++ * ++ * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, ++ * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. ++ * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed ++ * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and ++ * re-enter the previous mode. BBR uses 200ms to approximately bound the ++ * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). ++ * ++ * Note that flows need only pay 2% if they are busy sending over the last 10 ++ * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have ++ * natural silences or low-rate periods within 10 seconds where the rate is low ++ * enough for long enough to drain its queue in the bottleneck. We pick up ++ * these min RTT measurements opportunistically with our min_rtt filter. :-) ++ */ ++static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; ++ ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr->params.probe_rtt_win_ms); ++ probe_rtt_expired = after(tcp_jiffies32, expire); ++ if (rs->rtt_us >= 0 && ++ (rs->rtt_us <= bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr->params.min_rtt_win_sec * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; ++ } ++ ++ if (bbr->params.probe_rtt_mode_ms > 0 && probe_rtt_expired && ++ !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { ++ bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ ++ bbr_save_cwnd(sk); /* note cwnd so we can restore it */ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ } ++ ++ if (bbr->mode == BBR_PROBE_RTT) { ++ /* Ignore low rate samples during this mode. */ ++ tp->app_limited = ++ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; ++ /* Maintain min packets in flight for max(200 ms, 1 round). */ ++ if (!bbr->probe_rtt_done_stamp && ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { ++ bbr->probe_rtt_done_stamp = tcp_jiffies32 + ++ msecs_to_jiffies(bbr->params.probe_rtt_mode_ms); ++ bbr->probe_rtt_round_done = 0; ++ bbr->next_rtt_delivered = tp->delivered; ++ } else if (bbr->probe_rtt_done_stamp) { ++ if (bbr->round_start) ++ bbr->probe_rtt_round_done = 1; ++ if (bbr->probe_rtt_round_done) ++ bbr_check_probe_rtt_done(sk); ++ } ++ } ++ /* Restart after idle ends only once we process a new S/ACK for data */ ++ if (rs->delivered > 0) ++ bbr->idle_restart = 0; ++} ++ ++static void bbr_update_gains(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ bbr->pacing_gain = bbr->params.high_gain; ++ bbr->cwnd_gain = bbr->params.startup_cwnd_gain; ++ break; ++ case BBR_DRAIN: ++ bbr->pacing_gain = bbr->params.drain_gain; /* slow, to drain */ ++ bbr->cwnd_gain = bbr->params.startup_cwnd_gain; /* keep cwnd */ ++ break; ++ case BBR_PROBE_BW: ++ bbr->pacing_gain = bbr->params.pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr->params.cwnd_gain; ++ break; ++ case BBR_PROBE_RTT: ++ bbr->pacing_gain = BBR_UNIT; ++ bbr->cwnd_gain = BBR_UNIT; ++ break; ++ default: ++ WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode); ++ break; ++ } ++} ++ ++static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ int i; ++ ++ WARN_ON_ONCE(tp->snd_cwnd >= bbr_cwnd_warn_val); ++ ++ bbr->initialized = 1; ++ bbr->params.high_gain = min(0x7FF, bbr_high_gain); ++ bbr->params.drain_gain = min(0x3FF, bbr_drain_gain); ++ bbr->params.startup_cwnd_gain = min(0x7FF, bbr_startup_cwnd_gain); ++ bbr->params.cwnd_gain = min(0x7FF, bbr_cwnd_gain); ++ bbr->params.cwnd_tso_budget = min(0x1U, bbr_cwnd_tso_budget); ++ bbr->params.cwnd_min_target = min(0xFU, bbr_cwnd_min_target); ++ bbr->params.min_rtt_win_sec = min(0x1FU, bbr_min_rtt_win_sec); ++ bbr->params.probe_rtt_mode_ms = min(0x1FFU, bbr_probe_rtt_mode_ms); ++ bbr->params.full_bw_cnt = min(0x7U, bbr_full_bw_cnt); ++ bbr->params.full_bw_thresh = min(0x3FFU, bbr_full_bw_thresh); ++ bbr->params.extra_acked_gain = min(0x7FF, bbr_extra_acked_gain); ++ bbr->params.extra_acked_win_rtts = min(0x1FU, bbr_extra_acked_win_rtts); ++ bbr->params.drain_to_target = bbr_drain_to_target ? 1 : 0; ++ bbr->params.precise_ece_ack = bbr_precise_ece_ack ? 1 : 0; ++ bbr->params.extra_acked_in_startup = bbr_extra_acked_in_startup ? 1 : 0; ++ bbr->params.probe_rtt_cwnd_gain = min(0xFFU, bbr_probe_rtt_cwnd_gain); ++ bbr->params.probe_rtt_win_ms = ++ min(0x3FFFU, ++ min_t(u32, bbr_probe_rtt_win_ms, ++ bbr->params.min_rtt_win_sec * MSEC_PER_SEC)); ++ for (i = 0; i < CYCLE_LEN; i++) ++ bbr->params.pacing_gain[i] = min(0x3FF, bbr_pacing_gain[i]); ++ bbr->params.usage_based_cwnd = bbr_usage_based_cwnd ? 1 : 0; ++ bbr->params.tso_rtt_shift = min(0xFU, bbr_tso_rtt_shift); ++ ++ bbr->debug.snd_isn = tp->snd_una; ++ bbr->debug.target_cwnd = 0; ++ bbr->debug.undo = 0; ++ ++ bbr->init_cwnd = min(0x7FU, tp->snd_cwnd); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = 0; ++ bbr->prev_ca_state = TCP_CA_Open; ++ bbr->packet_conservation = 0; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; ++ bbr->full_bw_cnt = 0; ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ bbr->mode = BBR_STARTUP; ++ bbr->debug.rs_bw = 0; ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++} ++ ++static u32 bbr_sndbuf_expand(struct sock *sk) ++{ ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; ++} ++ ++/* __________________________________________________________________________ ++ * ++ * Functions new to BBR v2 ("bbr") congestion control are below here. ++ * __________________________________________________________________________ ++ */ ++ ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr2_take_bw_hi_sample(struct sock *sk, u32 bw) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); ++} ++ ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr2_advance_bw_hi_filter(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} ++ ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr2_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); ++ ++ return min(bdp, tcp_sk(sk)->snd_cwnd); ++} ++ ++static bool bbr2_is_probing_bandwidth(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr2_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr2_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->full_bw_reached = 1; ++ bbr->inflight_hi = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr2_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr->params.full_ecn_cnt || !bbr->params.ecn_thresh) ++ return; ++ ++ if (ce_ratio >= bbr->params.ecn_thresh) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr->params.full_ecn_cnt) { ++ bbr->debug.event = 'E'; /* ECN caused STARTUP exit */ ++ bbr2_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++static void bbr2_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ ++ if (bbr->params.ecn_factor == 0) ++ return; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_ecn_enable && ++ (bbr->min_rtt_us <= bbr->params.ecn_max_rtt_us || ++ !bbr->params.ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ gain = bbr->params.ecn_alpha_gain; ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr2_check_ecn_too_high_in_startup(sk, ce_ratio); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr2_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tp->snd_cwnd / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++ bbr->debug.event = 'G'; /* Grow inflight_hi slope */ ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr2_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tp->snd_cwnd < bbr->inflight_hi) { ++ bbr->bw_probe_up_acks = 0; /* don't accmulate unused credits */ ++ return; /* not fully using inflight_hi, so don't grow it */ ++ } ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->debug.event = 'I'; /* Increment inflight_hi */ ++ } ++ ++ if (bbr->round_start) ++ bbr2_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which ++ * uses it to notice when loss/ECN rates suggest inflight is too high. ++ */ ++static bool bbr2_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr->params.loss_thresh >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) ++ return true; ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && bbr->params.ecn_thresh) { ++ ecn_thresh = (u64)rs->delivered * bbr->params.ecn_thresh >> ++ BBR_SCALE; ++ if (rs->delivered_ce >= ecn_thresh) ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr2_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh = bbr->params.loss_thresh; ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (WARN_ONCE(inflight_prev < 0, ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg)) ++ return ~0U; ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ON_ONCE(lost_prev < 0)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. ++ */ ++static u32 bbr2_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr->params.inflight_headroom; ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr->params.cwnd_min_target); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr2_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr2_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr->params.cwnd_min_target); ++ tp->snd_cwnd = min(cap, tp->snd_cwnd); ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. ++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr2_adapt_lower_bounds(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut, ecn_inflight_lo, beta; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr2_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && bbr->ecn_eligible && bbr->params.ecn_factor) { ++ /* Reduce inflight to (1 - alpha*ecn_factor). */ ++ ecn_cut = (BBR_UNIT - ++ ((bbr->ecn_alpha * bbr->params.ecn_factor) >> ++ BBR_SCALE)); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tp->snd_cwnd; ++ ecn_inflight_lo = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++ } else { ++ ecn_inflight_lo = ~0U; ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ /* Reduce bw and inflight to (1 - beta). */ ++ if (bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tp->snd_cwnd; ++ beta = bbr->params.beta; ++ bbr->bw_lo = ++ max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ bbr->inflight_lo = ++ max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss or ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr2_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr2_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. ++ */ ++static void bbr2_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; ++ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr2_take_bw_hi_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ /* Update rate and volume of delivered data from latest round trip: */ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (before(rs->prior_delivered, bbr->loss_round_delivered)) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr->loss_round_delivered = tp->delivered; /* mark round trip */ ++ bbr->loss_round_start = 1; ++ bbr2_adapt_lower_bounds(sk); ++ ++ /* Update windowed "latest" (single-round-trip) filters. */ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr2_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 inflight, rounds, reno_gain, reno_rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = bbr->params.bw_probe_max_rounds; ++ ++ reno_gain = bbr->params.bw_probe_reno_gain; ++ if (reno_gain) { ++ inflight = bbr2_target_inflight(sk); ++ reno_rounds = ((u64)inflight * reno_gain) >> BBR_SCALE; ++ rounds = min(rounds, reno_rounds); ++ } ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr2_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ prandom_u32_max(bbr->params.bw_probe_rand_rounds); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr->params.bw_probe_base_us + ++ prandom_u32_max(bbr->params.bw_probe_rand_us); ++} ++ ++static void bbr2_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. ++ */ ++static void bbr2_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr2_reset_lower_bounds(sk); ++ if (bbr->inflight_hi != ~0U) ++ bbr->inflight_hi += bbr->params.refill_add_inc; ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr2_start_bw_probe_up(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr2_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr2_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr2_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr2_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr2_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr2_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr2_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr->params.beta; ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ bbr->debug.event = 'L'; /* Loss/ECN too high */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr2_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr2_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. ++ */ ++static bool bbr2_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr2_advance_bw_hi_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr->debug.event = 'R'; /* reprobe */ ++ bbr2_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ ++ if (bbr2_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr2_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ if (bbr->inflight_hi == ~0U) /* no excess queue signals yet? */ ++ return false; ++ ++ /* To be resilient to random loss, we must raise inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ bbr->debug.event = 'U'; /* raise up inflight_hi */ ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr2_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr2_check_time_to_probe_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case a burst of cross-traffic has ceased and freed up bw, ++ * or in case we are sharing with multiplicatively probing traffic). ++ */ ++ if (bbr->params.ecn_reprobe_gain && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ bbr->debug.event = 'A'; /* *A*ll clear to probe *A*gain */ ++ /* Calculate n so that when bbr2_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr->params.ecn_reprobe_gain) >> BBR_SCALE)); ++ bbr2_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr2_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr2_is_reno_coexistence_probe_time(sk)) { ++ bbr2_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr2_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool is_under_bdp, is_long_enough; ++ ++ /* Always need to pull inflight down to leave headroom in queue. */ ++ if (inflight > bbr2_inflight_with_headroom(sk)) ++ return false; ++ ++ is_under_bdp = inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++ if (bbr->params.drain_to_target) ++ return is_under_bdp; ++ ++ is_long_enough = bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us); ++ return is_under_bdp || is_long_enough; ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr2_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool is_risky = false, is_queuing = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr2_adapt_upper_bounds(sk, rs)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr2_check_time_to_probe_bw(sk)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr2_start_bw_probe_up(sk); ++ } ++ break; ++ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (checked here) ++ * (2) We have probed for at least 1*min_rtt_us, and the ++ * estimated queue is high enough (inflight > 1.25 * estimated_bdp). ++ * (checked here) ++ * (3) Loss filter says loss rate is "too high". ++ * (checked in bbr_is_inflight_too_high()) ++ * (4) ECN filter says ECN mark rate is "too high". ++ * (checked in bbr_is_inflight_too_high()) ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_risky = true; ++ bbr->debug.event = 'D'; /* D for danger */ ++ } else if (bbr2_has_elapsed_in_phase(sk, bbr->min_rtt_us) && ++ inflight >= ++ bbr_inflight(sk, bw, ++ bbr->params.bw_probe_pif_gain)) { ++ is_queuing = true; ++ bbr->debug.event = 'Q'; /* building Queue */ ++ } ++ if (is_risky || is_queuing) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr2_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr2_check_time_to_probe_bw(sk)) ++ return; /* already decided state transition */ ++ if (bbr2_check_time_to_cruise(sk, inflight, bw)) ++ bbr2_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr2_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr2_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr2_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr2_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr2_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr->params.full_loss_cnt && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr->params.full_loss_cnt && ++ bbr2_is_inflight_too_high(sk, rs)) { ++ bbr->debug.event = 'P'; /* Packet loss caused STARTUP exit */ ++ bbr2_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* If we are done draining, advance into steady state operation in PROBE_BW. */ ++static void bbr2_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_check_drain(sk, rs, ctx)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr2_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr2_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr2_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr2_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs); ++ bbr2_check_drain(sk, rs, ctx); ++ bbr2_update_cycle_phase(sk, rs); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. ++ */ ++static bool bbr2_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr->params.fast_path && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr2_check_drain(sk, rs, ctx); ++ bbr2_update_cycle_phase(sk, rs); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) ++ return true; ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++static void bbr2_main(struct sock *sk, const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw; ++ ++ bbr->debug.event = '.'; /* init to default NOP (no event yet) */ ++ ++ bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ bbr2_update_ecn_alpha(sk); ++ } ++ ++ bbr->ecn_in_round |= rs->is_ece; ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ ++ if (bbr2_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr2_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tp->snd_cwnd, &ctx); ++ bbr2_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++ ++ bbr_debug(sk, rs->acked_sacked, rs, &ctx); ++} ++ ++/* Module parameters that are settable by TCP_CONGESTION_PARAMS are declared ++ * down here, so that the algorithm functions that use the parameters must use ++ * the per-socket parameters; if they accidentally use the global version ++ * then there will be a compile error. ++ * TODO(ncardwell): move all per-socket parameters down to this section. ++ */ ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. Max allwed value is 255. ++ */ ++static u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE. ++ * Max allowed value is 255. ++ */ ++static u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; /* 1/16 = 6.25% */ ++ ++/* The initial value for the ecn_alpha state variable. Default and max ++ * BBR_UNIT (256), representing 1.0. This allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static u32 bbr_ecn_alpha_init = BBR_UNIT; /* 1.0, to respond quickly */ ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. Max allwed value is 255. ++ */ ++static u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. Max allowed is 255. ++ */ ++static u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ * Max allowed for the parameter is 524287 (0x7ffff) us, ~524 ms. ++ */ ++static u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then use a multiplicative increase to quickly reprobe bw by ++ * starting inflight probing at the given multiple of inflight_hi. ++ * Default for this experimental knob is 0 (disabled). ++ * Planned value for experiments: BBR_UNIT * 1 / 2 = 128, representing 0.5. ++ */ ++static u32 bbr_ecn_reprobe_gain; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. Max allowed value is 15 (0xF). ++ */ ++static u32 bbr_full_loss_cnt = 8; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. Max allowed value is 3. ++ */ ++static u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* Multiplier to get target inflight (as multiple of BDP) for PROBE_UP phase. ++ * Default is 1.25x, as in BBR v1. Max allowed is 511. ++ */ ++static u32 bbr_bw_probe_pif_gain = BBR_UNIT * 5 / 4; ++ ++/* Multiplier to get Reno-style probe epoch duration as: k * BDP round trips. ++ * If zero, disables this BBR v2 Reno-style BDP-scaled coexistence mechanism. ++ * Max allowed is 511. ++ */ ++static u32 bbr_bw_probe_reno_gain = BBR_UNIT; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ * Max value is 15. ++ */ ++static u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Undo the model changes made in loss recovery if recovery was spurious? */ ++static bool bbr_undo = true; ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static bool bbr_fast_path = true; /* default: enabled */ ++ ++/* Use fast ack mode ? */ ++static int bbr_fast_ack_mode = 1; /* default: rwnd check off */ ++ ++/* How much to additively increase inflight_hi when entering REFILL? */ ++static u32 bbr_refill_add_inc; /* default: disabled */ ++ ++module_param_named(beta, bbr_beta, uint, 0644); ++module_param_named(ecn_alpha_gain, bbr_ecn_alpha_gain, uint, 0644); ++module_param_named(ecn_alpha_init, bbr_ecn_alpha_init, uint, 0644); ++module_param_named(ecn_factor, bbr_ecn_factor, uint, 0644); ++module_param_named(ecn_thresh, bbr_ecn_thresh, uint, 0644); ++module_param_named(ecn_max_rtt_us, bbr_ecn_max_rtt_us, uint, 0644); ++module_param_named(ecn_reprobe_gain, bbr_ecn_reprobe_gain, uint, 0644); ++module_param_named(loss_thresh, bbr_loss_thresh, uint, 0664); ++module_param_named(full_loss_cnt, bbr_full_loss_cnt, uint, 0664); ++module_param_named(full_ecn_cnt, bbr_full_ecn_cnt, uint, 0664); ++module_param_named(inflight_headroom, bbr_inflight_headroom, uint, 0664); ++module_param_named(bw_probe_pif_gain, bbr_bw_probe_pif_gain, uint, 0664); ++module_param_named(bw_probe_reno_gain, bbr_bw_probe_reno_gain, uint, 0664); ++module_param_named(bw_probe_max_rounds, bbr_bw_probe_max_rounds, uint, 0664); ++module_param_named(bw_probe_rand_rounds, bbr_bw_probe_rand_rounds, uint, 0664); ++module_param_named(bw_probe_base_us, bbr_bw_probe_base_us, uint, 0664); ++module_param_named(bw_probe_rand_us, bbr_bw_probe_rand_us, uint, 0664); ++module_param_named(undo, bbr_undo, bool, 0664); ++module_param_named(fast_path, bbr_fast_path, bool, 0664); ++module_param_named(fast_ack_mode, bbr_fast_ack_mode, uint, 0664); ++module_param_named(refill_add_inc, bbr_refill_add_inc, uint, 0664); ++ ++static void bbr2_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_init(sk); /* run shared init code for v1 and v2 */ ++ ++ /* BBR v2 parameters: */ ++ bbr->params.beta = min_t(u32, 0xFFU, bbr_beta); ++ bbr->params.ecn_alpha_gain = min_t(u32, 0xFFU, bbr_ecn_alpha_gain); ++ bbr->params.ecn_alpha_init = min_t(u32, BBR_UNIT, bbr_ecn_alpha_init); ++ bbr->params.ecn_factor = min_t(u32, 0xFFU, bbr_ecn_factor); ++ bbr->params.ecn_thresh = min_t(u32, 0xFFU, bbr_ecn_thresh); ++ bbr->params.ecn_max_rtt_us = min_t(u32, 0x7ffffU, bbr_ecn_max_rtt_us); ++ bbr->params.ecn_reprobe_gain = min_t(u32, 0x1FF, bbr_ecn_reprobe_gain); ++ bbr->params.loss_thresh = min_t(u32, 0xFFU, bbr_loss_thresh); ++ bbr->params.full_loss_cnt = min_t(u32, 0xFU, bbr_full_loss_cnt); ++ bbr->params.full_ecn_cnt = min_t(u32, 0x3U, bbr_full_ecn_cnt); ++ bbr->params.inflight_headroom = ++ min_t(u32, 0xFFU, bbr_inflight_headroom); ++ bbr->params.bw_probe_pif_gain = ++ min_t(u32, 0x1FFU, bbr_bw_probe_pif_gain); ++ bbr->params.bw_probe_reno_gain = ++ min_t(u32, 0x1FFU, bbr_bw_probe_reno_gain); ++ bbr->params.bw_probe_max_rounds = ++ min_t(u32, 0xFFU, bbr_bw_probe_max_rounds); ++ bbr->params.bw_probe_rand_rounds = ++ min_t(u32, 0xFU, bbr_bw_probe_rand_rounds); ++ bbr->params.bw_probe_base_us = ++ min_t(u32, (1 << 26) - 1, bbr_bw_probe_base_us); ++ bbr->params.bw_probe_rand_us = ++ min_t(u32, (1 << 26) - 1, bbr_bw_probe_rand_us); ++ bbr->params.undo = bbr_undo; ++ bbr->params.fast_path = bbr_fast_path ? 1 : 0; ++ bbr->params.refill_add_inc = min_t(u32, 0x3U, bbr_refill_add_inc); ++ ++ /* BBR v2 state: */ ++ bbr->initialized = 1; ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 0; ++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr2_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr->params.ecn_alpha_init; ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ ++ tp->fast_ack_mode = min_t(u32, 0x2U, bbr_fast_ack_mode); ++ ++ if ((tp->ecn_flags & TCP_ECN_OK) && bbr_ecn_enable) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++static void bbr2_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs; ++ ++ /* Capture "current" data over the full round trip of loss, ++ * to have a better chance to see the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ memset(&rs, 0, sizeof(rs)); ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr2_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr2_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr2_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. */ ++static u32 bbr2_undo_cwnd(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->debug.undo = 1; ++ bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ bbr->full_bw_cnt = 0; ++ bbr->loss_in_round = 0; ++ ++ if (!bbr->params.undo) ++ return tp->snd_cwnd; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ ++static u32 bbr2_ssthresh(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; ++ return tcp_sk(sk)->snd_ssthresh; ++} ++ ++static enum tcp_bbr2_phase bbr2_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR2_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR2_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR2_PHASE_PROBE_RTT; ++ default: ++ return BBR2_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR2_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR2_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR2_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR2_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR2_PHASE_INVALID; ++ } ++} ++ ++static size_t bbr2_get_info(struct sock *sk, u32 ext, int *attr, ++ union tcp_cc_info *info) ++{ ++ if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || ++ ext & (1 << (INET_DIAG_VEGASINFO - 1))) { ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? ++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ ++ memset(&info->bbr2, 0, sizeof(info->bbr2)); ++ info->bbr2.bbr_bw_lsb = (u32)bw; ++ info->bbr2.bbr_bw_msb = (u32)(bw >> 32); ++ info->bbr2.bbr_min_rtt = bbr->min_rtt_us; ++ info->bbr2.bbr_pacing_gain = bbr->pacing_gain; ++ info->bbr2.bbr_cwnd_gain = bbr->cwnd_gain; ++ info->bbr2.bbr_bw_hi_lsb = (u32)bw_hi; ++ info->bbr2.bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ info->bbr2.bbr_bw_lo_lsb = (u32)bw_lo; ++ info->bbr2.bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ info->bbr2.bbr_mode = bbr->mode; ++ info->bbr2.bbr_phase = (__u8)bbr2_get_phase(bbr); ++ info->bbr2.bbr_version = (__u8)2; ++ info->bbr2.bbr_inflight_lo = bbr->inflight_lo; ++ info->bbr2.bbr_inflight_hi = bbr->inflight_hi; ++ info->bbr2.bbr_extra_acked = bbr_extra_acked(sk); ++ *attr = INET_DIAG_BBRINFO; ++ return sizeof(info->bbr2); ++ } ++ return 0; ++} ++ ++static void bbr2_set_state(struct sock *sk, u8 new_state) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (new_state == TCP_CA_Loss) { ++ struct rate_sample rs = { .losses = 1 }; ++ struct bbr_context ctx = { 0 }; ++ ++ bbr->prev_ca_state = TCP_CA_Loss; ++ bbr->full_bw = 0; ++ if (!bbr2_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tp->snd_cwnd, bbr->prior_cwnd); ++ } ++ bbr_debug(sk, 0, &rs, &ctx); ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++ } ++} ++ ++static struct tcp_congestion_ops tcp_bbr2_cong_ops __read_mostly = { ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, ++ .name = "bbr2", ++ .owner = THIS_MODULE, ++ .init = bbr2_init, ++ .cong_control = bbr2_main, ++ .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr2_skb_marked_lost, ++ .undo_cwnd = bbr2_undo_cwnd, ++ .cwnd_event = bbr_cwnd_event, ++ .ssthresh = bbr2_ssthresh, ++ .tso_segs = bbr_tso_segs, ++ .get_info = bbr2_get_info, ++ .set_state = bbr2_set_state, ++}; ++ ++static int __init bbr_register(void) ++{ ++ BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); ++ return tcp_register_congestion_control(&tcp_bbr2_cong_ops); ++} ++ ++static void __exit bbr_unregister(void) ++{ ++ tcp_unregister_congestion_control(&tcp_bbr2_cong_ops); ++} ++ ++module_init(bbr_register); ++module_exit(bbr_unregister); ++ ++MODULE_AUTHOR("Van Jacobson "); ++MODULE_AUTHOR("Neal Cardwell "); ++MODULE_AUTHOR("Yuchung Cheng "); ++MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++ ++MODULE_LICENSE("Dual BSD/GPL"); ++MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); +diff --git a/include/net/tcp.h b/include/net/tcp.h +index c87291dd64a76..0c55d57e4b835 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -874,10 +874,11 @@ struct tcp_skb_cb { + __u32 ack_seq; /* Sequence number ACK'd */ + union { + struct { ++#define TCPCB_DELIVERED_CE_MASK ((1U<<20) - 1) + /* There is space for up to 24 bytes */ +- __u32 in_flight:30,/* Bytes in flight at transmit */ +- is_app_limited:1, /* cwnd not fully used? */ ++ __u32 is_app_limited:1, /* cwnd not fully used? */ +- unused:1; ++ delivered_ce:20, ++ unused:11; + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +@@ -1028,7 +1030,9 @@ struct ack_sample { + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ ++ u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ + s32 delivered; /* number of packets delivered over interval */ ++ s32 delivered_ce; /* number of packets delivered w/ CE marks*/ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index 0de6935659635..fbab921670cc9 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -65,6 +65,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; ++ TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; + } + +@@ -138,6 +140,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + } + rs->delivered = tp->delivered - rs->prior_delivered; + ++ rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; ++ /* delivered_ce occupies less than 32 bits in the skb control block */ ++ rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK; ++ + /* Model sending data and receiving ACKs as separate pipeline phases + * for a window. Usually the ACK phase is longer, but with ACK + * compression the send phase can be longer. To be safe we use the +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 3437789f10e93..da11fde1610fd 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -377,7 +377,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index 7ac8aa4a0a367..c657923d2233d 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -111,17 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; ++ rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 0c55d57e4b835..65e39dc90c7ba 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -807,6 +807,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) + { + return tcp_ns_to_ts(skb->skb_mstamp_ns); +@@ -882,9 +887,9 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index fbab921670cc9..b64d82481ae93 100644 +@@ -150,7 +151,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 65e39dc90c7ba..b6f667a268ee9 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -890,6 +890,10 @@ struct tcp_skb_cb { + u32 first_tx_mstamp; + /* when we reached the "delivered" count */ + u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1036,6 +1040,7 @@ struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ + s32 delivered; /* number of packets delivered over interval */ + s32 delivered_ce; /* number of packets delivered w/ CE marks*/ + long interval_us; /* time for tp->delivered to incr "delivered" */ +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index b64d82481ae93..3ecc831206b58 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -40,6 +40,7 @@ + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + { + struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; + + /* In general we need to start delivery rate samples from the + * time we received the most recent ACK, to ensure we include +@@ -67,6 +68,18 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out); ++ in_flight = min(in_flight, TCPCB_IN_FLIGHT_MAX); ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +#@@ -92,6 +105,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, +# rs->prior_mstamp = scb->tx.delivered_mstamp; +# rs->is_app_limited = scb->tx.is_app_limited; +# rs->is_retrans = scb->sacked & TCPCB_RETRANS; +#+ rs->tx_in_flight = scb->tx.in_flight; +# +# /* Record send time of most recently ACKed packet: */ +# tp->first_tx_mstamp = tcp_skb_timestamp_us(skb); +diff --git a/include/net/tcp.h b/include/net/tcp.h +index b6f667a268ee9..61782eedb55de 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -894,6 +894,7 @@ struct tcp_skb_cb { + #define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) + u32 in_flight:20, /* packets in flight at transmit */ + unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1038,11 +1039,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ + u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index 3ecc831206b58..f454fce5e1fd9 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -67,6 +67,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; + + /* Check, sanitize, and record packets in flight after skb was sent. */ +@@ -154,6 +156,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 71e60a4e373cc..9580ef2b099c4 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1172,6 +1172,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + } + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index f454fce5e1fd9..796fa6e5310ce 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,13 +34,30 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 in_flight; + + /* In general we need to start delivery rate samples from the + * time we received the most recent ACK, to ensure we include +@@ -69,18 +86,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; + TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; +- +- /* Check, sanitize, and record packets in flight after skb was sent. */ +- in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); +- WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, +- "insane in_flight %u cc %s mss %u " +- "cwnd %u pif %u %u %u %u\n", +- in_flight, inet_csk(sk)->icsk_ca_ops->name, +- tp->mss_cache, tp->snd_cwnd, +- tp->packets_out, tp->retrans_out, +- tp->sacked_out, tp->lost_out); +- in_flight = min(in_flight, TCPCB_IN_FLIGHT_MAX); +- TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index 48d8a363319e5..1bd559c69e839 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -225,7 +225,8 @@ struct tcp_sock { + u8 compressed_ack; + u8 dup_ack_counter:2, + tlp_retrans:1, /* TLP is a retransmission */ +- unused:5; ++ fast_ack_mode:2, /* which fast ack mode ? */ ++ unused:3; + u32 chrono_start; /* Start time in jiffies of a TCP chrono */ + u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ + u8 chrono_type:2, /* current chronograph type */ +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index f5c336f8b0c8e..7de20f142443c 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3044,6 +3044,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index db5831e6c136a..153ed9010c0c2 100644 +--- a/net/ipv4/tcp_cong.c ++++ b/net/ipv4/tcp_cong.c +@@ -179,6 +179,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index d85ce5cf94995..425014b2c9207 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -5463,13 +5463,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... */ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 61782eedb55de..244798d8dcac8 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1056,6 +1056,7 @@ struct rate_sample { + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 71e6baca8cf1c..10516595f379d 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -3878,6 +3878,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 9580ef2b099c4..f22999a698a28 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1019,7 +1019,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED 0x1 + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN 0x2 +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */ ++#define TCP_CONG_WANTS_CE_EVENTS 0x4 ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1147,6 +1151,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 0b3db3ceee092..d85ce5cf94995 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -349,7 +349,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -360,7 +360,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 244798d8dcac8..71e60a4e373cc 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1083,6 +1083,9 @@ struct tcp_congestion_ops { + /* override sysctl_tcp_min_tso_segs */ + u32 (*min_tso_segs)(struct sock *sk); + ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); ++ + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) + */ +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 10516595f379d..94ad315d8ab94 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -1071,7 +1071,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +diff --git a/include/net/tcp.h b/include/net/tcp.h +index f22999a698a28..82389ead6304e 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1084,8 +1084,8 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); + + /* react to a specific lost skb (optional) */ + void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); +diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c +index 0dcee9df13268..503eb58c67870 100644 +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -20,7 +20,7 @@ static u32 optional_ops[] = { + offsetof(struct tcp_congestion_ops, cwnd_event), + offsetof(struct tcp_congestion_ops, in_ack_event), + offsetof(struct tcp_congestion_ops, pkts_acked), +- offsetof(struct tcp_congestion_ops, min_tso_segs), ++ offsetof(struct tcp_congestion_ops, tso_segs), + offsetof(struct tcp_congestion_ops, sndbuf_expand), + offsetof(struct tcp_congestion_ops, cong_control), + }; +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 6274462b86b4b..c0d5a4211fc18 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -292,26 +292,40 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + sk->sk_pacing_rate = rate; + } + +-/* override sysctl_tcp_min_tso_segs */ + static u32 bbr_min_tso_segs(struct sock *sk) + { + return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; + } + ++/* Return the number of segments BBR would like in a TSO/GSO skb, given ++ * a particular max gso size as a constraint. ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ u32 segs; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk)); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) ++{ ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); ++} ++ ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), +- GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -1147,7 +1161,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index d9f1b83be6047..3437789f10e93 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -1990,13 +1990,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; ++ u32 tso_segs; + +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs; +- +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? ++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + diff --git a/root/include/kernel-version.mk b/root/include/kernel-version.mk index 5de5b978..1862202d 100755 --- a/root/include/kernel-version.mk +++ b/root/include/kernel-version.mk @@ -9,7 +9,7 @@ endif LINUX_VERSION-5.4 = .194 LINUX_VERSION-5.10 = .64 LINUX_VERSION-5.14 = .6 -LINUX_VERSION-5.15 = .36 +LINUX_VERSION-5.15 = .50 LINUX_KERNEL_HASH-5.4.132 = 8466adbfb3579e751ede683496df7bb20f258b5f882250f3dd82be63736d00ef LINUX_KERNEL_HASH-5.4.182 = b2f1201f64f010e9e3c85d6f303a559a7944a80a0244a86b8f5035bd23f1f40d @@ -22,6 +22,7 @@ LINUX_KERNEL_HASH-5.15.15 = 1d3c57cf8071af174933df3e5d77da801e240a59da3c5e8406f7 LINUX_KERNEL_HASH-5.15.17 = 2787f5c0cc59984902fd97916dc604f39718c73817497c25f963141bfb70abde LINUX_KERNEL_HASH-5.15.29 = 5905e684602c47ae95746d4003cb834335e5451aca4ac7c3013f15dd49ed876e LINUX_KERNEL_HASH-5.15.36 = 36345db17a937c197c72ca9c7f34c262b3a12f927c237ff7770193014e29c690 +LINUX_KERNEL_HASH-5.15.50 = 554d507d37a23810fe8c83912761e4a4f73c40794bc685ff7ca98042fe1bd70f remove_uri_prefix=$(subst git://,,$(subst http://,,$(subst https://,,$(1)))) sanitize_uri=$(call qstrip,$(subst @,_,$(subst :,_,$(subst .,_,$(subst -,_,$(subst /,_,$(1))))))) diff --git a/root/include/target.mk b/root/include/target.mk index fb57553f..948bc50f 100755 --- a/root/include/target.mk +++ b/root/include/target.mk @@ -57,7 +57,7 @@ DEFAULT_PACKAGES.router:=\ firewall \ ip6tables \ iptables \ - kmod-ipt-offload \ + kmod-nft-offload \ odhcp6c \ odhcpd-ipv6only \ ppp \ diff --git a/root/target/linux/generic/config-5.15 b/root/target/linux/generic/config-5.15 index cf742d89..96198f84 100755 --- a/root/target/linux/generic/config-5.15 +++ b/root/target/linux/generic/config-5.15 @@ -5208,6 +5208,7 @@ CONFIG_SELECT_MEMORY_MODEL=y # CONFIG_SENSORS_DS620 is not set # CONFIG_SENSORS_EMC1403 is not set # CONFIG_SENSORS_EMC2103 is not set +# CONFIG_SENSORS_EMC2305 is not set # CONFIG_SENSORS_EMC6W201 is not set # CONFIG_SENSORS_F71805F is not set # CONFIG_SENSORS_F71882FG is not set @@ -6142,6 +6143,7 @@ CONFIG_SYSVIPC_SYSCTL=y # CONFIG_TCIC is not set CONFIG_TCP_CONG_ADVANCED=y # CONFIG_TCP_CONG_BBR is not set +# CONFIG_TCP_CONG_BBR2 is not set # CONFIG_TCP_CONG_BIC is not set # CONFIG_TCP_CONG_CDG is not set CONFIG_TCP_CONG_CUBIC=y diff --git a/root/target/linux/generic/hack-5.4/690-mptcp_v0.96.patch b/root/target/linux/generic/hack-5.4/690-mptcp_v0.96.patch index 7c2d820a..03490aaa 100755 --- a/root/target/linux/generic/hack-5.4/690-mptcp_v0.96.patch +++ b/root/target/linux/generic/hack-5.4/690-mptcp_v0.96.patch @@ -1,8 +1,8 @@ diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 979423e1b639..c70f5d160b48 100644 +index db9d53b879f8..3d859ac99b73 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -2748,6 +2748,10 @@ +@@ -2784,6 +2784,10 @@ allocations which rules out almost all kernel allocations. Use with caution! @@ -287,10 +287,10 @@ index 34c4436fd18f..828f79528b32 100644 union { diff --git a/include/net/mptcp.h b/include/net/mptcp.h new file mode 100644 -index 000000000000..bb18dacc310f +index 000000000000..630977f67614 --- /dev/null +++ b/include/net/mptcp.h -@@ -0,0 +1,1549 @@ +@@ -0,0 +1,1548 @@ +/* + * MPTCP implementation + * @@ -1207,7 +1207,6 @@ index 000000000000..bb18dacc310f +int mptcp_conn_request(struct sock *sk, struct sk_buff *skb); +void mptcp_enable_sock(struct sock *sk); +void mptcp_disable_sock(struct sock *sk); -+void mptcp_disable_static_key(void); +void mptcp_cookies_reqsk_init(struct request_sock *req, + struct mptcp_options_received *mopt, + struct sk_buff *skb); @@ -2118,7 +2117,7 @@ index 7f213cfcb3cc..c1be2daccb54 100644 /* Keeping track of sockets in use */ #ifdef CONFIG_PROC_FS diff --git a/include/net/tcp.h b/include/net/tcp.h -index 65be8bd1f0f4..b31fc84741a0 100644 +index 65be8bd1f0f4..cf89f928640e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -182,6 +182,7 @@ @@ -2238,7 +2237,7 @@ index 65be8bd1f0f4..b31fc84741a0 100644 +u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb); +void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, u32 prior_snd_una); + -+void skb_clone_fraglist(struct sk_buff *skb); ++/* void skb_clone_fraglist(struct sk_buff *skb); */ + +void inet_twsk_free(struct inet_timewait_sock *tw); +int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb); @@ -2425,10 +2424,10 @@ index 65be8bd1f0f4..b31fc84741a0 100644 } +#ifdef CONFIG_MPTCP -+extern struct static_key mptcp_static_key; ++DECLARE_STATIC_KEY_FALSE(mptcp_static_key); +static inline bool mptcp(const struct tcp_sock *tp) +{ -+ return static_key_false(&mptcp_static_key) && tp->mpc; ++ return static_branch_unlikely(&mptcp_static_key) && tp->mpc; +} +#else +static inline bool mptcp(const struct tcp_sock *tp) @@ -2917,7 +2916,7 @@ index a03036456221..aebb337662c3 100644 IFF_ALLMULTI)); diff --git a/net/core/filter.c b/net/core/filter.c -index e16b2b5cda98..d038517091c6 100644 +index eba96343c7af..c84249eec838 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -73,6 +73,7 @@ @@ -2991,19 +2990,6 @@ index 283ddb2dbc7d..8f526a0d1912 100644 EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); + +EXPORT_TRACEPOINT_SYMBOL_GPL(mptcp_retransmit); -diff --git a/net/core/skbuff.c b/net/core/skbuff.c -index 5bdb3cd20d61..d430e46373f3 100644 ---- a/net/core/skbuff.c -+++ b/net/core/skbuff.c -@@ -582,7 +582,7 @@ static inline void skb_drop_fraglist(struct sk_buff *skb) - skb_drop_list(&skb_shinfo(skb)->frag_list); - } - --static void skb_clone_fraglist(struct sk_buff *skb) -+void skb_clone_fraglist(struct sk_buff *skb) - { - struct sk_buff *list; - diff --git a/net/core/sock.c b/net/core/sock.c index c84f68bff7f5..44675ce7e8de 100644 --- a/net/core/sock.c @@ -3187,7 +3173,7 @@ index a926de2e42b5..6d73dc6e2586 100644 default "dctcp" if DEFAULT_DCTCP default "cdg" if DEFAULT_CDG diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c -index a7a6b1adb698..8ebca975f8c8 100644 +index a7a6b1adb698..e1ccbe866a90 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -100,6 +100,7 @@ @@ -3198,17 +3184,7 @@ index a7a6b1adb698..8ebca975f8c8 100644 #include #include #include -@@ -150,6 +151,9 @@ void inet_sock_destruct(struct sock *sk) - return; - } - -+ if (sock_flag(sk, SOCK_MPTCP)) -+ mptcp_disable_static_key(); -+ - WARN_ON(atomic_read(&sk->sk_rmem_alloc)); - WARN_ON(refcount_read(&sk->sk_wmem_alloc)); - WARN_ON(sk->sk_wmem_queued); -@@ -227,6 +231,8 @@ int inet_listen(struct socket *sock, int backlog) +@@ -227,6 +228,8 @@ int inet_listen(struct socket *sock, int backlog) tcp_fastopen_init_key_once(sock_net(sk)); } @@ -3217,7 +3193,7 @@ index a7a6b1adb698..8ebca975f8c8 100644 err = inet_csk_listen_start(sk, backlog); if (err) goto out; -@@ -244,8 +250,7 @@ int inet_listen(struct socket *sock, int backlog) +@@ -244,8 +247,7 @@ int inet_listen(struct socket *sock, int backlog) * Create an inet socket. */ @@ -3227,7 +3203,7 @@ index a7a6b1adb698..8ebca975f8c8 100644 { struct sock *sk; struct inet_protosw *answer; -@@ -739,6 +744,24 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags, +@@ -739,6 +741,24 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags, lock_sock(sk2); sock_rps_record_flow(sk2); @@ -3252,7 +3228,7 @@ index a7a6b1adb698..8ebca975f8c8 100644 WARN_ON(!((1 << sk2->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_RECV | TCPF_CLOSE_WAIT | TCPF_CLOSE))); -@@ -1981,6 +2004,9 @@ static int __init inet_init(void) +@@ -1981,6 +2001,9 @@ static int __init inet_init(void) if (init_ipv4_mibs()) panic("%s: Cannot init ipv4 mibs\n", __func__); @@ -4417,7 +4393,7 @@ index a5ec77a5ad6f..f9fb4a268b9b 100644 * and queues the child into listener accept queue. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c -index b0e6fc2c5e10..925f03f425d4 100644 +index 0808110451a0..d278b28035ad 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -76,35 +76,15 @@ @@ -4561,7 +4537,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 /* Normally R but no L won't result in plain S */ if (!dup_sack && (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS) -@@ -2962,7 +2969,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, +@@ -2965,7 +2972,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, */ tcp_update_rtt_min(sk, ca_rtt_us, flag); tcp_rtt_estimator(sk, seq_rtt_us); @@ -4570,7 +4546,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 /* RFC6298: only reset backoff on valid RTT measurement. */ inet_csk(sk)->icsk_backoff = 0; -@@ -3030,7 +3037,7 @@ static void tcp_set_xmit_timer(struct sock *sk) +@@ -3033,7 +3040,7 @@ static void tcp_set_xmit_timer(struct sock *sk) } /* If we get here, the whole TSO packet has not been acked. */ @@ -4579,7 +4555,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 { struct tcp_sock *tp = tcp_sk(sk); u32 packets_acked; -@@ -3050,8 +3057,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) +@@ -3053,8 +3060,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) return packets_acked; } @@ -4589,7 +4565,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 { const struct skb_shared_info *shinfo; -@@ -3156,6 +3162,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, +@@ -3159,6 +3165,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, */ if (likely(!(scb->tcp_flags & TCPHDR_SYN))) { flag |= FLAG_DATA_ACKED; @@ -4598,7 +4574,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 } else { flag |= FLAG_SYN_ACKED; tp->retrans_stamp = 0; -@@ -3276,7 +3284,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, +@@ -3279,7 +3287,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, return flag; } @@ -4607,7 +4583,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 { struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *head = tcp_send_head(sk); -@@ -3350,9 +3358,8 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, +@@ -3353,9 +3361,8 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ @@ -4619,7 +4595,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 { return after(ack, tp->snd_una) || after(ack_seq, tp->snd_wl1) || -@@ -3590,7 +3597,7 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) +@@ -3593,7 +3600,7 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) } /* This routine deals with incoming acks, but not outgoing ones. */ @@ -4628,7 +4604,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); -@@ -3713,6 +3720,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -3716,6 +3723,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); @@ -4643,7 +4619,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); -@@ -3857,8 +3872,10 @@ static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) +@@ -3860,8 +3875,10 @@ static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) */ void tcp_parse_options(const struct net *net, const struct sk_buff *skb, @@ -4656,7 +4632,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 { const unsigned char *ptr; const struct tcphdr *th = tcp_hdr(skb); -@@ -3944,6 +3961,10 @@ void tcp_parse_options(const struct net *net, +@@ -3947,6 +3964,10 @@ void tcp_parse_options(const struct net *net, */ break; #endif @@ -4667,7 +4643,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 case TCPOPT_FASTOPEN: tcp_parse_fastopen_option( opsize - TCPOLEN_FASTOPEN_BASE, -@@ -4011,7 +4032,9 @@ static bool tcp_fast_parse_options(const struct net *net, +@@ -4014,7 +4035,9 @@ static bool tcp_fast_parse_options(const struct net *net, return true; } @@ -4678,7 +4654,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; -@@ -4121,7 +4144,7 @@ static inline bool tcp_paws_discard(const struct sock *sk, +@@ -4124,7 +4147,7 @@ static inline bool tcp_paws_discard(const struct sock *sk, static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) { return !before(end_seq, tp->rcv_wup) && @@ -4687,7 +4663,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 } /* When we get a reset we do this. */ -@@ -4170,6 +4193,11 @@ void tcp_fin(struct sock *sk) +@@ -4173,6 +4196,11 @@ void tcp_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -4699,7 +4675,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 inet_csk_schedule_ack(sk); sk->sk_shutdown |= RCV_SHUTDOWN; -@@ -4180,6 +4208,10 @@ void tcp_fin(struct sock *sk) +@@ -4183,6 +4211,10 @@ void tcp_fin(struct sock *sk) case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); @@ -4710,7 +4686,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 inet_csk_enter_pingpong_mode(sk); break; -@@ -4202,9 +4234,16 @@ void tcp_fin(struct sock *sk) +@@ -4205,9 +4237,16 @@ void tcp_fin(struct sock *sk) tcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: @@ -4728,7 +4704,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 break; default: /* Only TCP_LISTEN and TCP_CLOSE are left, in these -@@ -4226,6 +4265,10 @@ void tcp_fin(struct sock *sk) +@@ -4229,6 +4268,10 @@ void tcp_fin(struct sock *sk) if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); @@ -4739,7 +4715,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 /* Do not send POLL_HUP for half duplex close. */ if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) -@@ -4440,6 +4483,9 @@ static bool tcp_try_coalesce(struct sock *sk, +@@ -4443,6 +4486,9 @@ static bool tcp_try_coalesce(struct sock *sk, *fragstolen = false; @@ -4749,7 +4725,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 /* Its possible this segment overlaps with prior segment in queue */ if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) return false; -@@ -4494,7 +4540,7 @@ static void tcp_drop(struct sock *sk, struct sk_buff *skb) +@@ -4497,7 +4543,7 @@ static void tcp_drop(struct sock *sk, struct sk_buff *skb) /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. */ @@ -4758,7 +4734,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 { struct tcp_sock *tp = tcp_sk(sk); __u32 dsack_high = tp->rcv_nxt; -@@ -4517,7 +4563,14 @@ static void tcp_ofo_queue(struct sock *sk) +@@ -4520,7 +4566,14 @@ static void tcp_ofo_queue(struct sock *sk) p = rb_next(p); rb_erase(&skb->rbnode, &tp->out_of_order_queue); @@ -4774,7 +4750,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 tcp_drop(sk, skb); continue; } -@@ -4547,6 +4600,9 @@ static void tcp_ofo_queue(struct sock *sk) +@@ -4550,6 +4603,9 @@ static void tcp_ofo_queue(struct sock *sk) static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, unsigned int size) { @@ -4784,7 +4760,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || !sk_rmem_schedule(sk, skb, size)) { -@@ -4561,7 +4617,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, +@@ -4564,7 +4620,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, return 0; } @@ -4793,7 +4769,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 { struct tcp_sock *tp = tcp_sk(sk); struct rb_node **p, *parent; -@@ -4633,7 +4689,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) +@@ -4636,7 +4692,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) continue; } if (before(seq, TCP_SKB_CB(skb1)->end_seq)) { @@ -4803,7 +4779,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 /* All the bits are present. Drop. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); -@@ -4680,6 +4737,11 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) +@@ -4683,6 +4740,11 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) end_seq); break; } @@ -4815,7 +4791,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 rb_erase(&skb1->rbnode, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); -@@ -4691,7 +4753,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) +@@ -4694,7 +4756,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) tp->ooo_last_skb = skb; add_sack: @@ -4824,7 +4800,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 tcp_sack_new_ofo_skb(sk, seq, end_seq); end: if (skb) { -@@ -4705,8 +4767,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) +@@ -4708,8 +4770,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) } } @@ -4835,7 +4811,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 { int eaten; struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); -@@ -4781,7 +4843,8 @@ void tcp_data_ready(struct sock *sk) +@@ -4784,7 +4846,8 @@ void tcp_data_ready(struct sock *sk) if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) && !sock_flag(sk, SOCK_DONE) && @@ -4845,7 +4821,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 return; sk->sk_data_ready(sk); -@@ -4793,10 +4856,14 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) +@@ -4796,10 +4859,14 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) bool fragstolen; int eaten; @@ -4861,7 +4837,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 skb_dst_drop(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); -@@ -4807,7 +4874,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) +@@ -4810,7 +4877,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) * Out of sequence packets to the out_of_order_queue. */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { @@ -4870,7 +4846,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; } -@@ -4823,7 +4890,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) +@@ -4826,7 +4893,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } eaten = tcp_queue_rcv(sk, skb, &fragstolen); @@ -4879,7 +4855,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); -@@ -4845,7 +4912,11 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) +@@ -4848,7 +4915,11 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (eaten > 0) kfree_skb_partial(skb, fragstolen); @@ -4892,7 +4868,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 tcp_data_ready(sk); return; } -@@ -4865,7 +4936,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) +@@ -4868,7 +4939,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } /* Out of window. F.e. zero window probe. */ @@ -4902,7 +4878,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 goto out_of_window; if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { -@@ -4875,7 +4947,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) +@@ -4878,7 +4950,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) /* If window is closed, drop tail of packet. But after * remembering D-SACK for its head made in previous line. */ @@ -4911,7 +4887,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; } -@@ -5188,7 +5260,7 @@ static int tcp_prune_queue(struct sock *sk) +@@ -5191,7 +5263,7 @@ static int tcp_prune_queue(struct sock *sk) return -1; } @@ -4920,7 +4896,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 { const struct tcp_sock *tp = tcp_sk(sk); -@@ -5223,7 +5295,7 @@ static void tcp_new_space(struct sock *sk) +@@ -5226,7 +5298,7 @@ static void tcp_new_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -4929,7 +4905,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 tcp_sndbuf_expand(sk); tp->snd_cwnd_stamp = tcp_jiffies32; } -@@ -5247,10 +5319,11 @@ void tcp_check_space(struct sock *sk) +@@ -5250,10 +5322,11 @@ void tcp_check_space(struct sock *sk) sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); /* pairs with tcp_poll() */ smp_mb(); @@ -4944,7 +4920,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } } -@@ -5269,6 +5342,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) +@@ -5272,6 +5345,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) { struct tcp_sock *tp = tcp_sk(sk); unsigned long rtt, delay; @@ -4953,7 +4929,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && -@@ -5277,8 +5352,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) +@@ -5280,8 +5355,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) * If application uses SO_RCVLOWAT, we want send ack now if * we have not received enough bytes to satisfy the condition. */ @@ -4964,7 +4940,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ -@@ -5413,6 +5488,10 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t +@@ -5416,6 +5491,10 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t { struct tcp_sock *tp = tcp_sk(sk); @@ -4975,7 +4951,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 /* Check if we get a new urgent pointer - normally not. */ if (th->urg) tcp_check_urg(sk, th); -@@ -5555,9 +5634,15 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, +@@ -5558,9 +5637,15 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, goto discard; } @@ -4991,7 +4967,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 tcp_drop(sk, skb); return false; } -@@ -5614,6 +5699,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) +@@ -5617,6 +5702,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) tp->rx_opt.saw_tstamp = 0; @@ -5002,7 +4978,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_prediction is to be made * 'S' will always be tp->tcp_header_len >> 2 -@@ -5788,7 +5877,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op) +@@ -5791,7 +5880,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op) tcp_call_bpf(sk, bpf_op, 0, NULL); tcp_init_congestion_control(sk); @@ -5011,7 +4987,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 } void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) -@@ -5825,17 +5914,24 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, +@@ -5828,17 +5917,24 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, struct tcp_fastopen_cookie *cookie) { struct tcp_sock *tp = tcp_sk(sk); @@ -5038,7 +5014,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 mss = opt.mss_clamp; } -@@ -5859,7 +5955,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, +@@ -5862,7 +5958,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); @@ -5051,7 +5027,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 skb_rbtree_walk_from(data) { if (__tcp_retransmit_skb(sk, data, 1)) break; -@@ -5914,9 +6014,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, +@@ -5917,9 +6017,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_cookie foc = { .len = -1 }; int saved_clamp = tp->rx_opt.mss_clamp; @@ -5066,7 +5042,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; -@@ -5977,11 +6081,41 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, +@@ -5980,11 +6084,41 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_try_undo_spurious_syn(sk); tcp_ack(sk, skb, FLAG_SLOWPATH); @@ -5108,7 +5084,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. -@@ -6003,6 +6137,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, +@@ -6006,6 +6140,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->tcp_header_len = sizeof(struct tcphdr); } @@ -5120,7 +5096,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); -@@ -6026,9 +6165,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, +@@ -6029,9 +6168,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, } if (fastopen_fail) return -1; @@ -5135,7 +5111,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * -@@ -6067,6 +6209,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, +@@ -6070,6 +6212,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_paws_reject(&tp->rx_opt, 0)) goto discard_and_undo; @@ -5143,7 +5119,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 if (th->syn) { /* We see SYN without ACK. It is attempt of * simultaneous connect with crossed SYNs. -@@ -6083,9 +6226,15 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, +@@ -6086,9 +6229,15 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->tcp_header_len = sizeof(struct tcphdr); } @@ -5159,7 +5135,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. -@@ -6173,6 +6322,7 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) +@@ -6176,6 +6325,7 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) */ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) @@ -5167,7 +5143,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); -@@ -6215,6 +6365,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) +@@ -6218,6 +6368,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->rx_opt.saw_tstamp = 0; tcp_mstamp_refresh(tp); queued = tcp_rcv_synsent_state_process(sk, skb, th); @@ -5184,7 +5160,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 if (queued >= 0) return queued; -@@ -6287,6 +6447,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) +@@ -6290,6 +6450,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -5193,7 +5169,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_update_pacing_rate(sk); -@@ -6296,6 +6458,30 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) +@@ -6299,6 +6461,30 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_initialize_rcv_mss(sk); tcp_fast_path_on(tp); @@ -5224,7 +5200,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 break; case TCP_FIN_WAIT1: { -@@ -6336,7 +6522,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) +@@ -6339,7 +6525,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); @@ -5234,7 +5210,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 /* Bad case. We could lose such FIN otherwise. * It is not a big problem, but it looks confusing * and not so rare event. We still can lose it now, -@@ -6345,7 +6532,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) +@@ -6348,7 +6535,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) */ inet_csk_reset_keepalive_timer(sk, tmo); } else { @@ -5243,7 +5219,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 goto discard; } break; -@@ -6353,7 +6540,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) +@@ -6356,7 +6543,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_CLOSING: if (tp->snd_una == tp->write_seq) { @@ -5252,7 +5228,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 goto discard; } break; -@@ -6365,6 +6552,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) +@@ -6368,6 +6555,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) goto discard; } break; @@ -5262,7 +5238,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 } /* step 6: check the URG bit */ -@@ -6386,7 +6576,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) +@@ -6389,7 +6579,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) */ if (sk->sk_shutdown & RCV_SHUTDOWN) { if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && @@ -5272,7 +5248,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); tcp_reset(sk); return 1; -@@ -6488,6 +6679,8 @@ static void tcp_openreq_init(struct request_sock *req, +@@ -6491,6 +6682,8 @@ static void tcp_openreq_init(struct request_sock *req, ireq->wscale_ok = rx_opt->wscale_ok; ireq->acked = 0; ireq->ecn_ok = 0; @@ -5281,7 +5257,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 ireq->ir_rmt_port = tcp_hdr(skb)->source; ireq->ir_num = ntohs(tcp_hdr(skb)->dest); ireq->ir_mark = inet_request_mark(sk, skb); -@@ -6613,12 +6806,17 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, +@@ -6616,12 +6809,17 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, /* TW buckets are converted to open requests without * limitations, they conserve resources and peer is * evidently real one. @@ -5300,7 +5276,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 } if (sk_acceptq_is_full(sk)) { -@@ -6636,8 +6834,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, +@@ -6639,8 +6837,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = af_ops->mss_clamp; tmp_opt.user_mss = tp->rx_opt.user_mss; @@ -5311,7 +5287,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); -@@ -6652,7 +6850,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, +@@ -6655,7 +6853,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); @@ -5321,7 +5297,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; -@@ -6688,7 +6887,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, +@@ -6691,7 +6890,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_ecn_create_request(req, skb, sk, dst); if (want_cookie) { @@ -5330,7 +5306,7 @@ index b0e6fc2c5e10..925f03f425d4 100644 req->cookie_ts = tmp_opt.tstamp_ok; if (!tmp_opt.tstamp_ok) inet_rsk(req)->ecn_ok = 0; -@@ -6703,17 +6902,25 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, +@@ -6706,17 +6905,25 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); } if (fastopen_sk) { @@ -6089,7 +6065,7 @@ index 9b038cb0a43d..84db337f5282 100644 return ret; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index 67493ec6318a..f201d6a394ad 100644 +index 739fc69cdcc6..a4fa05e5562d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -37,6 +37,12 @@ @@ -6922,10 +6898,10 @@ index fa2ae96ecdc4..d2b3e30b8788 100644 } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c -index 92b32d131e1c..4490be6d3e43 100644 +index e29553e4f4ee..a4882b96f59a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c -@@ -967,6 +967,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) +@@ -978,6 +978,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) kfree_rcu(ifp, rcu); } @@ -7681,10 +7657,10 @@ index 063898cae3e5..78d91dfc3f06 100644 /* thinking of making this const? Don't. diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig new file mode 100644 -index 000000000000..6e05dab4c632 +index 000000000000..1a7e9bbc766d --- /dev/null +++ b/net/mptcp/Kconfig -@@ -0,0 +1,154 @@ +@@ -0,0 +1,155 @@ +# +# MPTCP configuration +# @@ -7767,6 +7743,7 @@ index 000000000000..6e05dab4c632 + default "fullmesh" if DEFAULT_FULLMESH + default "ndiffports" if DEFAULT_NDIFFPORTS + default "binder" if DEFAULT_BINDER ++ default "netlink" if DEFAULT_NETLINK + default "default" + +menuconfig MPTCP_SCHED_ADVANCED @@ -9396,10 +9373,10 @@ index 000000000000..9eb7628053f6 +MODULE_VERSION("0.1"); diff --git a/net/mptcp/mptcp_ctrl.c b/net/mptcp/mptcp_ctrl.c new file mode 100644 -index 000000000000..38c97eaa0ddc +index 000000000000..9a1b5a048b70 --- /dev/null +++ b/net/mptcp/mptcp_ctrl.c -@@ -0,0 +1,3346 @@ +@@ -0,0 +1,3302 @@ +/* + * MPTCP implementation - MPTCP-control + * @@ -9479,7 +9456,7 @@ index 000000000000..38c97eaa0ddc + +bool mptcp_init_failed __read_mostly; + -+struct static_key mptcp_static_key = STATIC_KEY_INIT_FALSE; ++DEFINE_STATIC_KEY_FALSE(mptcp_static_key); +EXPORT_SYMBOL(mptcp_static_key); + +static void mptcp_key_hash(u8 version, u64 key, u32 *token, u64 *idsn); @@ -9793,71 +9770,14 @@ index 000000000000..38c97eaa0ddc + mptcp_key_hash(tp->mptcp_ver, tp->mptcp_loc_key, &tp->mptcp_loc_token, NULL); +} + -+#ifdef CONFIG_JUMP_LABEL -+static atomic_t mptcp_needed_deferred; -+static atomic_t mptcp_wanted; -+ -+static void mptcp_clear(struct work_struct *work) -+{ -+ int deferred = atomic_xchg(&mptcp_needed_deferred, 0); -+ int wanted; -+ -+ wanted = atomic_add_return(deferred, &mptcp_wanted); -+ if (wanted > 0) -+ static_key_enable(&mptcp_static_key); -+ else -+ static_key_disable(&mptcp_static_key); -+} -+ -+static DECLARE_WORK(mptcp_work, mptcp_clear); -+#endif -+ -+static void mptcp_enable_static_key_bh(void) -+{ -+#ifdef CONFIG_JUMP_LABEL -+ int wanted; -+ -+ while (1) { -+ wanted = atomic_read(&mptcp_wanted); -+ if (wanted <= 0) -+ break; -+ if (atomic_cmpxchg(&mptcp_wanted, wanted, wanted + 1) == wanted) -+ return; -+ } -+ atomic_inc(&mptcp_needed_deferred); -+ schedule_work(&mptcp_work); -+#else -+ static_key_slow_inc(&mptcp_static_key); -+#endif -+} -+ +static void mptcp_enable_static_key(void) +{ -+#ifdef CONFIG_JUMP_LABEL -+ atomic_inc(&mptcp_wanted); -+ static_key_enable(&mptcp_static_key); -+#else -+ static_key_slow_inc(&mptcp_static_key); -+#endif -+} ++ if (!static_branch_unlikely(&mptcp_static_key)) { ++ static int __mptcp_static_key = 0; + -+void mptcp_disable_static_key(void) -+{ -+#ifdef CONFIG_JUMP_LABEL -+ int wanted; -+ -+ while (1) { -+ wanted = atomic_read(&mptcp_wanted); -+ if (wanted <= 1) -+ break; -+ if (atomic_cmpxchg(&mptcp_wanted, wanted, wanted - 1) == wanted) -+ return; ++ if (cmpxchg(&__mptcp_static_key, 0, 1) == 0) ++ static_branch_enable(&mptcp_static_key); + } -+ atomic_dec(&mptcp_needed_deferred); -+ schedule_work(&mptcp_work); -+#else -+ static_key_slow_dec(&mptcp_static_key); -+#endif +} + +void mptcp_enable_sock(struct sock *sk) @@ -9898,8 +9818,6 @@ index 000000000000..38c97eaa0ddc + else + inet_csk(sk)->icsk_af_ops = &ipv6_specific; +#endif -+ -+ mptcp_disable_static_key(); + } +} + @@ -10120,8 +10038,6 @@ index 000000000000..38c97eaa0ddc + mptcp_mpcb_cleanup(tp->mpcb); + } + -+ WARN_ON(!static_key_false(&mptcp_static_key)); -+ + /* Must be called here, because this will decrement the jump-label. */ + inet_sock_destruct(sk); +} @@ -10682,6 +10598,28 @@ index 000000000000..38c97eaa0ddc + meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1; +} + ++/* Inspired by inet_csk_prepare_forced_close */ ++static void mptcp_icsk_forced_close(struct sock *sk) ++{ ++ /* The problem with inet_csk_prepare_forced_close is that it unlocks ++ * before calling tcp_done. That is fine for sockets that are not ++ * yet in the ehash table. But for us we already are there. Thus, ++ * if we unlock we run the risk of processing packets while inside ++ * tcp_done() and friends. That can cause all kind of problems... ++ */ ++ ++ /* The below has to be done to allow calling inet_csk_destroy_sock */ ++ sock_set_flag(sk, SOCK_DEAD); ++ percpu_counter_inc(sk->sk_prot->orphan_count); ++ /* inet_sk(sk)->inet_num = 0; */ ++ ++ tcp_done(sk); ++ ++ /* sk_clone_lock locked the socket and set refcnt to 2 */ ++ bh_unlock_sock(sk); ++ sock_put(sk); ++} ++ +static int mptcp_alloc_mpcb(struct sock *meta_sk, __u64 remote_key, + int rem_key_set, __u8 mptcp_ver, u32 window) +{ @@ -10897,10 +10835,8 @@ index 000000000000..38c97eaa0ddc + meta_sk->sk_max_ack_backlog = 32; + meta_sk->sk_ack_backlog = 0; + -+ if (!sock_flag(meta_sk, SOCK_MPTCP)) { -+ mptcp_enable_static_key_bh(); ++ if (!sock_flag(meta_sk, SOCK_MPTCP)) + sock_set_flag(meta_sk, SOCK_MPTCP); -+ } + + /* Redefine function-pointers as the meta-sk is now fully ready */ + meta_tp->mpc = 1; @@ -10937,8 +10873,7 @@ index 000000000000..38c97eaa0ddc + kmem_cache_free(mptcp_sock_cache, master_tp->mptcp); + master_tp->mptcp = NULL; + -+ inet_csk_prepare_forced_close(master_sk); -+ tcp_done(master_sk); ++ mptcp_icsk_forced_close(master_sk); + return -EINVAL; + +err_inherit_port: @@ -10993,9 +10928,8 @@ index 000000000000..38c97eaa0ddc + tp->mptcp->path_index = mptcp_set_new_pathindex(mpcb); + /* No more space for more subflows? */ + if (!tp->mptcp->path_index) { -+ WARN_ON(is_master_tp(tp)); -+ + kmem_cache_free(mptcp_sock_cache, tp->mptcp); ++ tp->mptcp = NULL; + return -EPERM; + } + @@ -11005,10 +10939,8 @@ index 000000000000..38c97eaa0ddc + tp->mpcb = mpcb; + tp->meta_sk = meta_sk; + -+ if (!sock_flag(sk, SOCK_MPTCP)) { -+ mptcp_enable_static_key_bh(); ++ if (!sock_flag(sk, SOCK_MPTCP)) + sock_set_flag(sk, SOCK_MPTCP); -+ } + + tp->mpc = 1; + tp->ops = &mptcp_sub_specific; @@ -11696,8 +11628,7 @@ index 000000000000..38c97eaa0ddc + if (mptcp_create_master_sk(meta_sk, mtreq->mptcp_rem_key, + mtreq->rem_key_set, mtreq->mptcp_ver, + child_tp->snd_wnd)) { -+ inet_csk_prepare_forced_close(meta_sk); -+ tcp_done(meta_sk); ++ mptcp_icsk_forced_close(meta_sk); + + return -ENOBUFS; + } @@ -11918,9 +11849,11 @@ index 000000000000..38c97eaa0ddc + /* Drop this request - sock creation failed. */ + inet_csk_reqsk_queue_drop(meta_sk, req); + reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req); -+ inet_csk_prepare_forced_close(child); -+ tcp_done(child); ++ ++ mptcp_icsk_forced_close(child); ++ + bh_unlock_sock(meta_sk); ++ + return meta_sk; +} + @@ -17527,10 +17460,10 @@ index 000000000000..7ce97409e1e2 +} diff --git a/net/mptcp/mptcp_ipv4.c b/net/mptcp/mptcp_ipv4.c new file mode 100644 -index 000000000000..7594c8bafb81 +index 000000000000..c908e02c72e1 --- /dev/null +++ b/net/mptcp/mptcp_ipv4.c -@@ -0,0 +1,432 @@ +@@ -0,0 +1,433 @@ +/* + * MPTCP implementation - IPv4-specific functions + * @@ -17798,7 +17731,7 @@ index 000000000000..7594c8bafb81 +{ + struct tcp_sock *tp; + struct sock *sk; -+ struct sockaddr_in loc_in, rem_in; ++ struct sockaddr_in sockaddr; + struct socket_alloc sock_full; + struct socket *sock = (struct socket *)&sock_full; + int ret; @@ -17836,38 +17769,39 @@ index 000000000000..7594c8bafb81 + timer_setup(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, 0); + + /** Then, connect the socket to the peer */ -+ loc_in.sin_family = AF_INET; -+ rem_in.sin_family = AF_INET; -+ loc_in.sin_port = sport; -+ if (rem->port) -+ rem_in.sin_port = rem->port; -+ else -+ rem_in.sin_port = inet_sk(meta_sk)->inet_dport; -+ loc_in.sin_addr = loc->addr; -+ rem_in.sin_addr = rem->addr; ++ sockaddr.sin_family = AF_INET; ++ sockaddr.sin_port = sport; ++ sockaddr.sin_addr = loc->addr; + + if (loc->if_idx) + sk->sk_bound_dev_if = loc->if_idx; + -+ ret = kernel_bind(sock, (struct sockaddr *)&loc_in, ++ ret = kernel_bind(sock, (struct sockaddr *)&sockaddr, + sizeof(struct sockaddr_in)); + if (ret < 0) { + net_err_ratelimited("%s: token %#x bind() to %pI4 index %d failed, error %d\n", + __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, -+ &loc_in.sin_addr, loc->if_idx, ret); ++ &sockaddr.sin_addr, loc->if_idx, ret); + goto error; + } + -+ mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d ifidx: %d\n", -+ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, -+ tp->mptcp->path_index, &loc_in.sin_addr, -+ ntohs(loc_in.sin_port), &rem_in.sin_addr, -+ ntohs(rem_in.sin_port), loc->if_idx); -+ + if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4) + tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4(sk, rem->addr); + -+ ret = kernel_connect(sock, (struct sockaddr *)&rem_in, ++ sockaddr.sin_family = AF_INET; ++ if (rem->port) ++ sockaddr.sin_port = rem->port; ++ else ++ sockaddr.sin_port = inet_sk(meta_sk)->inet_dport; ++ sockaddr.sin_addr = rem->addr; ++ ++ mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d ifidx: %d\n", ++ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, ++ tp->mptcp->path_index, &loc->addr, ++ ntohs(sport), &sockaddr.sin_addr, ++ ntohs(sockaddr.sin_port), loc->if_idx); ++ ++ ret = kernel_connect(sock, (struct sockaddr *)&sockaddr, + sizeof(struct sockaddr_in), O_NONBLOCK); + if (ret < 0 && ret != -EINPROGRESS) { + net_err_ratelimited("%s: MPTCP subsocket connect() failed, error %d\n", @@ -17965,10 +17899,10 @@ index 000000000000..7594c8bafb81 +} diff --git a/net/mptcp/mptcp_ipv6.c b/net/mptcp/mptcp_ipv6.c new file mode 100644 -index 000000000000..fa13a99b735e +index 000000000000..ebe3f5f97460 --- /dev/null +++ b/net/mptcp/mptcp_ipv6.c -@@ -0,0 +1,481 @@ +@@ -0,0 +1,482 @@ +/* + * MPTCP implementation - IPv6-specific functions + * @@ -18265,7 +18199,7 @@ index 000000000000..fa13a99b735e +{ + struct tcp_sock *tp; + struct sock *sk; -+ struct sockaddr_in6 loc_in, rem_in; ++ struct sockaddr_in6 sockaddr; + struct socket_alloc sock_full; + struct socket *sock = (struct socket *)&sock_full; + int ret; @@ -18303,38 +18237,39 @@ index 000000000000..fa13a99b735e + timer_setup(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, 0); + + /** Then, connect the socket to the peer */ -+ loc_in.sin6_family = AF_INET6; -+ rem_in.sin6_family = AF_INET6; -+ loc_in.sin6_port = sport; -+ if (rem->port) -+ rem_in.sin6_port = rem->port; -+ else -+ rem_in.sin6_port = inet_sk(meta_sk)->inet_dport; -+ loc_in.sin6_addr = loc->addr; -+ rem_in.sin6_addr = rem->addr; ++ sockaddr.sin6_family = AF_INET6; ++ sockaddr.sin6_port = sport; ++ sockaddr.sin6_addr = loc->addr; + + if (loc->if_idx) + sk->sk_bound_dev_if = loc->if_idx; + -+ ret = kernel_bind(sock, (struct sockaddr *)&loc_in, ++ ret = kernel_bind(sock, (struct sockaddr *)&sockaddr, + sizeof(struct sockaddr_in6)); + if (ret < 0) { + net_err_ratelimited("%s: token %#x bind() to %pI6 index %d failed, error %d\n", + __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, -+ &loc_in.sin6_addr, loc->if_idx, ret); ++ &sockaddr.sin6_addr, loc->if_idx, ret); + goto error; + } + -+ mptcp_debug("%s: token %#x pi %d src_addr:%pI6:%d dst_addr:%pI6:%d ifidx: %u\n", -+ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, -+ tp->mptcp->path_index, &loc_in.sin6_addr, -+ ntohs(loc_in.sin6_port), &rem_in.sin6_addr, -+ ntohs(rem_in.sin6_port), loc->if_idx); -+ + if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6) + tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6(sk, rem->addr); + -+ ret = kernel_connect(sock, (struct sockaddr *)&rem_in, ++ sockaddr.sin6_family = AF_INET6; ++ if (rem->port) ++ sockaddr.sin6_port = rem->port; ++ else ++ sockaddr.sin6_port = inet_sk(meta_sk)->inet_dport; ++ sockaddr.sin6_addr = rem->addr; ++ ++ mptcp_debug("%s: token %#x pi %d src_addr:%pI6:%d dst_addr:%pI6:%d ifidx: %u\n", ++ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, ++ tp->mptcp->path_index, &loc->addr, ++ ntohs(sport), &sockaddr.sin6_addr, ++ ntohs(sockaddr.sin6_port), loc->if_idx); ++ ++ ret = kernel_connect(sock, (struct sockaddr *)&sockaddr, + sizeof(struct sockaddr_in6), O_NONBLOCK); + if (ret < 0 && ret != -EINPROGRESS) { + net_err_ratelimited("%s: MPTCP subsocket connect() failed, error %d\n", @@ -20232,10 +20167,10 @@ index 000000000000..161a63f336d7 +MODULE_VERSION("0.1"); diff --git a/net/mptcp/mptcp_output.c b/net/mptcp/mptcp_output.c new file mode 100644 -index 000000000000..8bf9eb4724fb +index 000000000000..a8a5787adbf1 --- /dev/null +++ b/net/mptcp/mptcp_output.c -@@ -0,0 +1,2008 @@ +@@ -0,0 +1,2015 @@ +/* + * MPTCP implementation - Sending side + * @@ -20868,6 +20803,13 @@ index 000000000000..8bf9eb4724fb + if (!tp->mptcp->fully_established) { + tp->mptcp->second_packet = 1; + tp->mptcp->last_end_data_seq = TCP_SKB_CB(skb)->end_seq; ++ if (mptcp_is_data_fin(skb)) { ++ /* If this is a data-fin, do not account for it. Because, ++ * a data-fin does not consume space in the subflow ++ * sequence number space. ++ */ ++ tp->mptcp->last_end_data_seq--; ++ } + } + + return true; @@ -21828,7 +21770,7 @@ index 000000000000..8bf9eb4724fb + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer, + jiffies + icsk->icsk_rto); -+ if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0)) ++ if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1) + 1, 0)) + __sk_dst_reset(sk); + +out:; @@ -22053,7 +21995,7 @@ index 000000000000..8bf9eb4724fb + * linear-timeout retransmissions into a black hole + */ + if (meta_sk->sk_state == TCP_ESTABLISHED && -+ (meta_tp->thin_lto || sock_net(meta_sk)->ipv4.sysctl_tcp_thin_linear_timeouts) && ++ (meta_tp->thin_lto || READ_ONCE(sock_net(meta_sk)->ipv4.sysctl_tcp_thin_linear_timeouts)) && + tcp_stream_is_thin(meta_tp) && + meta_icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { + meta_icsk->icsk_backoff = 0; @@ -24201,3 +24143,268 @@ index 0bfad86ec960..ed7013398991 100644 BPF_TCP_MAX_STATES /* Leave at the end! */ }; +diff --git a/include/net/mptcp.h b/include/net/mptcp.h +index 630977f67614..f2efa46027d0 100644 +--- a/include/net/mptcp.h ++++ b/include/net/mptcp.h +@@ -732,6 +732,7 @@ static inline struct sock *mptcp_to_sock(const struct mptcp_tcp_sock *mptcp) + + #define MPTCP_INC_STATS(net, field) SNMP_INC_STATS((net)->mptcp.mptcp_statistics, field) + #define MPTCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mptcp.mptcp_statistics, field) ++#define MPTCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mptcp.mptcp_statistics, field, val) + + enum + { +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index d278b28035ad..c0572253c723 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4603,17 +4603,16 @@ static int tcp_prune_queue(struct sock *sk); + static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, + unsigned int size) + { +- if (mptcp(tcp_sk(sk))) +- sk = mptcp_meta_sk(sk); ++ struct sock *meta_sk = mptcp(tcp_sk(sk)) ? mptcp_meta_sk(sk) : sk; + +- if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || ++ if (atomic_read(&meta_sk->sk_rmem_alloc) > meta_sk->sk_rcvbuf || + !sk_rmem_schedule(sk, skb, size)) { + +- if (tcp_prune_queue(sk) < 0) ++ if (tcp_prune_queue(meta_sk) < 0) + return -1; + + while (!sk_rmem_schedule(sk, skb, size)) { +- if (!tcp_prune_ofo_queue(sk)) ++ if (!tcp_prune_ofo_queue(meta_sk)) + return -1; + } + } +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index a4fa05e5562d..2cb4c4a0ce4e 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -1391,6 +1391,12 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + ++#ifdef CONFIG_MPTCP ++ memcpy(TCP_SKB_CB(buff)->dss, TCP_SKB_CB(skb)->dss, ++ sizeof(TCP_SKB_CB(skb)->dss)); ++ TCP_SKB_CB(buff)->mptcp_flags = TCP_SKB_CB(skb)->mptcp_flags; ++#endif ++ + /* PSH and FIN should only be set in the second packet. */ + flags = TCP_SKB_CB(skb)->tcp_flags; + TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); +@@ -1954,6 +1960,12 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + ++#ifdef CONFIG_MPTCP ++ memcpy(TCP_SKB_CB(buff)->dss, TCP_SKB_CB(skb)->dss, ++ sizeof(TCP_SKB_CB(skb)->dss)); ++ TCP_SKB_CB(buff)->mptcp_flags = TCP_SKB_CB(skb)->mptcp_flags; ++#endif ++ + /* PSH and FIN should only be set in the second packet. */ + flags = TCP_SKB_CB(skb)->tcp_flags; + TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); +diff --git a/net/mptcp/mptcp_ctrl.c b/net/mptcp/mptcp_ctrl.c +index 9a1b5a048b70..e6cac7e4de31 100644 +--- a/net/mptcp/mptcp_ctrl.c ++++ b/net/mptcp/mptcp_ctrl.c +@@ -1773,6 +1773,7 @@ static int mptcp_sub_send_fin(struct sock *sk) + /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ + tcp_init_nondata_skb(skb, tp->write_seq, + TCPHDR_ACK | TCPHDR_FIN); ++ sk_forced_mem_schedule(sk, skb->truesize); + tcp_queue_skb(sk, skb); + } + __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); +@@ -2420,6 +2421,7 @@ struct sock *mptcp_check_req_child(struct sock *meta_sk, + * some of the fields + */ + child_tp->mptcp->rcv_low_prio = mtreq->rcv_low_prio; ++ child_tp->mptcp->low_prio = mtreq->low_prio; + + /* We should allow proper increase of the snd/rcv-buffers. Thus, we + * use the original values instead of the bloated up ones from the +diff --git a/net/mptcp/mptcp_input.c b/net/mptcp/mptcp_input.c +index 7ce97409e1e2..01a81e3f7690 100644 +--- a/net/mptcp/mptcp_input.c ++++ b/net/mptcp/mptcp_input.c +@@ -1023,6 +1023,7 @@ static int mptcp_queue_skb(struct sock *sk) + tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; + mptcp_prepare_skb(tmp1, sk); + __skb_unlink(tmp1, &sk->sk_receive_queue); ++ sk_forced_mem_schedule(meta_sk, tmp1->truesize); + /* MUST be done here, because fragstolen may be true later. + * Then, kfree_skb_partial will not account the memory. + */ +@@ -1054,6 +1055,7 @@ static int mptcp_queue_skb(struct sock *sk) + tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; + mptcp_prepare_skb(tmp1, sk); + __skb_unlink(tmp1, &sk->sk_receive_queue); ++ sk_forced_mem_schedule(meta_sk, tmp1->truesize); + /* MUST be done here, because fragstolen may be true. + * Then, kfree_skb_partial will not account the memory. + */ +diff --git a/net/mptcp/mptcp_ipv4.c b/net/mptcp/mptcp_ipv4.c +index c908e02c72e1..fbcf47c46783 100644 +--- a/net/mptcp/mptcp_ipv4.c ++++ b/net/mptcp/mptcp_ipv4.c +@@ -171,14 +171,14 @@ int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb) + if (!sk) + goto new_subflow; + +- if (is_meta_sk(sk)) { +- WARN("%s Did not find a sub-sk - did found the meta!\n", __func__); +- sock_put(sk); ++ if (sk->sk_state == TCP_TIME_WAIT) { ++ inet_twsk_put(inet_twsk(sk)); + goto discard; + } + +- if (sk->sk_state == TCP_TIME_WAIT) { +- inet_twsk_put(inet_twsk(sk)); ++ if (is_meta_sk(sk)) { ++ WARN("%s Did not find a sub-sk - did found the meta!\n", __func__); ++ sock_put(sk); + goto discard; + } + +diff --git a/net/mptcp/mptcp_ipv6.c b/net/mptcp/mptcp_ipv6.c +index ebe3f5f97460..915dd7892037 100644 +--- a/net/mptcp/mptcp_ipv6.c ++++ b/net/mptcp/mptcp_ipv6.c +@@ -200,14 +200,14 @@ int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb) + if (!sk) + goto new_subflow; + +- if (is_meta_sk(sk)) { +- WARN("%s Did not find a sub-sk - did found the meta!\n", __func__); +- sock_put(sk); ++ if (sk->sk_state == TCP_TIME_WAIT) { ++ inet_twsk_put(inet_twsk(sk)); + goto discard; + } + +- if (sk->sk_state == TCP_TIME_WAIT) { +- inet_twsk_put(inet_twsk(sk)); ++ if (is_meta_sk(sk)) { ++ WARN("%s Did not find a sub-sk - did found the meta!\n", __func__); ++ sock_put(sk); + goto discard; + } + +diff --git a/net/mptcp/mptcp_output.c b/net/mptcp/mptcp_output.c +index a8a5787adbf1..226084d11961 100644 +--- a/net/mptcp/mptcp_output.c ++++ b/net/mptcp/mptcp_output.c +@@ -293,6 +293,7 @@ static void __mptcp_reinject_data(struct sk_buff *orig_skb, struct sock *meta_sk + void mptcp_reinject_data(struct sock *sk, int clone_it) + { + struct sock *meta_sk = mptcp_meta_sk(sk); ++ struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb_it, *tmp; + enum tcp_queue tcp_queue; + +@@ -320,6 +321,10 @@ void mptcp_reinject_data(struct sock *sk, int clone_it) + TCP_FRAG_IN_WRITE_QUEUE); + } + ++ /* We are emptying the rtx-queue. highest_sack is invalid */ ++ if (!clone_it) ++ tp->highest_sack = NULL; ++ + skb_it = tcp_rtx_queue_head(sk); + skb_rbtree_walk_from_safe(skb_it, tmp) { + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it); +@@ -352,11 +357,11 @@ void mptcp_reinject_data(struct sock *sk, int clone_it) + + /* If sk has sent the empty data-fin, we have to reinject it too. */ + if (skb_it && mptcp_is_data_fin(skb_it) && skb_it->len == 0 && +- TCP_SKB_CB(skb_it)->path_mask & mptcp_pi_to_flag(tcp_sk(sk)->mptcp->path_index)) { ++ TCP_SKB_CB(skb_it)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index)) { + __mptcp_reinject_data(skb_it, meta_sk, NULL, 1, tcp_queue); + } + +- tcp_sk(sk)->pf = 1; ++ tp->pf = 1; + + mptcp_push_pending_frames(meta_sk); + } +@@ -554,9 +559,20 @@ static bool mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, int reinject) + struct tcp_skb_cb *tcb; + struct sk_buff *subskb = NULL; + +- if (!reinject) ++ if (reinject) { ++ /* Make sure to update counters and MIB in case of meta-retrans ++ * AKA reinjections, similar to what is done in ++ * __tcp_retransmit_skb(). ++ */ ++ int segs = tcp_skb_pcount(skb); ++ ++ MPTCP_ADD_STATS(sock_net(meta_sk), MPTCP_MIB_RETRANSSEGS, segs); ++ tcp_sk(meta_sk)->total_retrans += segs; ++ tcp_sk(meta_sk)->bytes_retrans += skb->len; ++ } else { + TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ? + MPTCPHDR_SEQ64_INDEX : 0); ++ } + + tcp_skb_tsorted_save(skb) { + subskb = pskb_copy_for_clone(skb, GFP_ATOMIC); +@@ -615,6 +631,7 @@ static bool mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, int reinject) + + tcp_add_write_queue_tail(sk, subskb); + sk->sk_wmem_queued += subskb->truesize; ++ sk_forced_mem_schedule(sk, subskb->truesize); + sk_mem_charge(sk, subskb->truesize); + } else { + /* Necessary to initialize for tcp_transmit_skb. mss of 1, as +@@ -1483,6 +1500,7 @@ void mptcp_send_fin(struct sock *meta_sk) + tcp_init_nondata_skb(skb, meta_tp->write_seq, TCPHDR_ACK); + TCP_SKB_CB(skb)->end_seq++; + TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN; ++ sk_forced_mem_schedule(meta_sk, skb->truesize); + tcp_queue_skb(meta_sk, skb); + } + __tcp_push_pending_frames(meta_sk, mss_now, TCP_NAGLE_OFF); +@@ -1652,7 +1670,9 @@ int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb) + */ + if (refcount_read(&meta_sk->sk_wmem_alloc) > + min(meta_sk->sk_wmem_queued + (meta_sk->sk_wmem_queued >> 2), meta_sk->sk_sndbuf)) { +- return -EAGAIN; ++ err = -EAGAIN; ++ ++ goto failed; + } + + /* We need to make sure that the retransmitted segment can be sent on a +@@ -1699,9 +1719,6 @@ int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb) + if (!mptcp_skb_entail(subsk, skb, -1)) + goto failed; + +- /* Update global TCP statistics. */ +- MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_RETRANSSEGS); +- + /* Diff to tcp_retransmit_skb */ + + /* Save stamp of the first retransmit. */ +@@ -1718,6 +1735,12 @@ int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb) + + failed: + NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPRETRANSFAIL); ++ /* Save stamp of the first attempted retransmit. */ ++ if (!meta_tp->retrans_stamp) { ++ tcp_mstamp_refresh(meta_tp); ++ meta_tp->retrans_stamp = tcp_time_stamp(meta_tp); ++ } ++ + return err; + } + diff --git a/root/target/linux/generic/hack-5.4/922-add-full-modems.patch b/root/target/linux/generic/hack-5.4/922-add-full-modems.patch new file mode 100644 index 00000000..3bc891ca --- /dev/null +++ b/root/target/linux/generic/hack-5.4/922-add-full-modems.patch @@ -0,0 +1,100 @@ +Index: linux-5.4.179/drivers/usb/serial/option.c +=================================================================== +--- linux-5.4.179.orig/drivers/usb/serial/option.c ++++ linux-5.4.179/drivers/usb/serial/option.c +@@ -566,6 +566,17 @@ static void option_instat_callback(struc + #define WETELECOM_PRODUCT_WMD300 0x6803 + + ++//david add Fibocom products ++#define FIBOCOM_VENDOR_ID 0x2cb7 ++#define FIBOCOM_PRODUCT_L71X 0x0001 ++#define FIBOCOM_USB_VENDOR_AND_INTERFACE_INFO(vend, cl, sc, pr) \ ++.match_flags = USB_DEVICE_ID_MATCH_INT_INFO \ ++| USB_DEVICE_ID_MATCH_VENDOR, \ ++.idVendor = (vend), \ ++.bInterfaceClass = (cl), \ ++.bInterfaceSubClass = (sc), \ ++.bInterfaceProtocol = (pr) ++ + /* Device flags */ + + /* Highest interface number which can be used with NCTRL() and RSVD() */ +@@ -585,6 +596,77 @@ static void option_instat_callback(struc + + + static const struct usb_device_id option_ids[] = { ++ { USB_DEVICE(0x2ecc, 0x3010) },/* td-tech 909s */ ++//fibocom L610 ++ { USB_DEVICE(0x1782, 0x4d11) , .driver_info = RSVD(4) }, ++ { USB_DEVICE(0x1782, 0x4d10) }, ++ {USB_DEVICE(0x2949, 0x7401), .driver_info = RSVD(1) }, ++ {USB_DEVICE(0x2949, 0x7402), .driver_info = RSVD(0) }, ++//add 5g quectel rm500q, foxconn T99W240T00, meige srm815 ++ { USB_DEVICE(0x2C7C, 0x0800) }, /* Quectel RG500Q/RM500Q/RG510Q/RM510Q */ ++ { USB_DEVICE(0x2C7C, 0x0900) }, /* Quectel RG500U RG200U */ ++ { USB_DEVICE(0x05C6, 0x90DB), .driver_info = RSVD(2)|RSVD(3)|RSVD(4)|RSVD(5) }, ++ { USB_DEVICE(0x05C6, 0x90D5), .driver_info = RSVD(2)|RSVD(3) }, ++ { USB_DEVICE(0x2dee, 0x4d22), .driver_info = RSVD(4)|RSVD(5) }, ++//add simcom a7600e ++ { USB_DEVICE(0x1e0e, 0x9011), .driver_info = RSVD(0)|RSVD(1) }, ++//add U9300 ++ { USB_DEVICE(0x1c9e, 0x9b3c), .driver_info = RSVD(4)|RSVD(5) }, ++//add ucloud m2 ++ { USB_DEVICE_AND_INTERFACE_INFO(0x1782, 0x5d22, 0xff, 0x00, 0x00), .driver_info = RSVD(3) }, ++//add yuge clm920 ++ { USB_DEVICE(0x1286, 0x4e3c), .driver_info = RSVD(0)|RSVD(1) }, ++//add nodecom nl660 ++ { USB_DEVICE(0x1508, 0x1001), .driver_info = RSVD(4)|RSVD(5) }, ++//add quectel product ++ { USB_DEVICE(0x05C6, 0x9090), .driver_info = RSVD(4)|RSVD(5) }, ++ { USB_DEVICE(0x05C6, 0x9003), .driver_info = RSVD(4)|RSVD(5) }, ++ { USB_DEVICE(0x05C6, 0X9215), .driver_info = RSVD(4)|RSVD(5) }, ++ { USB_DEVICE(0x2c7c, 0x0125), .driver_info = RSVD(4)|RSVD(5) }, ++ { USB_DEVICE(0x2c7c, 0x0121), .driver_info = RSVD(4)|RSVD(5) }, ++//add xin feng wei ye ++ { USB_DEVICE(0x5c6, 0x5012), .driver_info = RSVD(4)|RSVD(5) }, ++ { USB_DEVICE(0x5c6, 0x5013), .driver_info = RSVD(4)|RSVD(5) },//gobinet id ++//add em8000 ++ { USB_DEVICE_AND_INTERFACE_INFO(0x19d2, 0x0532, 0xff, 0xff, 0xff), .driver_info = RSVD(6)|RSVD(7) }, ++//add end ++//add fibocom L71x ++ { FIBOCOM_USB_VENDOR_AND_INTERFACE_INFO(0x2cb7, 0xff, 0xff, 0xff) }, ++ { FIBOCOM_USB_VENDOR_AND_INTERFACE_INFO(0x2cb7, 0x0a, 0x00, 0xff) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(0x19d2, 0x0256, 0xff, 0xff, 0xff) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(0x19d2, 0x0579, 0xff, 0xff, 0xff), .driver_info = RSVD(0)|RSVD(1)|RSVD(5)|RSVD(6) }, ++//add end ++//add mu709s ++ { USB_DEVICE(0x12d1, 0x1c25) }, ++//add forege 630b ++ { USB_DEVICE(0x05c6, 0x9025) }, ++//add kuanyi BM806U ++ //{ USB_DEVICE(0x2020, 0x2033) }, ++ { USB_DEVICE_INTERFACE_CLASS(0x2020, 0x2033, 0xff), .driver_info = RSVD(4) }, ++//add forege 730 ++ { USB_DEVICE(0x05c6, 0xf601), .driver_info = RSVD(4)|RSVD(5) }, ++//add Meig slm790 ++ { USB_DEVICE(0x2dee,0x4d20), .driver_info = RSVD(0) }, ++ { USB_DEVICE(0x2dee,0x4d57), .driver_info = RSVD(0) |RSVD(1)}, ++//add end ++//add Quectel EC200T ++ { USB_DEVICE(0x2c7c, 0x6026) }, ++//add end ++//add Luat air720,air724 ++ { USB_DEVICE(0x1286,0x4e3d) }, ++ { USB_DEVICE(0x1782,0x4e00) }, ++//add end ++//add N720 and N720V5 ++ { USB_DEVICE(0x2949,0x8247), .driver_info = RSVD(4)|RSVD(5) }, ++ { USB_DEVICE(0x2949,0x8241), .driver_info = RSVD(0)}, ++ { USB_DEVICE(0x2949,0x8242), .driver_info = RSVD(0)}, ++ { USB_DEVICE(0x2949,0x8243), .driver_info = RSVD(0)}, ++ { USB_DEVICE(0x2949,0x8700) }, ++//add end ++//david add em7355 em7455(DELL 3P10Y) ++ { USB_DEVICE_INTERFACE_CLASS(0x1199, 0x9041, 0xff), .driver_info = RSVD(8)|RSVD(10)|RSVD(11) }, /* MC7305/MC7355 */ ++ { USB_DEVICE_INTERFACE_CLASS(0x413c, 0x81b6, 0xff), .driver_info = RSVD(8)|RSVD(10)|RSVD(11) }, /* EM7455 */ ++//add end + { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_COLT) }, + { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA) }, + { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA_LIGHT) }, diff --git a/root/target/linux/ipq40xx/base-files/etc/hotplug.d/net/21_adjust_network b/root/target/linux/ipq40xx/base-files/etc/hotplug.d/net/21_adjust_network deleted file mode 100755 index 7aa4f6f7..00000000 --- a/root/target/linux/ipq40xx/base-files/etc/hotplug.d/net/21_adjust_network +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/sh - -[ -f /lib/adjust_network.sh ] && { - . /lib/adjust_network.sh - - adjust_eth_queue -} diff --git a/root/target/linux/ipq40xx/base-files/etc/init.d/adjust_network b/root/target/linux/ipq40xx/base-files/etc/init.d/adjust_network deleted file mode 100755 index 02af8198..00000000 --- a/root/target/linux/ipq40xx/base-files/etc/init.d/adjust_network +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/sh /etc/rc.common -# Copyright (C) 2006-2011 OpenWrt.org - -START=11 -STOP=98 - -adjust_smp_affinity() { - test -f /lib/adjust_network.sh && { - . /lib/adjust_network.sh - - adjust_eth_queue - adjust_edma_smp_affinity - adjust_radio_smp_affinity - } -} - -boot() { - adjust_smp_affinity -} diff --git a/root/target/linux/ipq40xx/base-files/lib/adjust_network.sh b/root/target/linux/ipq40xx/base-files/lib/adjust_network.sh deleted file mode 100755 index 99423022..00000000 --- a/root/target/linux/ipq40xx/base-files/lib/adjust_network.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/bin/sh -# this scripts is used for adjust cpu's choice of interrupts. -# - -################################################ -# Adjust smp_affinity of edma -# Globals: -# None -# Arguments: -# None -# Returns: -# None -# Remark: -# execute only once on start-up. -################################################ -adjust_edma_smp_affinity() { - grep -q edma_eth_ /proc/interrupts || return 0 - local nr=`cat /proc/cpuinfo | grep processor | wc -l` - local cpu=0 - local tx_irq_num - - for tx_num in `seq 0 1 15` ; do - cpu=`printf "%x" $((1<<((tx_num/4+0)%nr)))` - tx_irq_num=`grep -m1 edma_eth_tx$tx_num /proc/interrupts | cut -d ':' -f 1 | tail -n1 | tr -d ' '` - [ -n "$tx_irq_num" ] && echo $cpu > /proc/irq/$tx_irq_num/smp_affinity - done - - for rx_num in `seq 0 1 7` ; do - cpu=`printf "%x" $((1<<((rx_num/2)%nr)))` - rx_irq_num=`grep -m1 edma_eth_rx$rx_num /proc/interrupts | cut -d ':' -f 1 | tail -n1 | tr -d ' '` - [ -n "$rx_irq_num" ] && echo $cpu > /proc/irq/$rx_irq_num/smp_affinity - done -} - -################################################ -# Adjust smp_affinity of ath10k for 2G and 5G -# Globals: -# None -# Arguments: -# None -# Returns: -# None -# Remark: -# execute only once on start-up. -################################################ -adjust_radio_smp_affinity() { - local irqs="`grep -E 'ath10k' /proc/interrupts | cut -d ':' -f 1 | tr -d ' '`" - local nr=`cat /proc/cpuinfo | grep processor | wc -l` - local idx=2 - - for irq in $irqs; do - cpu=`printf "%x" $((1<<((idx)%nr)))` - echo $cpu > /proc/irq/$irq/smp_affinity - idx=$((idx+1)) - done -} - -################################################ -# Adjust queue of eth -# Globals: -# None -# Arguments: -# None -# Returns: -# None -# Remark: -# Each network reboot needs to be executed. -################################################ -adjust_eth_queue() { - local nr=`cat /proc/cpuinfo | grep processor | wc -l` - local idx=0 - - for epath in /sys/class/net/eth[0-9]*; do - test -e $epath || break - echo $epath | grep -q "\." && continue - eth=`basename $epath` - idx=0 - for exps in /sys/class/net/$eth/queues/rx-[0-9]*/rps_cpus; do - test -e $exps || break - cpu=`printf "%x" $((1<<((idx+1)%nr)))` - idx=$((idx+1)) - echo $cpu > $exps - echo 256 > `dirname $exps`/rps_flow_cnt - done - which ethtool >/dev/null 2>&1 && ethtool -K $eth gro off - done - - echo 1024 > /proc/sys/net/core/rps_sock_flow_entries -} diff --git a/root/target/linux/ipq40xx/config-5.4 b/root/target/linux/ipq40xx/config-5.4 index f9403650..2a9fec93 100644 --- a/root/target/linux/ipq40xx/config-5.4 +++ b/root/target/linux/ipq40xx/config-5.4 @@ -480,4 +480,4 @@ CONFIG_ZBOOT_ROM_TEXT=0 CONFIG_ZLIB_DEFLATE=y CONFIG_ZLIB_INFLATE=y CONFIG_ZSTD_COMPRESS=y -CONFIG_ZSTD_DECOMPRESS=y +CONFIG_ZSTD_DECOMPRESS=y \ No newline at end of file diff --git a/root/target/linux/ipq40xx/files/arch/arm/boot/dts/qcom-ipq4019-nhx4019.dts b/root/target/linux/ipq40xx/files/arch/arm/boot/dts/qcom-ipq4019-nhx4019.dts index 4264673d..789a0e90 100755 --- a/root/target/linux/ipq40xx/files/arch/arm/boot/dts/qcom-ipq4019-nhx4019.dts +++ b/root/target/linux/ipq40xx/files/arch/arm/boot/dts/qcom-ipq4019-nhx4019.dts @@ -137,6 +137,21 @@ label = "nhx:xnet"; gpio = <&tlmm 0 GPIO_ACTIVE_HIGH>; }; + + 5g1pwr { + label = "green:5g1pwr "; + gpios = <&tlmm 51 GPIO_ACTIVE_HIGH>; + }; + + 5g2pwr { + label = "green:5g2pwr "; + gpios = <&tlmm 50 GPIO_ACTIVE_HIGH>; + }; + + 5g3pwr { + label = "green:5g3pwr"; + gpios = <&tlmm 46 GPIO_ACTIVE_HIGH>; + }; }; keys { diff --git a/root/target/linux/ipq40xx/image/Makefile b/root/target/linux/ipq40xx/image/Makefile new file mode 100644 index 00000000..92780850 --- /dev/null +++ b/root/target/linux/ipq40xx/image/Makefile @@ -0,0 +1,17 @@ +include $(TOPDIR)/rules.mk +include $(INCLUDE_DIR)/image.mk + +define Device/Default + PROFILES := Default + KERNEL_DEPENDS = $$(wildcard $(DTS_DIR)/$$(DEVICE_DTS).dts) + KERNEL_LOADADDR := 0x80208000 + DEVICE_DTS = $$(SOC)-$(lastword $(subst _, ,$(1))) + DEVICE_DTS_CONFIG := config@1 + IMAGES := sysupgrade.bin + IMAGE/sysupgrade.bin = sysupgrade-tar | append-metadata + IMAGE/sysupgrade.bin/squashfs := +endef + +include $(SUBTARGET).mk + +$(eval $(call BuildImage)) \ No newline at end of file diff --git a/root/target/linux/ipq40xx/image/generic.mk b/root/target/linux/ipq40xx/image/generic.mk index 38c8f0fe..e5349f8b 100755 --- a/root/target/linux/ipq40xx/image/generic.mk +++ b/root/target/linux/ipq40xx/image/generic.mk @@ -841,6 +841,5 @@ define Device/nhx_nhx4019 SOC := qcom-ipq4019 BLOCKSIZE := 128k PAGESIZE := 2048 - DEVICE_PACKAGES := uboot-envtools endef TARGET_DEVICES += nhx_nhx4019 diff --git a/root/target/linux/mvebu/patches-5.15/314-arm64-dts-uDPU-switch-PHY-operation-mode-to-2500base.patch b/root/target/linux/mvebu/patches-5.15/314-arm64-dts-uDPU-switch-PHY-operation-mode-to-2500base.patch deleted file mode 100755 index 2240d0b3..00000000 --- a/root/target/linux/mvebu/patches-5.15/314-arm64-dts-uDPU-switch-PHY-operation-mode-to-2500base.patch +++ /dev/null @@ -1,34 +0,0 @@ -Certain SFP modules (most notably Nokia GPON ones) first check -connectivity on 1000base-x, and switch to 2500base-x afterwards. This -is considered a quirk so the phylink switches the interface to -2500base-x as well. - -However, after power-cycling the uDPU device, network interface/SFP module -will not work correctly until the module is re-seated. This patch -resolves this issue by forcing the interface to be brought up in -2500base-x mode by default. - -Signed-off-by: Jakov Petrina -Signed-off-by: Vladimir Vid -Cc: Luka Perkov - ---- a/arch/arm64/boot/dts/marvell/armada-3720-uDPU.dts -+++ b/arch/arm64/boot/dts/marvell/armada-3720-uDPU.dts -@@ -162,7 +162,7 @@ - }; - - ð0 { -- phy-mode = "sgmii"; -+ phy-mode = "2500base-x"; - status = "okay"; - managed = "in-band-status"; - phys = <&comphy1 0>; -@@ -170,7 +170,7 @@ - }; - - ð1 { -- phy-mode = "sgmii"; -+ phy-mode = "2500base-x"; - status = "okay"; - managed = "in-band-status"; - phys = <&comphy0 1>;